In [1]:
import docx
import os
import numpy as np
import re
from sentence_transformers import SentenceTransformer, util, CrossEncoder
from rank_bm25 import BM25Okapi

os.environ['TRANSFORMERS_OFFLINE'] = '1'


def split_doc_by_heading_with_title(doc):
    sections = []
    current_section = None
    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue 
        # judge whether it is a title 
        if para.style.name.lower().startswith("heading"):
            if current_section is not None:
                sections.append(current_section)
            current_section = {"title": text, "content": ""}
        else:
            if current_section is None:
                # if without title, then create a new section
                current_section = {"title": "Intro", "content": ""}
            # add in contents
            if current_section["content"]:
                current_section["content"] += "\n" + text
            else:
                current_section["content"] = text
    if current_section is not None:
        sections.append(current_section)
    return sections

doc = docx.Document("Advising FAQ (12-19-24 Update).docx")
sections_with_title = split_doc_by_heading_with_title(doc)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

def merge_sections_by_title(sections, model, threshold=0.8):
    """
    concatenate each section in the list (including title and content),
    If the titles' cosine similarity is beyond the threshold, then combine then (title and content),
    and add title and content together into full_text to save.
    """
    if not sections:
        return []
    merged = []
    used = [False] * len(sections)
    titles = [sec["title"] for sec in sections]
    # contents = [sec["content"] for sec in sections]
    # Calculate all the embeddings of the titles
    title_embeddings = model.encode(titles, convert_to_tensor=True, show_progress_bar=False)
    # content_embeddings = model.encode(contents, convert_to_tensor=True, show_progress_bar=False)
    for i in range(len(sections)):
        if used[i]:
            continue
        merged_title = sections[i]["title"]
        merged_content = sections[i]["content"]
        used[i] = True
        for j in range(i+1, len(sections)):
            if used[j]:
                continue
            # title_score = util.cos_sim(title_embeddings[i], title_embeddings[j]).item()
            # content_score = util.cos_sim(content_embeddings[i], content_embeddings[j]).item()
            # sim = 0.5 * title_score + 0.5 * content_score
            sim = util.cos_sim(title_embeddings[i], title_embeddings[j]).item()
            if sim >= threshold:
                # merged_title += "\n" + sections[j]["title"]
                merged_content += "\n" + sections[j]["content"]
                used[j] = True
        # concatenate title and content into full_text
        full_text = merged_title + "\n" + merged_content
        merged.append({"title": merged_title, "content": merged_content, "full_text": full_text})
    return merged

# Calculate cosine similarity
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
merged_sections = merge_sections_by_title(sections_with_title, embed_model, threshold=0.6)


In [3]:
def tokenize(text):
    return text.lower().split()

full_text = [sec['full_text'] for sec in merged_sections]
# construct BM25 index for sections以合并后的 section 内容建立 BM25 索引
tokenized = [tokenize(text) for text in full_text]
bm25 = BM25Okapi(tokenized)


# generate the embeddings for sections
sections_embeddings = embed_model.encode(full_text, convert_to_tensor=True, show_progress_bar=True)
sections_embeddings_norm = util.normalize_embeddings(sections_embeddings)


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches: 100%|██████████| 3/3 [00:02<00:00,  1.26it/s]


In [4]:

# define the query
query = input("Please input your query: ")

# generate the embedding of the query
query_embedding = embed_model.encode([query], convert_to_tensor=True)
query_embedding_norm = util.normalize_embeddings(query_embedding)
# calculate cosine similarity
sections_cosine_scores = util.cos_sim(query_embedding_norm, sections_embeddings_norm)[0].cpu().numpy()

# calculate the BM25 score
query_tokens = tokenize(query)
bm25_scores = bm25.get_scores(query_tokens)
if np.max(bm25_scores) > 0:
    bm25_scores_norm = bm25_scores / np.max(bm25_scores)
else:
    bm25_scores_norm = bm25_scores

# mix the two scores up
sections_score = 0.2 * bm25_scores_norm + 0.8 * sections_cosine_scores
sections_weight = 0.5  # weight of section

# calculate cosine similarity of section titles and query
titles = [sec["title"] for sec in merged_sections]
title_embeddings = embed_model.encode(titles, convert_to_tensor=True, show_progress_bar=True)
query_title_embedding = embed_model.encode([query], convert_to_tensor=True)
title_cosine_scores = util.cos_sim(query_title_embedding, title_embeddings)[0].cpu().numpy()
title_weight = 0.5  # weight of the title

# Total score
overall_scores = sections_weight * sections_score + title_weight * title_cosine_scores



# rank by decreasing
candidate_indices = np.argsort(-overall_scores)

# select top-10
top_n = 10
top_candidate_indices = candidate_indices[:top_n]
candidate_texts = []
for idx in top_candidate_indices:
    # construct text (title and content)
    candidate_texts.append("Title: " + merged_sections[idx]["title"] + "\nContent: " + merged_sections[idx]["content"])


Batches: 100%|██████████| 3/3 [00:00<00:00, 27.63it/s]


In [5]:
# load cross encoder
cross_encoder = CrossEncoder('ms-marco-MiniLM-L12-v2')
cross_input = [[query, text] for text in candidate_texts]
cross_scores = cross_encoder.predict(cross_input)
rerank_order = np.argsort(-cross_scores)

k = 5
top_reranked_text = "\n".join([candidate_texts[i] for i in rerank_order[:k]])


In [6]:
def construct_prompt(query, top_reranked_text):
    prompt = (
        f"Background Information (ordered by importance, from most important to least important):\n{top_reranked_text}\n\n"
        f"Question:{query}\n\n"
        "Using all the information above, please generate a clear, comprehensive, and precise answer.\
        \nAnswer:"
    )
    return prompt

# enhanced_prompt = construct_prompt(query, top_reranked_text)
# print(enhanced_prompt)


In [7]:
import ollama

response = ollama.chat(model="llama3.2", 
                       messages=[{"role": "user", "content": enhanced_prompt}])
print(response['message']['content'])


NameError: name 'enhanced_prompt' is not defined