In [None]:
import os

# The offline switch must be in the environment BEFORE sentence_transformers
# (which pulls in transformers / huggingface_hub) is imported: those libraries
# evaluate the flag at import time, so assigning it afterwards has no effect.
# That is why this assignment deliberately sits between the imports.
os.environ['TRANSFORMERS_OFFLINE'] = '1'

import docx
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util, CrossEncoder


# Split a loaded Word document into sections, one per heading
def split_doc_by_heading(doc):
    """Group a document's paragraphs into sections that each begin at a heading.

    Paragraphs whose stripped text is empty are ignored. A paragraph whose
    style name starts with "Heading" opens a new section; every other
    paragraph is added to the section currently being built. Text that
    appears before the first heading forms a section of its own.

    Args:
        doc: Object exposing ``paragraphs``; each paragraph must provide
            ``text`` and ``style.name``.

    Returns:
        list[str]: One string per section, its paragraphs joined by newlines.
    """
    grouped = []  # one list of paragraph strings per section
    for paragraph in doc.paragraphs:
        line = paragraph.text.strip()
        if not line:
            continue
        opens_section = paragraph.style.name.startswith("Heading")
        if opens_section or not grouped:
            # A heading always starts a fresh section; so does the very first
            # non-empty paragraph when nothing has been collected yet.
            grouped.append([line])
        else:
            grouped[-1].append(line)
    return ["\n".join(lines) for lines in grouped]

# Open the FAQ Word document and cut it into heading-delimited sections.
doc = docx.Document("Advising FAQ (12-19-24 Update).docx")
sections = split_doc_by_heading(doc)

# Echo every section, numbered from 1, each followed by a 50-dash rule.
for idx, section in enumerate(sections, start=1):
    print(f"Section {idx}:\n{section}\n{'-'*50}\n")


  from .autonotebook import tqdm as notebook_tqdm


Section 1:
Table of Contents
*****=need to be revised
This FAQ is meant to help students and their advisors understand academic policies in a clear and easily searchable manner. To search for answers either use the table of contents or use the search function (Ctrl+F or Command+F on most machines) to search for keywords such as “leave”, “incomplete”, “dean’s list”, etc. This page will not have every answer a student can come up with so for questions not answered here, or if there is a question you would like added to the list, please reach out to your advisor or to the Office of Undergraduate Advising at advising@dukekunshan.edu.cn.
--------------------------------------------------

Section 2:
General Advising:
--------------------------------------------------

Section 3:
How should I prepare for my advising appointment?
Part of your experience at DKU is developing a high-quality advisor-student relationship. Prior to meeting with your academic advisor, reflect/revise your short-term

In [3]:

# BM25 preprocessing: lower-case the text and keep only word tokens
def tokenize(text):
    """Split text into lower-case word tokens for BM25 matching.

    Tokens are runs of word characters, so punctuation attached to a word
    ("take?", "101A,101B,") is dropped instead of producing a token that can
    never match the same word elsewhere.

    Args:
        text (str): Raw section or query text.

    Returns:
        list[str]: Lower-cased tokens in order of appearance; an empty list
        for blank or punctuation-only input.
    """
    import re  # imported here so the function does not rely on another cell

    return re.findall(r"\w+", text.lower())

# Index every section for BM25 keyword search.
tokenized_sections = list(map(tokenize, sections))
bm25 = BM25Okapi(tokenized_sections)

# Sentence-embedding model used for the semantic half of the search.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all sections once, then pass the vectors through
# util.normalize_embeddings ahead of the cosine-similarity step.
section_embeddings = embed_model.encode(
    sections,
    show_progress_bar=True,
    convert_to_tensor=True,
)
section_embeddings_norm = util.normalize_embeddings(section_embeddings)


Batches: 100%|██████████| 4/4 [00:00<00:00,  4.10it/s]


In [4]:

# Question to answer from the FAQ.
query = "what courses are required to take?"

# Relative weight of each relevance signal in the blended score.
BM25_WEIGHT = 0.1    # keyword overlap (BM25)
DENSE_WEIGHT = 0.9   # semantic similarity (SentenceTransformer)

# Embed the query and normalize it the same way as the section embeddings.
query_embedding = embed_model.encode([query], convert_to_tensor=True)
query_embedding_norm = util.normalize_embeddings(query_embedding)

# Cosine similarity between the query and every section, as a NumPy array.
st_cosine_scores = util.cos_sim(query_embedding_norm, section_embeddings_norm)[0].cpu().numpy()

# BM25 score of every section for the tokenized query.
query_tokens = tokenize(query)
bm25_scores = bm25.get_scores(query_tokens)

# Scale BM25 scores by their maximum. When the maximum is not positive the
# scores are used as-is, which also avoids dividing by zero.
bm25_peak = np.max(bm25_scores)
if bm25_peak > 0:
    bm25_scores_norm = bm25_scores / bm25_peak
else:
    bm25_scores_norm = bm25_scores

# Hybrid score: BM25_WEIGHT * BM25 + DENSE_WEIGHT * cosine similarity.
hybrid_scores = BM25_WEIGHT * bm25_scores_norm + DENSE_WEIGHT * st_cosine_scores

# Keep only the sections whose hybrid score reaches the (fixed) threshold.
threshold = 0
candidate_indices = np.where(hybrid_scores >= threshold)[0]


In [5]:

def construct_prompt(query, final_text):
    """Build the LLM prompt from the question and the retrieved passages.

    Args:
        query (str): The user's question.
        final_text (str): Retrieved passages, most relevant first.

    Returns:
        str: Background information, the question, and a trailing answer cue.
    """
    return (
        f"Background Information (ordered by importance, from most important to least important):\n{final_text}\n\n"
        f"Question:{query}\n\n"
        # Adjacent literals instead of a backslash continuation inside the
        # string, which pulled the next line's indentation into the prompt.
        "Using all the information above, please generate a clear, comprehensive, and precise answer."
        "\nAnswer:"
    )


RERANK_POOL_SIZE = 10  # best hybrid-score candidates handed to the cross-encoder
k = 5                  # reranked passages placed in the prompt

if len(candidate_indices) == 0:
    # NOTE: enhanced_prompt is not assigned on this path, so a later cell that
    # reads it will fail on a fresh kernel or see the value of an earlier run.
    print("No result larger than the threshold.")
else:
    # Order the threshold-passing candidates by hybrid score, best first, and
    # keep the top RERANK_POOL_SIZE of them.
    candidate_scores = hybrid_scores[candidate_indices]
    sorted_idx = np.argsort(-candidate_scores)
    top_candidate_indices = candidate_indices[sorted_idx][:RERANK_POOL_SIZE]
    candidate_texts = [sections[i] for i in top_candidate_indices]

    # Rerank the shortlist with a cross-encoder that scores (query, passage) pairs.
    # NOTE(review): the model id has no "cross-encoder/" hub prefix; presumably
    # it resolves to a local directory or cache entry. Confirm.
    cross_encoder = CrossEncoder('ms-marco-MiniLM-L12-v2')
    cross_input = [[query, text] for text in candidate_texts]
    cross_scores = cross_encoder.predict(cross_input)

    # Highest cross-encoder score first.
    rerank_order = np.argsort(-cross_scores)
    top_reranked_texts = [candidate_texts[i] for i in rerank_order]

    # Concatenate the k best passages and wrap them in the prompt.
    final_text = "\n".join(top_reranked_texts[:k])
    enhanced_prompt = construct_prompt(query, final_text)
    print(enhanced_prompt)


Background Information (ordered by importance, from most important to least important):
What are the required language courses?
8 to 16 credits of language courses are required for all students. All students will be assigned to EAP (English for Academic Purposes) Track or CSL (Chinese as a Second Language) Track based on students’ secondary education’s language medium.
For the EAP track, students are required to take EAP 101A,101B, and EAP 102A, 102B. WOC is available for students to develop advanced English writing and oral communication skills.
For the CSL track, students without any previous Chinese training are required to take CHINESE 101,102, 201, and 202. Students who have previous Chinese knowledge and are placed in a higher level Chinese class still need to take 8 credits of Chinese classes starting with and including whatever level course they are placed into. That is to say, students who begin their Chinese study in CHINESE 101A are required to take Chinese courses until the

In [6]:
import ollama

# Send the retrieval-augmented prompt to the llama3.2 model as a single user
# message via ollama.chat, then print the text of the reply.
chat_messages = [{"role": "user", "content": enhanced_prompt}]
response = ollama.chat(model="llama3.2", messages=chat_messages)
print(response['message']['content'])


Here is a comprehensive list of the required language courses, distribution requirements, and other necessary courses for graduation:

**Required Language Courses:**

* For EAP (English for Academic Purposes) Track:
	+ EAP 101A
	+ EAP 101B
	+ EAP 102A
	+ EAP 102B
* For CSL (Chinese as a Second Language) Track:
	+ CHINESE 101, 102, and 201 for students without previous Chinese training
	+ Students who begin studying Chinese in CHINESE 201A or higher level courses are required to take 8 credits of Chinese classes starting with and including whatever level course they are placed into

**Distribution Requirements:**

* Take at least one 4-credit course in each of the three divisional areas:
	+ Arts and Humanities (AH)
	+ Natural and Applied Sciences (NAS)
	+ Social Sciences (SS)

**Additional Requirements:**

* Students from mainland China and HMT are required to take 4 credits of PE courses.
* A maximum of 2 credits for PE can be applied toward the minimum 136 credits required for a bache