In [6]:
import json
import pandas as pd
# Load the document records from the JSON database file. The encoding is given
# explicitly so the read does not depend on the platform's default locale
# encoding (the text contains non-ASCII characters).
with open('../data/database.json', 'r', encoding='utf-8') as f_in:
    documents = json.load(f_in)


In [32]:
from rouge import Rouge

In [28]:
# Read the ground-truth CSV and convert it to a list of dicts, one per CSV row,
# keyed by column name.
df_ground_truth = pd.read_csv('../data/ground-truth-data.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

In [6]:
# Lookup table from URL to document; if two documents share a URL, the later one wins.
doc_idx = {d['url']: d for d in documents}

In [2]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; its encode() method is what the embedding cells call.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

In [3]:
from elasticsearch import Elasticsearch
# Client for the Elasticsearch node at localhost:9200.
es_client = Elasticsearch('http://localhost:9200')
# Index definition: one shard, no replicas, and every document field mapped as
# full-text ("text").
# NOTE(review): "url" and "date" are also mapped as "text" — confirm that exact-match
# or date-range queries on them are not needed.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "url": {"type": "text"},
            "date": {"type": "text"},
            "title": {"type": "text"},
            "info": {"type": "text"}
        }
    }
}

index_name = "course-questions"

# Delete the index first (ignoring a missing one) so re-running this cell
# recreates it from scratch.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [8]:
from tqdm.auto import tqdm

In [9]:
from tqdm.auto import tqdm
# Index the un-chunked documents one request at a time, with a progress bar.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/89 [00:00<?, ?it/s]

In [10]:
def elastic_search(query):
    """Run a keyword search against the Elasticsearch index.

    The query text is matched against ``title`` (boosted x2) and ``info`` with
    a ``best_fields`` multi-match, and at most five hits are requested.

    Args:
        query: Search text.

    Returns:
        list: The ``_source`` of every hit, in the order Elasticsearch returned them.
    """
    multi_match = {
        "query": query,
        "fields": ["title^2", "info"],
        "type": "best_fields",
    }
    request_body = {
        "size": 5,
        "query": {"bool": {"must": {"multi_match": multi_match}}},
    }

    hits = es_client.search(index=index_name, body=request_body)['hits']['hits']
    return [hit['_source'] for hit in hits]

In [4]:
def chunk_data(raw_doc, chunk_size=4999, overlap=100):
    """Split each document's ``info`` text into overlapping character chunks.

    Every chunk is emitted as its own record that repeats the parent document's
    ``url``, ``date`` and ``title`` and carries the chunk text under ``info``.
    Documents whose ``info`` is empty produce no records.

    Args:
        raw_doc: Iterable of dicts with ``url``, ``date``, ``title`` and ``info`` keys.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        list[dict]: One record per chunk, in document order.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap

    def chunk_content(content):
        """Return the overlapping windows of ``content``."""
        chunks = []
        for start in range(0, len(content), step):
            end = start + chunk_size
            chunks.append(content[start:end])
            # Stop once a window reaches the end of the text; stepping again
            # would only emit a tail already contained in this chunk.
            if end >= len(content):
                break
        return chunks

    chunked_data = []
    for doc in raw_doc:
        for chunk in chunk_content(doc['info']):
            chunked_data.append({
                'url': doc['url'],
                'date': doc['date'],
                'title': doc['title'],
                'info': chunk
            })

    return chunked_data

In [7]:
# Chunk every document using chunk_data's default chunk size and overlap.
data_chunk = chunk_data(documents)

In [26]:
data_chunk[72]

{'url': 'https://nitw.ac.in/api/static/files/Note_to_publish_OpenElective_courses-allot_2024-7-30-11-19-2.pdf',
 'date': '2024-07-30',
 'title': 'Allotment of Open Elective Courses to B.Tech. IV Year Students_Reg | Allotment of Open Elective Courses to B.Tech. IV Year Students_Reg',
 'info': ' Karthikeya 60 21EEB0A29 Karan Mankani 61 21EEB0A17 Botla Shresta 62 21MEB0B21 Goni Mohan 63 21BTB0A12 Daga Sahaj Kumar 64 21EEB0B60 Uday Pratap Singh 65 21BTB0A19 Dishant Maithani 66 21MEB0B11 Dodda Sri Tej Deep Reddy 67 21CEB0B52 Syed Abdul Nadeem 68 21CSB0A39 Neerati Bhuvanesh 69 21MEB0B03 Akash Sharma 70 21ECB0B10 Challa Sivaram 71 21ECB0B28 Kosaraju Ajay Kumar 72 21CEB0B38 Potugalla Vivek 73 21CEB0A01 Adarsh Kumar 74 21MEB0B20 Donthi Aneesh Raj 75 21CEB0A10 Briansalcheang R Marak 76 21CEB0A43 Rathod Nirajkumar Kanhiram 77 21CEB0A16 Guntur Jayadeep 78 21ECB0A19 Guguloth Jayavardhan 79 21BTB0A06 Aditya Rajubhai Parmar 80 21MEB0B16 Jitender Patel \n\n3. Open Elective ME445 Alternative Sources of

In [None]:

# Embed each chunk's title and body text. The records built by chunk_data carry
# 'title' and 'info' keys (there is no 'header' / 'main_content'), so those are
# the fields encoded here; the vector field names are kept as they were.
for doc in tqdm(data_chunk):
    doc['header_vector'] = model.encode(doc['title'])
    doc['main_content_vector'] = model.encode(doc['info'])

In [None]:
# Embed each chunk's title and body text. The records built by chunk_data carry
# 'title' and 'info' keys (there is no 'header' / 'main_content'), so those are
# the fields encoded here; the vector field names are kept as they were.
for doc in tqdm(data_chunk):
    doc['header_vector'] = model.encode(doc['title'])
    doc['main_content_vector'] = model.encode(doc['info'])

In [9]:
# Index every chunk into Elasticsearch, one request per chunk.
# NOTE(review): `index_name_vr` is not defined in any cell visible here — confirm
# where it is set (and that the target index exists) before running this cell.
for doc in tqdm(data_chunk):
    es_client.index(index=index_name_vr, document=doc)

  0%|          | 0/184 [00:00<?, ?it/s]

NameError: name 'es_client' is not defined

## LLM SETUP

In [12]:
from openai import OpenAI
from google import genai
# LLM clients: Gemini through google-genai, and the OpenAI SDK pointed at
# OpenRouter's OpenAI-compatible endpoint.
# NOTE(review): no API keys are passed here — presumably both clients pick them
# up from environment variables; confirm they are set before running.
client_gemini = genai.Client()
client_openai = OpenAI(base_url="https://openrouter.ai/api/v1")

In [26]:
def llm_openai(prompt):
    """Send ``prompt`` as a single user message to Mixtral via ``client_openai``.

    Args:
        prompt: Full prompt text.

    Returns:
        The message content of the first completion choice.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client_openai.chat.completions.create(
        model="mistralai/mixtral-8x7b-instruct",
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def llm_gemini(prompt):
    """Generate a reply to ``prompt`` with ``gemini-2.0-flash`` and return its text.

    Args:
        prompt: Full prompt text.

    Returns:
        The ``text`` attribute of the generation response.
    """
    gemini_reply = client_gemini.models.generate_content(
        model="gemini-2.0-flash",
        contents=prompt,
    )
    return gemini_reply.text

In [49]:
def prompt_builder(query, search_results):
    """Assemble the grounded-answer prompt sent to the LLM.

    Each search result contributes a ``title``/``info`` entry to the CONTEXT
    section and a ``url:`` line to the source list; the source list is
    substituted both after ``URL:`` and inside the closing ``(SOURCE: ...)``.

    Args:
        query: The user's question.
        search_results: Iterable of dicts with ``title``, ``info`` and ``url`` keys.

    Returns:
        str: The filled-in prompt with surrounding whitespace stripped.
    """
    prompt_template = """You are an AI assistant designed to help students of NIT Warangal (NITW) by answering their questions accurately and responsibly.

You are provided with CONTEXT retrieved from trusted NITW sources. Your job is to:

1.  Base your answer *primarily* on the provided CONTEXT. Synthesize information from the context to address the user's query as accurately as possible.
2.  Do NOT use any outside or prior knowledge. Your response must be derived *solely* from the provided context.
3.  **Handling Insufficient or Loosely Related Context:**
    *   If the context does not contain a direct or complete answer to the query, do *not* invent information.
    *   Instead, summarize the most relevant information found in the context related to the query.
    *   If the context is only loosely related or minimal, acknowledge the query and provide the relevant context snippets or simply list the source URL(s) as the best available information based on the text provided.
    *   Do *not* use the phrase "I could not find a verified answer to that in the available information."
4.  **Crucially:** Do NOT hallucinate. Only state facts or information that are explicitly mentioned or clearly inferable *from the provided context*.
5.  ALWAYS cite the source(s) used by including the URL(s) at the end of your response.

Now, answer the following question:{query}
URL: {url}

CONTEXT: {context}

INSTRUCTIONS:
- Only use facts and information derived *strictly* from the context.
- Do not assume, generate, or state information not backed by the context.
- If a direct answer isn't possible from the context, provide relevant summaries or snippets from the context instead.
- Make your response clear and concise.
- At the end of your answer, include a reference to the source like:
(SOURCE: {url})
""".strip()

    # Collect both sections in a single pass so one-shot iterables still work.
    context_parts = []
    source_parts = []
    for hit in search_results:
        context_parts.append(f"title: {hit['title']}\ninfo: {hit['info']}\n\n")
        source_parts.append(f"url: {hit['url']}\n")

    filled = prompt_template.format(
        url="".join(source_parts),
        context="".join(context_parts),
        query=query,
    )
    return filled.strip()

In [50]:
def rag(query):
    """Answer ``query`` end to end: retrieve with ``search``, build the prompt, ask Gemini.

    Args:
        query: The user's question.

    Returns:
        The text returned by ``llm_gemini`` for the assembled prompt.
    """
    retrieved = search(query)
    return llm_gemini(prompt_builder(query, retrieved))

In [51]:
# Sample question used to exercise rag().
query = 'With the revised guidelines for tuition fee remission, how will the institute ensure equitable access to these benefits for students from diverse socio-economic backgrounds, especially considering potential variations in documentation and verification processes across different regions?'

In [52]:
# Run retrieval + generation for the sample question.
llm_response = rag(query)

In [53]:
llm_response

'I am unable to answer the question about how NIT Warangal will ensure equitable access to tuition fee remission benefits for students from diverse socio-economic backgrounds. The provided documents focus on tuition fee payment procedures in ERP and guidelines for various NSP scholarships, but do not contain information about revised guidelines for tuition fee remission and ensuring equitable access for students from diverse socio-economic backgrounds.\n\nHowever, the following information regarding NSP scholarships at NIT Warangal may be relevant:\n\n*   **NSP Scholarship Application Details**:\n    *   NIT Warangal has published the "NATIONAL SCHOLARSHIP PORTAL (NSP) – 2024-25” notification.\n    *   The last date for online application is 31.10.2024 and for submission of soft copies through email id acad_nspscholarships@nitw.ac.in and hard copies physically at Academic Section, and NIT Warangal on or before 31.10.2024.\n*   **Verification Process**:\n    *   Verification of student 

In [30]:
def evaluate_relevance(question: str, answer_llm: str) -> dict:
    """Grade how relevant a generated answer is to its question.

    An LLM judge (called through ``llm_openai``) is asked to classify the
    answer as ``NON_RELEVANT``, ``PARTLY_RELEVANT`` or ``RELEVANT`` and to
    explain why. ROUGE F-scores computed between the answer and the question
    are then attached to the judge's verdict.

    Args:
        question: The user question.
        answer_llm: The generated answer to grade.

    Returns:
        dict: On success, the judge's ``Relevance`` and ``Explanation`` plus
        ``ROUGE_1``, ``ROUGE_2`` and ``ROUGE_3``. Note that ``ROUGE_3`` holds
        the ROUGE-L score; the key name is kept for existing consumers.
        If the judge's reply cannot be parsed into a JSON object containing
        both expected fields, a dict with ``Relevance == "ERROR"`` and an
        explanatory message is returned instead (without ROUGE scores).
    """
    prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

    # Format the prompt with the question and answer
    evaluation_prompt = prompt2_template.format(question=question, answer_llm=answer_llm)

    # Get the evaluation from the LLM
    evaluation_response = llm_openai(evaluation_prompt)

    # Parse the JSON response
    try:
        evaluation_result = json.loads(evaluation_response)
    except (json.JSONDecodeError, TypeError):
        # The judge may wrap the JSON in a code fence or surround it with prose
        # despite the instructions, so retry on the outermost {...} span.
        # A non-string reply (e.g. None) has nothing to salvage.
        raw = evaluation_response if isinstance(evaluation_response, str) else ""
        start, end = raw.find("{"), raw.rfind("}")
        evaluation_result = None
        if 0 <= start < end:
            try:
                evaluation_result = json.loads(raw[start:end + 1])
            except json.JSONDecodeError:
                evaluation_result = None
        if evaluation_result is None:
            return {
                "Relevance": "ERROR",
                "Explanation": "Failed to parse LLM response as JSON"
            }

    # Validate the structure of the parsed result. Valid JSON that is not an
    # object (a list, string, number, null) is rejected here instead of
    # crashing on the key lookups below.
    if (
        not isinstance(evaluation_result, dict)
        or "Relevance" not in evaluation_result
        or "Explanation" not in evaluation_result
    ):
        return {
            "Relevance": "ERROR",
            "Explanation": "LLM response does not contain expected fields"
        }

    # Validate the Relevance value
    if evaluation_result["Relevance"] not in ["NON_RELEVANT", "PARTLY_RELEVANT", "RELEVANT"]:
        evaluation_result["Relevance"] = "ERROR"
        # Format rather than `+=` so a non-string Explanation cannot raise.
        evaluation_result["Explanation"] = (
            f"{evaluation_result['Explanation']} (Invalid Relevance value)"
        )

    # ROUGE between the generated answer (hypothesis) and the question (reference).
    try:
        scores = Rouge().get_scores(answer_llm, question)[0]
        rouge_1 = scores['rouge-1']['f']
        rouge_2 = scores['rouge-2']['f']
        rouge_l = scores['rouge-l']['f']
    except ValueError:
        # The rouge package raises ValueError when the hypothesis or the
        # reference is empty; score that case as zero overlap.
        rouge_1 = rouge_2 = rouge_l = 0.0

    evaluation_result["ROUGE_1"] = rouge_1
    evaluation_result["ROUGE_2"] = rouge_2
    evaluation_result["ROUGE_3"] = rouge_l  # ROUGE-L; key name kept for compatibility

    return evaluation_result
     


In [46]:
# Accumulator for graded results; re-running this cell resets it.
relevance_list = []

In [54]:

# Generate and grade answers for ground-truth rows 15-19, appending each graded
# record (with its question and answer) to relevance_list.
# Re-running this cell appends again, so entries accumulate unless
# relevance_list is reset first.
for i in range(15, 20):
    question = ground_truth[i]["question"]
    answer = rag(question)
    score = evaluate_relevance(question, answer)
    score["question"] = question
    score["answer"] = answer
    relevance_list.append(score)

In [55]:
relevance_list

[{'Relevance': 'NON_RELEVANT',
  'Explanation': 'The generated answer does not provide any information or details related to the question about the revised guidelines for tuition fee remission, ensuring equitable access for students from diverse socio-economic backgrounds, or potential variations in documentation and verification processes across different regions.',
  'ROUGE_1': 0.09836065080354768,
  'ROUGE_2': 0.0,
  'ROUGE_3': 0.06557376555764616,
  'question': 'With the revised guidelines for tuition fee remission, how will the institute ensure equitable access to these benefits for students from diverse socio-economic backgrounds, especially considering potential variations in documentation and verification processes across different regions?',
  'answer': 'I could not find a verified answer to that in the available information. Please refer to official sources or contact the relevant NITW department by visiting nitw.ac.in.\n'},
 {'Relevance': 'NON_RELEVANT',
  'Explanation': 'Th

In [None]:
# Build a minsearch index over the 'title' and 'info' text fields, with no keyword fields.
# NOTE(review): in the visible file order this cell comes before `import minsearch`,
# so a fresh top-to-bottom run would raise NameError here.
Index = minsearch.Index(
    text_fields = [ "title" ,  "info" ],
    keyword_fields= []
)

In [1]:
import minsearch

In [2]:
# Build a minsearch index over the 'title' and 'info' text fields, with no keyword fields.
Index = minsearch.Index(
    text_fields = [ "title" ,  "info" ],
    keyword_fields= []
)

In [9]:
# Fit the index on the chunked documents.
Index.fit(data_chunk)

<minsearch.Index at 0x7f134cdfef50>

In [10]:
 def search(query):
     """Return the top five minsearch hits for ``query``.

     ``title`` and ``info`` are weighted equally (boost 1 each).

     Args:
         query: Search text.

     Returns:
         Whatever ``Index.search`` returns for the query.
     """
     field_weights = {"title": 1, 'info': 1}
     return Index.search(query=query, boost_dict=field_weights, num_results=5)

In [21]:
import json

# Persist the chunked documents as indented UTF-8 JSON, writing non-ASCII
# characters as-is instead of \u escapes.
with open("../data/chuncked_database.json", "w", encoding="utf-8") as f:
    json.dump(data_chunk, f, ensure_ascii=False, indent=4)

In [38]:
# Try the retriever on a sample question.
# NOTE(review): the query text contains "suppor/t", which looks like a typo for
# "support" — confirm whether it is intentional.
search('Considering the increasing cost of living and other academic expenses beyond tuition fees, will there be any accompanying suppor/t mechanisms, such as enhanced scholarship programs or subsidized resources, to further alleviate the financial burden on students who qualify for tuition fee remission?')

[{'url': 'https://nitw.ac.in/api/static/files/Tuition_Fee_Payment_Procedure_in_ERP_(except_I_-_Year)_2024-7-15-12-15-23.pdf',
  'date': '2024-07-15',
  'title': 'Tuition Fee Payment Procedure in ERP (except I - Year) | Tuition Fee Payment Procedure in ERP (except I - Year)',
  'info': 'Pay Tuition FEE payment procedure on ERP \n\n1. After login to your ERP click on the Accounts. \n\n2. Click on Student Academic Fee Details \n\n3. Click on the button to proceed. \n\n4. One pop up will appear. Click on \n\nProceed 5. It will redirect you to a new page where you will find the button as shown below. Click on it. \n\n6. The payment options will appear as shown below. You may select any one of the options and complete your payment. \n\n7. After successful payment you will be redirected to your home page and you will get the payment receipt. \n\nSTUDENTS THOSE WHO ARE PAYING THE TUITION FEES THROUGH BANK LOAN. • Select the “Bank Transfer” option for payment and click “PAY”. \n\n• After select