In [6]:
import json
import pandas as pd
# Load the document collection from database.json.
# JSON text is UTF-8 by specification, so the encoding is pinned explicitly
# instead of relying on the platform default (which is not UTF-8 everywhere).
with open('database.json', 'r', encoding='utf-8') as f_in:
    documents = json.load(f_in)


In [7]:
# Read the ground-truth CSV, then convert it to a list with one dict per row
# (keyed by column name) so rows can be accessed as ground_truth[i]["..."].
df_ground_truth = pd.read_csv('ground-truth-data.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

In [6]:
# Lookup table from a document's URL to the full document dict.
# If two documents share a URL, the later one replaces the earlier one.
doc_idx = {d['url']: d for d in documents}

In [2]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; later cells call model.encode(...) on chunk text.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

In [3]:
from elasticsearch import Elasticsearch
# Client for the Elasticsearch instance at localhost:9200.
es_client = Elasticsearch('http://localhost:9200')
# Index definition: one shard, no replicas, and all four document fields
# mapped as full-text ("text") fields.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "url": {"type": "text"},
            "date": {"type": "text"},
            "title": {"type": "text"},
            "info": {"type": "text"}
        }
    }
}

index_name = "course-questions"

# Drop any existing index of this name first (ignore_unavailable=True makes the
# delete a no-op when it does not exist), so re-running the cell starts clean.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [9]:
from tqdm.auto import tqdm
# Index every document, one request per document, with a progress bar.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/89 [00:00<?, ?it/s]

In [10]:
def elastic_search(query):
    """Run a keyword search against the Elasticsearch index.

    Sends a ``multi_match`` query of type ``best_fields`` over the ``title``
    and ``info`` fields (``title`` boosted with ``^2``) and requests at most
    5 hits.

    Args:
        query: The search text.

    Returns:
        A list holding the ``_source`` of every hit, in response order.
    """
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["title^2", "info"],
            "type": "best_fields",
        }
    }
    request_body = {
        "size": 5,
        "query": {"bool": {"must": text_match}},
    }

    es_response = es_client.search(index=index_name, body=request_body)
    return [hit['_source'] for hit in es_response['hits']['hits']]

In [23]:
def chunk_data(raw_doc, chunk_size=4999, overlap=100):
    """Split each document's ``info`` text into overlapping character chunks.

    Consecutive chunks of one document share ``overlap`` characters. Chunking
    stops as soon as a chunk reaches the end of the text, so no trailing chunk
    is emitted that is already fully contained in the previous one.

    Args:
        raw_doc: Iterable of dicts with ``url``, ``date``, ``title`` and
            ``info`` keys.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of dicts, one per chunk, each carrying the parent document's
        ``url``, ``date`` and ``title`` plus the chunk text under ``info``.
        A document whose ``info`` is empty contributes no chunks.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        # The window would never advance, so the loop below would not terminate.
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap

    def chunk_content(content):
        """Return the overlapping slices of ``content``."""
        chunks = []
        start = 0
        while start < len(content):
            end = start + chunk_size
            chunks.append(content[start:end])
            if end >= len(content):
                # This chunk already reaches the end of the text; another
                # iteration would only re-emit part of its tail.
                break
            start += step
        return chunks

    return [
        {
            'url': doc['url'],
            'date': doc['date'],
            'title': doc['title'],
            'info': chunk,
        }
        for doc in raw_doc
        for chunk in chunk_content(doc['info'])
    ]

In [24]:
# Chunk every loaded document using chunk_data's default chunk size and overlap.
data_chunk = chunk_data(documents)

In [26]:
# Spot-check a single chunk to eyeball its fields and text.
data_chunk[72]

{'url': 'https://nitw.ac.in/api/static/files/Note_to_publish_OpenElective_courses-allot_2024-7-30-11-19-2.pdf',
 'date': '2024-07-30',
 'title': 'Allotment of Open Elective Courses to B.Tech. IV Year Students_Reg | Allotment of Open Elective Courses to B.Tech. IV Year Students_Reg',
 'info': ' Karthikeya 60 21EEB0A29 Karan Mankani 61 21EEB0A17 Botla Shresta 62 21MEB0B21 Goni Mohan 63 21BTB0A12 Daga Sahaj Kumar 64 21EEB0B60 Uday Pratap Singh 65 21BTB0A19 Dishant Maithani 66 21MEB0B11 Dodda Sri Tej Deep Reddy 67 21CEB0B52 Syed Abdul Nadeem 68 21CSB0A39 Neerati Bhuvanesh 69 21MEB0B03 Akash Sharma 70 21ECB0B10 Challa Sivaram 71 21ECB0B28 Kosaraju Ajay Kumar 72 21CEB0B38 Potugalla Vivek 73 21CEB0A01 Adarsh Kumar 74 21MEB0B20 Donthi Aneesh Raj 75 21CEB0A10 Briansalcheang R Marak 76 21CEB0A43 Rathod Nirajkumar Kanhiram 77 21CEB0A16 Guntur Jayadeep 78 21ECB0A19 Guguloth Jayavardhan 79 21BTB0A06 Aditya Rajubhai Parmar 80 21MEB0B16 Jitender Patel \n\n3. Open Elective ME445 Alternative Sources of

In [None]:

# Embed each chunk's title and body text with the sentence-transformer model.
# chunk_data() produces 'title' and 'info' keys (there is no 'header' or
# 'main_content' key on a chunk), so those are the fields encoded here.
# The vectors are stored on the chunk dict itself under the existing
# 'header_vector' / 'main_content_vector' names.
for doc in tqdm(data_chunk):
    doc['header_vector'] = model.encode(doc['title'])
    doc['main_content_vector'] = model.encode(doc['info'])

In [None]:
# Embed each chunk's title and body text with the sentence-transformer model.
# chunk_data() produces 'title' and 'info' keys (there is no 'header' or
# 'main_content' key on a chunk), so those are the fields encoded here.
# NOTE(review): this cell performs the same embedding step as the preceding
# cell; running both only recomputes the same vectors - consider deleting one.
for doc in tqdm(data_chunk):
    doc['header_vector'] = model.encode(doc['title'])
    doc['main_content_vector'] = model.encode(doc['info'])

In [None]:
# Index every chunk (including any vector fields added to it) into the index
# named by `index_name_vector`, one request per chunk.
# NOTE(review): `index_name_vector` is not assigned in any cell visible here,
# and no index with vector field mappings is created above - confirm both
# exist before running this cell, otherwise it fails with NameError.
for doc in tqdm(data_chunk):
    es_client.index(index=index_name_vector, document=doc)

## LLM SETUP

In [55]:
from openai import OpenAI
from google import genai
# Gemini client built with default arguments.
# NOTE(review): no API key is passed to either client - presumably both pick
# up their credentials from environment variables; confirm before running.
client_gemini = genai.Client()
# OpenAI-SDK client pointed at the OpenRouter endpoint instead of the SDK's default base URL.
client_openai = OpenAI(base_url="https://openrouter.ai/api/v1")

In [56]:
def llm_OpenAI(prompt):
    """Send ``prompt`` as a single user message through ``client_openai``.

    Uses the chat-completions endpoint with the
    ``deepseek/deepseek-chat-v3-0324:free`` model.

    Args:
        prompt: The full prompt text.

    Returns:
        The message content of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client_openai.chat.completions.create(
        model="deepseek/deepseek-chat-v3-0324:free",
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def llm_gemini(prompt):
    """Generate a completion for ``prompt`` with ``client_gemini``.

    Calls ``models.generate_content`` with the ``gemini-2.0-flash`` model.

    Args:
        prompt: The full prompt text.

    Returns:
        The ``text`` attribute of the generation response.
    """
    request = {"model": "gemini-2.0-flash", "contents": prompt}
    generation = client_gemini.models.generate_content(**request)
    return generation.text

In [30]:
def prompt_builder(query, search_results):
        """Build the LLM prompt from the user's question and the retrieved documents.

        The question is embedded under ``QUESTION:`` so the model knows what it
        is being asked; without it the prompt would only contain the context.

        Args:
            query: The user's question.
            search_results: Documents to ground the answer on; each is a dict
                with ``title``, ``info`` and ``url`` keys.

        Returns:
            The formatted prompt string, stripped of surrounding whitespace.
        """
        prompt_template = """You are an AI assistant designed to help students of NIT Warangal (NITW) by answering their questions accurately and responsibly.
        
        You are provided with CONTEXT retrieved from trusted NITW sources. Your job is to:
        
        1. Answer the user's question using ONLY the provided context.
        2. Do NOT use any outside or prior knowledge, even if you know the answer.
        3. If the answer is not present in the context, respond with:
        "I could not find a verified answer to that in the available information. Please refer to official sources or contact the relevant NITW department by visiting nitw.ac.in."
        4. ALWAYS cite the source(s) used using the url provided format.
        
        URL: {url}
        
        CONTEXT: {context}   
        
        QUESTION: {question}
        
        INSTRUCTIONS:
        - Only use facts from the context.
        - Do not assume or generate information not backed by the context.
        - Make your response clear and concise.
        - At the end of your answer, include a reference to the source like:
        (SOURCE: {url})
        """.strip()

        # Collect the pieces in one pass, then join once instead of growing
        # two strings by repeated concatenation.
        context_parts = []
        source_parts = []
        for doc in search_results:
            context_parts.append(f"title: {doc['title']}\ninfo: {doc['info']}\n\n")
            source_parts.append(f"url: {doc['url']}\n")

        prompt = prompt_template.format(
            url="".join(source_parts),
            context="".join(context_parts),
            question=query,
        ).strip()
        return prompt

In [57]:
def rag(query):
    """Answer ``query`` with retrieval-augmented generation.

    Retrieves documents with ``elastic_search``, turns them into a prompt
    with ``prompt_builder`` and returns the text produced by ``llm_gemini``.

    Args:
        query: The user's question.

    Returns:
        The LLM's answer text.
    """
    retrieved_docs = elastic_search(query)
    return llm_gemini(prompt_builder(query, retrieved_docs))

In [58]:
# Sample question used to exercise the RAG pipeline end to end.
# Written with implicit string concatenation so the long text can wrap in the
# source without injecting a newline into the middle of a word ("pl\nease").
query = (
    "I am an upcoming Phd Student at NIT warangal, can you please let me know "
    "how to the registration and document uploading on SMILE ERP portal for Ph.D?"
)

In [59]:
# Run the full pipeline (search -> prompt -> LLM) on the sample question.
llm_response = rag(query)

In [61]:
# Display the generated answer as the cell output.
llm_response

'To register and upload documents on the SMILE ERP portal, follow these steps:\n\n1.  Go to erp.nitw.ac.in/ext/adm/login.\n2.  Enter your Email, Mobile No, Full Name & Date of Birth (as provided during application).\n3.  Enter the Token and OTP sent to your registered email to create and confirm your password (Alphanumeric only).\n4.  Sign in using your Email ID and Password.\n5.  Select your concerned advertisement and click on "Apply," then select your program and put choice as “1”.\n6.  Verify your email using the Email Token and OTP.\n7.  After confirmation, click on “My Applications” then click on “View” to proceed.\n8.  Fill in all details by selecting each left panel (Personal, Education, Qualifying, Payment Details, Document upload and Freeze application).\n9.  Pay the Institute fee and Hostel fee (if applicable).\n10. Upload all the required documents.\n11. Freeze the application once each section is completed. After the application is freezed, check all sections must be chang

In [None]:
def evaluate_relevance(question: str, answer_llm: str) -> dict:
    """Ask an LLM judge how relevant ``answer_llm`` is to ``question``.

    The judge is prompted to reply with a JSON object holding ``Relevance``
    (``NON_RELEVANT``, ``PARTLY_RELEVANT`` or ``RELEVANT``) and ``Explanation``.
    ROUGE F-scores between the answer and the question are added to the result.

    Args:
        question: The question that was asked.
        answer_llm: The generated answer to evaluate.

    Returns:
        A dict with ``Relevance`` and ``Explanation``. When the judge's reply
        cannot be parsed as JSON, is not a JSON object, or lacks a required
        field, ``Relevance`` is ``"ERROR"`` and no ROUGE keys are present.
        Otherwise the dict also carries ``ROUGE_1``, ``ROUGE_2``, ``ROUGE_3``
        and ``ROUGE_L``; ``ROUGE_3`` holds the ROUGE-L score and is kept under
        that name for existing consumers.
    """
    prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

    # Format the prompt with the question and answer
    evaluation_prompt = prompt2_template.format(question=question, answer_llm=answer_llm)

    # Get the evaluation from the LLM. llm_gemini is the helper rag() uses;
    # no `llm_haiku` helper is defined in this notebook.
    evaluation_response = llm_gemini(evaluation_prompt)

    # Models sometimes wrap the JSON in a Markdown code fence despite the
    # instruction not to; unwrap it so a valid payload is not rejected.
    # `or ""` covers a reply with no text at all.
    raw_json = (evaluation_response or "").strip()
    if raw_json.startswith("```"):
        raw_json = raw_json.strip("`").strip()
        if raw_json.lower().startswith("json"):
            raw_json = raw_json[len("json"):].strip()

    # Parse the JSON response
    try:
        evaluation_result = json.loads(raw_json)
    except json.JSONDecodeError:
        # If JSON parsing fails, return an error result
        return {
            "Relevance": "ERROR",
            "Explanation": "Failed to parse LLM response as JSON"
        }

    # Validate the structure of the parsed result. Valid JSON that is not an
    # object (a list, string, number...) cannot carry the expected fields.
    if (
        not isinstance(evaluation_result, dict)
        or "Relevance" not in evaluation_result
        or "Explanation" not in evaluation_result
    ):
        return {
            "Relevance": "ERROR",
            "Explanation": "LLM response does not contain expected fields"
        }

    # Validate the Relevance value
    if evaluation_result["Relevance"] not in ["NON_RELEVANT", "PARTLY_RELEVANT", "RELEVANT"]:
        evaluation_result["Relevance"] = "ERROR"
        # Formatting instead of `+=` keeps this working when the judge
        # returned a non-string Explanation.
        evaluation_result["Explanation"] = (
            f'{evaluation_result["Explanation"]} (Invalid Relevance value)'
        )

    # NOTE(review): `Rouge` is not imported in any cell visible here -
    # presumably it comes from the third-party `rouge` package
    # (`from rouge import Rouge`); confirm and add it to the import cell.
    rouge = Rouge()
    # NOTE(review): the answer is scored against the *question*, not against a
    # reference answer - confirm this is the intended comparison.
    scores = rouge.get_scores(answer_llm, question)
    f_scores = scores[0]
    # Add ROUGE F-scores to the evaluation result
    evaluation_result["ROUGE_1"] = f_scores['rouge-1']['f']
    evaluation_result["ROUGE_2"] = f_scores['rouge-2']['f']
    # 'ROUGE_3' has always stored the ROUGE-L score; keep the legacy key and
    # expose the same value under an accurate name as well.
    evaluation_result["ROUGE_3"] = f_scores['rouge-l']['f']
    evaluation_result["ROUGE_L"] = f_scores['rouge-l']['f']

    return evaluation_result
     


In [None]:
# Judge a small slice of the ground-truth questions (rows 5, 6 and 7) and
# collect one result dict per question.
# `version` is kept as a label for this run; rag() takes only the query, so it
# is called with the question alone (passing `version` as a second positional
# argument raises TypeError).
version = "text+simple"
relevance_list = []
for i in range(5, 8):
    question = ground_truth[i]["question"]
    answer = rag(question)
    score = evaluate_relevance(question, answer)
    # Keep the inputs next to the verdict so each record is self-describing.
    score["question"] = question
    score["answer"] = answer
    relevance_list.append(score)