In [6]:
import numpy as np
import os
import torch
import PyPDF2
from typing import List
from transformers import AutoTokenizer,AutoModel,pipeline
import faiss

In [2]:
embedding_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
llm_name = 'google/flan-t5-large'
tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name)
llm = pipeline('text-generation',model=llm_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNe

Initializing the vector database

In [7]:
embedding_dimensions = 384
index = faiss.IndexFlatL2(embedding_dimensions)
document_store = []

In [8]:
def create_embedding(text:str) -> np.ndarray:
    inputs = tokenizer(text,return_tensors='pt',truncation = True,padding=True)
    with torch.no_grad():
        outputs = embedding_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

In [9]:
def load_pdfs(pdf_files: list[str]):
    global document_store
    for pdf_file in pdf_files:
        with open(pdf_file,'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = ""
            for page in reader.pages:
                text += page.extract_text() + '\n'
            
            embedding = create_embedding(text)
            index.add(embedding)
            document_store.append(text)


In [10]:
def query_rag(query:str):
    query_embedding = create_embedding(query)
    D,I = index.search(query_embedding, k=4)
    results = [document_store[i] for i in I[0]]
    context = "\n\n".join(results)

    input_text = f"Context: {context}\n\nQuestion: {query}:"
    response = llm(input_text,max_length = 500, do_sample =False,truncation =True)
    return response[0]['generated_text']

Loading the data

In [32]:
folder_path = r'C:\\Users\\JAGRIT BHARATI\Desktop\\RAGify\\ragvenv\\PDFfiles'
pdf_files = []

In [37]:
import glob
pdf_files = glob.glob(os.path.join(folder_path, '*.pdf'))
print(pdf_files)


['C:\\\\Users\\\\JAGRIT BHARATI\\Desktop\\\\RAGify\\\\ragvenv\\\\PDFfiles\\2022 Batch  B.Tech Second Semester Examination-June 2024 (Supplementary).pdf', 'C:\\\\Users\\\\JAGRIT BHARATI\\Desktop\\\\RAGify\\\\ragvenv\\\\PDFfiles\\2022 Batch-B.Tech First Semester Examination-June-2024(Supplementary).pdf', "C:\\\\Users\\\\JAGRIT BHARATI\\Desktop\\\\RAGify\\\\ragvenv\\\\PDFfiles\\Circular on Guidelines for Mock Interview-Dos and Don'ts for Effective Preparation and Performance.pdf", 'C:\\\\Users\\\\JAGRIT BHARATI\\Desktop\\\\RAGify\\\\ragvenv\\\\PDFfiles\\Circular on Hackathon.pdf', 'C:\\\\Users\\\\JAGRIT BHARATI\\Desktop\\\\RAGify\\\\ragvenv\\\\PDFfiles\\Circular on Honeywell Hackathon 0n April 16th and 17th, 2024.pdf', 'C:\\\\Users\\\\JAGRIT BHARATI\\Desktop\\\\RAGify\\\\ragvenv\\\\PDFfiles\\Circular-Important Dates (for School of CSE 2021, 2022 and 2023 Intake).pdf', 'C:\\\\Users\\\\JAGRIT BHARATI\\Desktop\\\\RAGify\\\\ragvenv\\\\PDFfiles\\Circular-Placement training, Dates, Guidelines (

In [38]:
def rag_chatbot():
    load_pdfs(pdf_files)

    print("RAG-based Chatbot initialized. Ask questions based on the documents.\n")
    
    while True:
        query = input("You: ")
        if query.lower() == "exit":
            print("Chatbot: Goodbye!")
            break

        answer = query_rag(query)
        print(f"\nChatbot: {answer}\n")

In [39]:
rag_chatbot()

RAG-based Chatbot initialized. Ask questions based on the documents.



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



Chatbot: Context: 1. Date: April 16th and 17th, 2024 School ot Computer Science and Engineering. Faculty of Engineering and Technology and the Fire And 
Combustion Research Center, in collaboration with Honeywell. will be hosting a 24-hour Hackathon on April 
l6th and l17th, 2024. This event is open to all engineering students across all years of study. 
Event Details: 
2. Venue: The Global Campus -JAIN (Deemed-to-be-University) 
Prizes: Circular on Honeywell Hackathon G)JAIN 
Event Deseription: The Hackathon will be centered around a set of problem statements provided by 
Honeywell, offering participants the opportunity to address specific industry challenges. During the event. 
participants will compete to generate ideas, concepts, and build models and prototypes in response to the 
problem statements. 
Mentorship and Support: The event will be graced by Fellows, Senior Technologists, and Engineers from 
Honeywell. who will be available to provide guidance, mentorship, and technical