In [1]:
%%capture --no-stderr
%pip install numpy==1.26.4
%pip install torch==2.2.1
%pip install transformers==4.47.0
%pip install scipy
%pip install pandas langchain langchain-community langchain-chroma
%pip install accelerate
%pip install tiktoken
%pip install unstructured python-docx pypdf
%pip install unstructured-inference

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [16]:
# Hangging Face Token for downloading meta Llama 2 7b model

HF_TOKEN = input("Enter your Hugging Face token: ")

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms import HuggingFacePipeline
import torch
from langchain_chroma import Chroma
from langchain.text_splitter import TokenTextSplitter
import tiktoken
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import (CSVLoader,PyPDFLoader,UnstructuredWordDocumentLoader,UnstructuredExcelLoader)
from pathlib import Path
import re

In [5]:
def load_documents_from_folder(folder_path: str):
    docs = []
    folder = Path(folder_path)

    for file in folder.iterdir():
        if file.suffix == ".pdf":
            loader = PyPDFLoader(file_path=str(file))
        elif file.suffix == ".docx":
            loader = UnstructuredWordDocumentLoader(file_path=str(file))
        elif file.suffix == ".csv":
            loader = CSVLoader(file_path=str(file))
        elif file.suffix == ".xlsx":
            loader = UnstructuredExcelLoader(file_path=str(file))
        else:
            print(f"Unsupported file format: {file.name}")
            continue

        loaded_docs = loader.load()
        print(f"Loaded {len(loaded_docs)} docs from {file.name}")
        docs.extend(loaded_docs)

    return docs

In [6]:
def preprocess_text(text):

    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = re.sub(r'[\r\n]+', ' ', text)
    text = re.sub(r'\\n', '', text)
    text = text.lower()

    return text

In [7]:
docs = load_documents_from_folder("/kaggle/input/assessment")
print(f"Total documents loaded: {len(docs)}")

Loaded 1 docs from M.Sc. Applied Psychology.docx
Loaded 136 docs from The-Alchemist.pdf
Loaded 16 docs from The_Plan_of_the_Giza_Pyramids.pdf
Loaded 10 docs from new-approaches-and-procedures-for-cancer-treatment.pdf
Loaded 1 docs from Loan analysis.xlsx
Loaded 1 docs from Stats.docx
Loaded 1 docs from party budget1.xlsx
Loaded 1 docs from Loan amortisation schedule1.xlsx
Loaded 52 docs from Ocean_ecogeochemistry_A_review.pdf
Loaded 1 docs from Dataset summaries and citations.docx
Total documents loaded: 220


In [8]:
print(f'Before preprocces"{docs[0].page_content[:200]}')

for doc in docs:
  preprocessed_text = preprocess_text(doc.page_content)
  doc.page_content = preprocessed_text
print("Preproccessing Finshied")

print(f'After preprocces"{docs[0].page_content[:200]}')

Before preprocces"M.sc., applied psychology SYLLABUS from the acadmic year 2023-2024 TAMILNADU STATE COUNCIL FOR HIGHER EDUCATION,  CHENNAI – 600 005

TANSCHE REGULATIONS ON LEARNING OUTCOMES-BASED CURRICULUM FRAMEWORK
Preproccessing Finshied
After preprocces"m.sc., applied psychology syllabus from the acadmic year 2023-2024 tamilnadu state council for higher education, chennai  600 005 tansche regulations on learning outcomes-based curriculum framework fo


In [9]:
token_splitter = TokenTextSplitter(
    encoding_name="cl100k_base",
    chunk_size=200,
    chunk_overlap=30
)

chunked_documents = []

for doc in docs:
    chunks = token_splitter.split_text(doc.page_content)
    for i, chunk in enumerate(chunks, 1):
        chunked_doc = {
            "file_name": doc.metadata.get("source", "unknown"),
            "page_number": doc.metadata.get("page", "unknown"),
            "chunk_number": i,
            "text": chunk
        }
        chunked_documents.append(chunked_doc)

# Output for frist three docs
for item in chunked_documents[:2]:
    print(item)

{'file_name': '/kaggle/input/assessment/M.Sc. Applied Psychology.docx', 'page_number': 'unknown', 'chunk_number': 1, 'text': 'm.sc., applied psychology syllabus from the acadmic year 2023-2024 tamilnadu state council for higher education, chennai  600 005 tansche regulations on learning outcomes-based curriculum framework for postgraduate education programme m.sc., applied psychology programme code duration pg-2 years programme outcomes (pos) po1: problem solving skill apply knowledge of management theories and human resource practices to solve business problems through research in global context. po2: decision making skill foster analytical and critical thinking abilities for data-based decision-making. po3: ethical value ability to incorporate quality, ethical and legal value-based perspectives to all organizational activities. po4: communication skill ability to develop communication, managerial and interpersonal skills. po5: individual and team leadership skill capability to lead t

In [10]:
documents_with_metadata = [
    Document(page_content=item["text"], metadata={
        "source": item["file_name"],
        "page": item["page_number"],
        "chunk": item["chunk_number"]
    })
    for item in chunked_documents
]

In [11]:
device = "cuda" if torch.cuda.is_available() else "cpu"

embedding_model = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1",
    model_kwargs={"trust_remote_code": True,
                 "device": device}
)

persist_directory = "chroma_store"

vectorstore = Chroma.from_documents(
    documents=documents_with_metadata,
    embedding=embedding_model,
    persist_directory=persist_directory
)

  embedding_model = HuggingFaceEmbeddings(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/128 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/71.2k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

configuration_hf_nomic_bert.py:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py:   0%|          | 0.00/103k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/547M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

In [12]:
model_id = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    use_auth_token = HF_TOKEN
)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1
)

llm = HuggingFacePipeline(pipeline=pipe)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=pipe)


In [13]:
retrieve = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 2})

In [14]:
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retrieve,
    memory=memory,
    return_source_documents=False
)

  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


In [15]:
def new_session():
    new_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    qa_chain.memory = new_memory
    print("\n🔄 New context started!")

# Chat loop
while True:
    query = input("You: ")
    if query.lower() == "reset":
        new_session()
        continue
    elif query.lower() in ["exit", "quit"]:
        break
    response = qa_chain.invoke({"question": query})
    print("Bot:", response["answer"])

You:  what is set cell therapy


Bot: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

, there are challenges, such as therapeutic dose control, low cell targeting, and retention in tumor sites, that should be investigated and over- come in the future. in addition, existing results from stem cell technologies are highly encouraging for tumor treatment but it still needs further efforts to improve the safety and effi- cacy before they could enter clinical trials. table 1 summa- rized the licensed list of stem cell therapies. targeted drug therapy targeted cancer therapies are drugs or other substances which are sometimes interchangeably used as molecularly targeted drugs, molecularly targeted therapies, and pre- cision medicines. those drugs mechanism of action is by interfering with growth molecules which leads to blocking the growth and spreading of cancer.34 tumor initiation and progression are determin

You:  what is autoimmunity


Bot: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

, there are challenges, such as therapeutic dose control, low cell targeting, and retention in tumor sites, that should be investigated and over- come in the future. in addition, existing results from stem cell technologies are highly encouraging for tumor treatment but it still needs further efforts to improve the safety and effi- cacy before they could enter clinical trials. table 1 summa- rized the licensed list of stem cell therapies. targeted drug therapy targeted cancer therapies are drugs or other substances which are sometimes interchangeably used as molecularly targeted drugs, molecularly targeted therapies, and pre- cision medicines. those drugs mechanism of action is by interfering with growth molecules which leads to blocking the growth and spreading of cancer.34 tumor initiation and progression are determin

You:  reset



🔄 New context started!


You:  what is loan amount in loan analysis sheet


Bot: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

loan analysis worksheet loan analysis rate slicer to filter table data based on rates is in this cell. interest rate 0.05 monthly payment 106.065515 years of loan 10 total payment 12727.861829 loan amount 10000 total interest 2727.861829 payments due end of period years rate 3 5 10.000000 12 15.000000 20.000000 25.000000 30.000000 0.02 286.425787 175.277601 92.013454 78.168369 64.350870 50.588334 42.385434 36.961947 0.0225 287.518475 176.373448 93.137372 79.305498 65.508477 51.780829 43.613070 38.224610 0.025 288.613757 177.473616

loan amortisation schedule enter values loan summary loan amount 5000 scheduled payment 425.749521 annual interest rate 0.04 scheduled number of payments 12 loan period in years 1 actual number of payments 10 number of payments per year 12 total early payments 900 start date of loan 2025-04-0

You:  what is total loan amount


Bot: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

loan analysis worksheet loan analysis rate slicer to filter table data based on rates is in this cell. interest rate 0.05 monthly payment 106.065515 years of loan 10 total payment 12727.861829 loan amount 10000 total interest 2727.861829 payments due end of period years rate 3 5 10.000000 12 15.000000 20.000000 25.000000 30.000000 0.02 286.425787 175.277601 92.013454 78.168369 64.350870 50.588334 42.385434 36.961947 0.0225 287.518475 176.373448 93.137372 79.305498 65.508477 51.780829 43.613070 38.224610 0.025 288.613757 177.473616

loan amortisation schedule enter values loan summary loan amount 5000 scheduled payment 425.749521 annual interest rate 0.04 scheduled number of payments 12 loan period in years 1 actual number of payments 10 number of payments per year 12 total early payments 900 start date of loan 2025-04-0

You:  what is total interst paid


Bot: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

loan analysis worksheet loan analysis rate slicer to filter table data based on rates is in this cell. interest rate 0.05 monthly payment 106.065515 years of loan 10 total payment 12727.861829 loan amount 10000 total interest 2727.861829 payments due end of period years rate 3 5 10.000000 12 15.000000 20.000000 25.000000 30.000000 0.02 286.425787 175.277601 92.013454 78.168369 64.350870 50.588334 42.385434 36.961947 0.0225 287.518475 176.373448 93.137372 79.305498 65.508477 51.780829 43.613070 38.224610 0.025 288.613757 177.473616

loan amortisation schedule enter values loan summary loan amount 5000 scheduled payment 425.749521 annual interest rate 0.04 scheduled number of payments 12 loan period in years 1 actual number of payments 10 number of payments per year 12 total early payments 900 start date of loan 2025-04-0

You:  exit
