In [1]:
%pip install transformers huggingface_hub streamlit PyPDF2 langchain langchain-community langchain_chroma sentence-transformers

Collecting streamlit
  Downloading streamlit-1.35.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting langchain
  Downloading langchain-0.2.5-py3-none-any.whl.metadata (7.0 kB)
Collecting langchain-community
  Downloading langchain_community-0.2.5-py3-none-any.whl.metadata (2.5 kB)
Collecting langchain_chroma
  Downloading langchain_chroma-0.1.1-py3-none-any.whl.metadata (1.3 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.1-py3-none-manylinux2014_x86_64.whl.metadata (37 kB)
Collecting langchain-core<0.3.0,>=0.2.7 (from langchain)
  Downloading langchain_core-0.2.9-py3-none-any.whl.metadata (6.0 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchai

In [2]:
import os
import streamlit as st
from PyPDF2 import PdfReader
from langchain_chroma import Chroma
from langchain.prompts import PromptTemplate
from langchain_community.llms import HuggingFaceHub
from langchain.chains.question_answering import load_qa_chain
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings

In [4]:
def get_pdf_text(pdf_docs):
    text=""
    for pdf in pdf_docs:
        pdf_reader= PdfReader(pdf)
        for page in pdf_reader.pages:
            text+= page.extract_text()
    return  text

In [5]:
def get_text_chunks(text):
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
  texts = text_splitter.split_text(text)
  return texts

In [6]:
def get_vector_store(text_chunks):
  vector_store = Chroma.from_texts(
      texts=text_chunks,
      embedding=HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2'),
      persist_directory="./chroma_db"
  )

In [7]:
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
def get_conversational_chain():
  prompt_template = """
  Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
  provided context just say, "answer is not available in the context", don't provide the wrong answer\n\n
  Context:\n {context}?\n
  Question: \n{question}\n

  Answer:
  """

  prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
  model_kwargs = {"max_new_tokens":5000}
  model_id = "google/gemma-2b"
  tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'])
  model = AutoModelForCausalLM.from_pretrained(model_id, token=os.environ['HF_TOKEN'])
  model.to("cuda")
  pipe = pipeline(
      "text-generation", model=model, tokenizer=tokenizer, max_new_tokens=100,device = 0
  )
  hf = HuggingFacePipeline(pipeline=pipe)
  chain = load_qa_chain(hf, chain_type="stuff", prompt=prompt)
  return chain

2024-06-19 08:41:27.845723: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-19 08:41:27.845833: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-19 08:41:27.959567: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [8]:
def user_input(user_question):
  new_db = Chroma(persist_directory="./chroma_db", embedding_function = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2'))
  docs = new_db.similarity_search(user_question,k=3)
  chain = get_conversational_chain()
  response = chain(
      {"input_documents":docs, "question": user_question}
      ,return_only_outputs=True)

  print(response)
  # st.write("Reply: ", response["output_text"])
  return response

In [9]:
# def main():
#   st.set_page_config("ChatBot with custom PDF")
#   st.header("Chat with PDF using Gemma-7b")

#   user_question = st.text_input("Ask a Question from the PDF Files")

#   if user_question:
#       user_input(user_question)

#   with st.sidebar:
#       st.title("Menu:")
#       pdf_docs = st.file_uploader("Upload your PDF Files and Click on the Submit & Process Button", accept_multiple_files=True)
#       if st.button("Submit & Process"):
#           with st.spinner("Processing..."):
#               raw_text = get_pdf_text(pdf_docs)
#               text_chunks = get_text_chunks(raw_text)
#               get_vector_store(text_chunks)
#               st.success("Done")



# if __name__ == "__main__":
#   main()

In [10]:
pdf_files = ["/kaggle/input/pdf-helper/IITM BS Degree Programme - Student Handbook - Latest.pdf"]


In [11]:
raw_text = get_pdf_text(pdf_files)
text_chunks = get_text_chunks(raw_text)
get_vector_store(text_chunks)

  warn_deprecated(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [15]:
res = user_input('what are the foundation course in iitm?')



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

{'output_text': '\n  Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in\n  provided context just say, "answer is not available in the context", don\'t provide the wrong answer\n\n\n  Context:\n 18\nCredit\nTransfer\nfrom\ncourses\nthat\nare\nconducted\non\nIITM\ncampus\n(**This\nwill\nbe\nin\neffect \nfrom\nJanuary\n2024**)\n19\n10.\nLearner\nLife\nCycle\n20\n11.\nDesign\nof\ncertificates\nfor\nthe\n4\nlevels\nof\nthe\nprogram\n2111.1\nMark\ntranscripts\n25\n12.\nAcademic\naspects\n25\n12.1\nAcademic\ncalendar\n25\n12.2\nCourse\nstructure\nsuitable\nfor\nonline\ndelivery\n27\n12.3\nEvaluation\nand\ngrading\nof\ncourses\nin\nevery\nterm\n27\n12.4\nPass\ncriteria\nfor\neach\ncourse\n28\n12.5\nRepeating\na\ncourse\n29\n12.5.1\nRegistering\nfor\nCourses:\nTerm\n2\nOnwards\n30\n12.5.2\nRepeating\na\nCourse\n-\nWA,\nWQ,\nU\nor\nI\ngrade\n30\n12.5.3\nRepeating\na\nCourse\nfor\nImprovement\n30\n12.5.4\nMake\nUp\n

In [16]:
print(res['output_text'])


  Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
  provided context just say, "answer is not available in the context", don't provide the wrong answer


  Context:
 18
Credit
Transfer
from
courses
that
are
conducted
on
IITM
campus
(**This
will
be
in
effect 
from
January
2024**)
19
10.
Learner
Life
Cycle
20
11.
Design
of
certificates
for
the
4
levels
of
the
program
2111.1
Mark
transcripts
25
12.
Academic
aspects
25
12.1
Academic
calendar
25
12.2
Course
structure
suitable
for
online
delivery
27
12.3
Evaluation
and
grading
of
courses
in
every
term
27
12.4
Pass
criteria
for
each
course
28
12.5
Repeating
a
course
29
12.5.1
Registering
for
Courses:
Term
2
Onwards
30
12.5.2
Repeating
a
Course
-
WA,
WQ,
U
or
I
grade
30
12.5.3
Repeating
a
Course
for
Improvement
30
12.5.4
Make
Up
Exams
30
12.6
Dropping
a
course
(This
is
not
applicable
for
the
students
who
are
continuing
after 
qualifier
in
the
same
term)
32
13

In [14]:
!streamlit run app.py

  pid, fd = os.forkpty()


Usage: streamlit run [OPTIONS] TARGET [ARGS]...
Try 'streamlit run --help' for help.

Error: Invalid value: File does not exist: app.py
