## Import Libraries

In [167]:
import os
import sys


## Read The PDF

In [168]:

# Path of the document to index; a relative path, later passed to extract().
data='data.pdf'


In [169]:
# Echo the configured path as a quick sanity check.
data

'data.pdf'

## Extract PDF data

In [170]:
import PyPDF2
from docx import Document

In [171]:
def extract_text_pdf(pdf):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf: The PDF source, handed unchanged to ``PyPDF2.PdfReader``
            (this notebook passes a file path).

    Returns:
        The page texts concatenated in page order with no separator added.
        A page whose ``extract_text()`` result is falsy (for example
        ``None``) contributes an empty string.
    """
    pages = PyPDF2.PdfReader(pdf).pages
    # `or ""` guards against pages that yield no text at all.
    return "".join(page.extract_text() or "" for page in pages)


In [172]:
def extract_text_docx(docx):
    """Return the paragraph text of a Word document as one string.

    Args:
        docx: The document source, handed unchanged to ``docx.Document``.

    Returns:
        Every paragraph's ``text`` in document order, each followed by a
        newline (so a non-empty result always ends with ``"\\n"``).
    """
    paragraphs = Document(docx).paragraphs
    return "".join(para.text + "\n" for para in paragraphs)

In [173]:
def extract(doc):
    """Extract the text of a PDF or DOCX file, dispatching on its extension.

    The extension is taken with ``os.path.splitext`` so that paths holding
    extra dots (``report.v2.pdf``, ``./files/data.pdf``) or no dot at all are
    handled instead of failing while unpacking ``doc.split('.')``. The
    comparison is case-insensitive, so ``DATA.PDF`` is accepted as well.

    Args:
        doc: Path of the file to read, as a string.

    Returns:
        The text produced by ``extract_text_pdf`` or ``extract_text_docx``.

    Raises:
        ValueError: If the extension is anything other than ``pdf`` or
            ``docx`` (a missing extension included).
    """
    # splitext keeps the leading dot (".pdf"); drop it and normalise the case.
    file_extension = os.path.splitext(doc)[1][1:].lower()
    # Kept from the original: shows which extension was detected.
    print(file_extension)
    if file_extension == 'pdf':
        return extract_text_pdf(doc)
    elif file_extension == 'docx':
        return extract_text_docx(doc)
    else:
        raise ValueError("Unsupported file format")

In [174]:
# Pull the raw text out of the configured document; extract() also prints
# the detected file extension, which is the "pdf" line in the output.
text_data=extract(data)

pdf


In [175]:
#print(text_data)   # uncomment and run this to view the PDF in text format

## Divide text data into Chunks

In [176]:
# Code here
from langchain.text_splitter import CharacterTextSplitter

In [177]:
# Splitter used to cut the extracted text into overlapping chunks.
# Text is split on newlines; chunk_size and chunk_overlap are measured with
# length_function, which is len here, so both are counts of characters
# (not model tokens).
text_splitter=CharacterTextSplitter(
    separator="\n",
    chunk_size=1300,
    chunk_overlap=300,
    length_function=len,
)

# The bare string below is an expression, not a comment: as the last
# statement of the cell its value is echoed as the cell output.
''' important, GPT-3.5 has a maximum token limit of 4096 tokens per input sequence. This means that any 
 input longer than 4096 tokens would need to be split into multiple segments for processing. '''

' important, GPT-3.5 has a maximum token limit of 4096 tokens per input sequence. This means that any \n input longer than 4096 tokens would need to be split into multiple segments for processing. '

In [178]:
# Split the extracted text into chunks using the splitter configured above.
chunks=text_splitter.split_text(text_data)

In [179]:
# How many chunks the split produced.
len(chunks)

61

In [180]:
# Show the final chunk. Indexing with -1 rather than a fixed position keeps
# this cell working if a different document or splitter setting changes the
# number of chunks.
print(chunks[-1])

(a) A minimum  GPA  of 3.30  for Honours  Degree  Course  Units,  
(b) A minimum  GPA  of 3.30  for all course  units,  and 
(c) Grades  of A- or better  for Honours  Degree  Course  Units  aggregating  to a mini - 
mum of 12 credits  23  (iii) Second  Class  (Lower  Division)  
(a) A minimum  GPA  of 3.00  for Honours  Degree  Course  Units,  and 
(b) A minimum  GPA  of 3.00  for all course  units


## Text Embedding

In [181]:
import os
# OpenAIEmbeddings was used below without ever being imported, which raises
# NameError on a fresh kernel (Restart & Run All).
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from dotenv import load_dotenv,find_dotenv
# Load variables from the nearest .env file into the process environment.
load_dotenv(find_dotenv())
# NOTE(review): the variable read here is OPEN_API_KEY (not OPENAI_API_KEY);
# confirm this matches the name used in the .env file. A missing variable
# yields None rather than an error.
api_key=os.environ.get("OPEN_API_KEY")
# Embedding model shared by the vector store and the by-vector query below.
embedding=OpenAIEmbeddings(openai_api_key=api_key)

In [182]:
#db=FAISS.from_texts(texts=chunks,embedding=embedding)

In [183]:
# Reuse a previously saved FAISS index when one exists; otherwise build it.
# Only the presence of the pickle file is checked, so a saved index is reused
# even if the source document or the chunking settings have changed since;
# delete the "vectorstore" directory to force a rebuild.
if not os.path.exists("vectorstore/index.pkl"):
    # No saved index yet: embed every chunk and persist the result.
    db = FAISS.from_texts(texts=chunks, embedding=embedding)
    db.save_local("vectorstore")
else:
    # SECURITY: allow_dangerous_deserialization=True opts in to unpickling
    # the saved index; only load a "vectorstore" directory this notebook
    # created itself, never one from an untrusted source.
    db = FAISS.load_local("vectorstore", embedding,allow_dangerous_deserialization=True)

## Similarity Check

In [184]:
# Alternative question about the degree requirements (currently disabled):
#query = "what are the Minimum Requirement for the Completion of the Bachelor of Computer Science Honours Degree ?"
# Active question; judging by the answer generated at the end of the notebook
# it is not covered by the indexed document.
query = "who is the prime minister of srilanka"
# Retrieve the stored chunks closest to the query. No result count is passed,
# so the library default applies.
docs = db.similarity_search(query)

In [185]:
# Number of chunks returned by the search.
len(docs)

4

In [186]:
# Show the first returned document.
print(docs[0])

page_content='for a career in Computer Science and Information Technology, which is one of the major 
driving forces of the economic development of Sri Lanka . This degree programme will cover 
all aspects of Computer Science including modern computer languages and systems.  
 
1.1 Research  Areas  
• Text Mining and Text Classification Data Mining, Rule Extraction and Knowledge 
Representation Applications of Machine Learni ng Techniques  
• Parallel computing:  developing dynamic load balancing algorithms on homogeneous 
and heterogeneous clusters of workstations  
• Intelligent  Information  Retrieval:  Pattern  Recognition,  Fuzzy  clustering,  Data  Min- 
ing/ Web Mining, Conceptual Indexin g and Similarity Search in text data, Context  
Based Clustering  
• Conceptual  Modelling,  Process  Modelling  and Process  Patterns,  Formal  Specifica - 
tion of Processes, e -Commerce Standardization, Service Oriented Computing, Business 
Rule Modeling  
• Network  Monitor ing and Acquirin

In [187]:
# Same search, but each result is a (Document, score) tuple.
# NOTE: this rebinds `docs` to a different element type than the cell above
# produced, so later cells depend on which search ran last.
# NOTE(review): for FAISS the score is presumably a distance (lower = closer);
# confirm against the LangChain documentation.
docs = db.similarity_search_with_score(query)

In [188]:
# First (Document, score) pair.
docs[0]

(Document(page_content='for a career in Computer Science and Information Technology, which is one of the major \ndriving forces of the economic development of Sri Lanka . This degree programme will cover \nall aspects of Computer Science including modern computer languages and systems.  \n \n1.1 Research  Areas  \n• Text Mining and Text Classification Data Mining, Rule Extraction and Knowledge \nRepresentation Applications of Machine Learni ng Techniques  \n• Parallel computing:  developing dynamic load balancing algorithms on homogeneous \nand heterogeneous clusters of workstations  \n• Intelligent  Information  Retrieval:  Pattern  Recognition,  Fuzzy  clustering,  Data  Min- \ning/ Web Mining, Conceptual Indexin g and Similarity Search in text data, Context  \nBased Clustering  \n• Conceptual  Modelling,  Process  Modelling  and Process  Patterns,  Formal  Specifica - \ntion of Processes, e -Commerce Standardization, Service Oriented Computing, Business \nRule Modeling  \n• Network 

In [189]:
# Second (Document, score) pair.
docs[1]

(Document(page_content='Lecturer  \n \n \n \n \n \n \n \nProbationary \nLecturer  Dr. W. A. Mohotti  \nB.Sc.  IT (Mo ratuwa, SL) \nM.Sc.  in IT (Moratuwa,  SL) \nPh.D.  (QUT,  Australia)  Data  Mining  and Machine  Learning,  \nText Clustering, Outlier Detection, and \nCluster Evolution, Social Media Analytics  \nDr. D. W. C. P. Kumari  \nB. Sc.  in Comp.  Sci. (UCSC, SL) \nM.Phil.  (UCSC, SL)  \nPhD (QUT, Australia) \n(On Leave)  \nMr. K. D. C. G. Kapugama \nBCS (Ruhuna, SL)  \nReading  for PhD  (Monash,  Australia)  \n(On Leave)  Information  Security,  Process  \nAnalytics, Mathematical \nModelling,  Event  log analysis \nData Structure & Algorithms \nEvolutionary Algorithms  \nData Mining, Text Mining  \nMr. P. D. T. Chathuranga  \nBCS (Ruhuna,  SL) Natural  Language  Processing,  \nSentiment  Analysis,  Text  Mining,  \nMachine  Learning  \nMr. L.L. Gihan  Chathuranga  \nB.Sc  (Sabaragamuwa,  S.L) Machine  Learning,  Artificial  Neural  \nNetworks,  Artificia l Intelligence,  \nDa

In [190]:
# Third (Document, score) pair.
docs[2]

(Document(page_content='Rule Modeling  \n• Network  Monitor ing and Acquiring  and Managing  Information  \n• E-commerce information systems development, Model -drivers design, Goal, Business  \n& service Modelling  \n• Computational Geometry, Computer Graphics programming, Design and Analysis of \nAlgorithms, Graph Theory  \n• Computati onal Systems Biology, Bioinformatics, Modelling and Simulation, Neural \nComputing, stochastic modelling  \n• Embedded  Systems,  reconfigurable  computing,  Bioinformatics  \n• Knowledge Representation, Ontology, Semantic Web, Ontology Engineering, Mobile \nApplications  4  1.2 Head of the Department  \nDr. W.A.  Indika  \nB.Sc.  (Kelaniya,  S.L.),  M.Sc.  (Kelaniya,  S.L.),  PhD  (UCSC,  S.L.)  \n \n1.3 Members  of Academic  Staff  \n \n \nDesignation  Name  Specialization  \nSenior  \nLecturer  Mr. S. A. S. Lorensuhewa  \nB.Sc.  (Colombo, SL) \nM.Sc.  (Zhejiang,  China)  Comput er Applications,  Text  Mining  and \nText Classification Data Mining, \

In [191]:
# Fourth (Document, score) pair.
docs[3]

(Document(page_content='Business Process Modeling and \nOntology Business Rule Modeling with  \nApplication  to Problems  in Healthcare  Domain  \nDr. S. M. Vidanagamachchi  \nB.Sc.  in Comp.  Sci. (UCSC,  SL) \nPh.D. in Comp.  Eng.  (P’deniya, S.L.)  Embedded  Systems,  \nReconfigurable  Computing,  \nMachine  Learning,  Bioinformatics  \nDr. P.N.  Hameed  \nB.Sc.  (Hons)  in Comp.  Sci. \n(P’deniya, S.L.)  \nPh.D. (Melbourne,  Australia)  Bioinformatics,  \nData mining and Machine Learninig, \nBiomedical Informatics,  \nImage  processing,  Computer  Vision  \nDr. M. K. S. Madushika  \nB.Sc.  in Engineering \n(Peradeniya, S.L.)  \nPh.D.  (QUT,  Australia)  Artificial  Intelligence,  Deep  \nLearning, Computer Vision, \nNeural Networks,  \nImage  Processing  \nMr. K.R. Wijeweera  \nB.Sc.  (P’deniya,  SL) \nM.Phil.  (P’deniya,  SL) Computational  Geometry  \nLecturer  Ms. M. A. L. Kalyani  \nB.Sc.  (Colombo,  SL) \nPh.L.  (Uppsala,  Sweden)  Static  and dynamic  Load  \nBalancing  algor

## Similarity Check by Vector


In [192]:
# Embed the query explicitly, then search the store with that vector instead
# of the raw query text.
embedding_vector = embedding.embed_query(query)
# Rebinds `docs` again; the elements expose .page_content directly, i.e. they
# are documents rather than (Document, score) tuples.
docs = db.similarity_search_by_vector(embedding_vector)
print(docs[0].page_content)

for a career in Computer Science and Information Technology, which is one of the major 
driving forces of the economic development of Sri Lanka . This degree programme will cover 
all aspects of Computer Science including modern computer languages and systems.  
 
1.1 Research  Areas  
• Text Mining and Text Classification Data Mining, Rule Extraction and Knowledge 
Representation Applications of Machine Learni ng Techniques  
• Parallel computing:  developing dynamic load balancing algorithms on homogeneous 
and heterogeneous clusters of workstations  
• Intelligent  Information  Retrieval:  Pattern  Recognition,  Fuzzy  clustering,  Data  Min- 
ing/ Web Mining, Conceptual Indexin g and Similarity Search in text data, Context  
Based Clustering  
• Conceptual  Modelling,  Process  Modelling  and Process  Patterns,  Formal  Specifica - 
tion of Processes, e -Commerce Standardization, Service Oriented Computing, Business 
Rule Modeling  
• Network  Monitor ing and Acquiring  and Managin

In [193]:
# Keep a handle on the first result of the by-vector search.
# NOTE(review): not referenced by any later cell shown here.
most_similar_doc = docs[0]


## Generate the Answer

In [194]:
# Import from langchain_community, matching the FAISS import used earlier in
# this notebook; the `langchain.llms` path is a deprecated re-export.
from langchain_community.llms import OpenAI
# Completion model used to generate the answer, authenticated with the key
# read from the environment earlier.
llm = OpenAI(api_key=api_key)

In [195]:
from langchain.chains.question_answering import load_qa_chain
# Question-answering chain on top of the LLM.
# NOTE(review): per the LangChain docs the 'stuff' chain type places all
# supplied documents into a single prompt; confirm the retrieved chunks fit
# the model's context window.
chain=load_qa_chain(llm=llm,chain_type='stuff')

In [196]:
# Answer the query from the retrieved chunks.
# `docs` must hold the plain documents from the by-vector search above; if the
# similarity_search_with_score cell was the last one to run, `docs` holds
# (Document, score) tuples instead.
response= chain.run(input_documents=docs,question=query)

In [197]:
# Echo the raw answer string (shown as a repr, with quotes).
response

" I'm sorry, I don't know the answer to that question. This context is about a Computer Science and Information Technology degree program and its faculty members. It does not mention anything about the prime minister of Sri Lanka. "

In [198]:
# Print the answer as plain text.
print(response)

 I'm sorry, I don't know the answer to that question. This context is about a Computer Science and Information Technology degree program and its faculty members. It does not mention anything about the prime minister of Sri Lanka. 
