## Import Libraries

In [231]:
import os
import sys


## Read The PDF

In [232]:

data='data.pdf'


In [233]:
data

'data.pdf'

## Extract PDF data

In [234]:
import PyPDF2
from docx import Document

In [235]:
def extract_text_pdf(pdf):
    text = ""
    reader = PyPDF2.PdfReader(pdf)
    for page in reader.pages:
        text += page.extract_text() or ""
    return text


In [236]:
def extract_text_docx(docx):
    doc = Document(docx)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text

In [237]:
def extract(doc):
    _, file_extension = doc.split('.')
    print(file_extension)
    if file_extension == 'pdf':
        return extract_text_pdf(doc)
    elif file_extension == 'docx':
        return extract_text_docx(doc)
    else:
        raise ValueError("Unsupported file format")

In [238]:
text_data=extract(data)

pdf


In [239]:
#print(text_data)   un comment and run this to view pdf in text fromat

## Divide text data into Chunks

In [240]:
# Code here
from langchain.text_splitter import CharacterTextSplitter

In [241]:
text_splitter=CharacterTextSplitter(
    separator="\n",
    chunk_size=1300,
    chunk_overlap=300,
    length_function=len,
)

''' important, GPT-3.5 has a maximum token limit of 4096 tokens per input sequence. This means that any 
 input longer than 4096 tokens would need to be split into multiple segments for processing. '''

' important, GPT-3.5 has a maximum token limit of 4096 tokens per input sequence. This means that any \n input longer than 4096 tokens would need to be split into multiple segments for processing. '

In [242]:
chunks=text_splitter.split_text(text_data)

In [243]:
len(chunks)

61

In [244]:
print(chunks[60])

(a) A minimum  GPA  of 3.30  for Honours  Degree  Course  Units,  
(b) A minimum  GPA  of 3.30  for all course  units,  and 
(c) Grades  of A- or better  for Honours  Degree  Course  Units  aggregating  to a mini - 
mum of 12 credits  23  (iii) Second  Class  (Lower  Division)  
(a) A minimum  GPA  of 3.00  for Honours  Degree  Course  Units,  and 
(b) A minimum  GPA  of 3.00  for all course  units


## Text Embedding

In [245]:
import os
from langchain_community.vectorstores import FAISS
from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv())
api_key=os.environ.get("OPEN_API_KEY")
embedding=OpenAIEmbeddings(openai_api_key=api_key)

In [246]:
db=FAISS.from_texts(texts=chunks,embedding=embedding)

In [247]:
if os.path.exists("vectorstore/index.pkl"):
    # Load the existing vector store
    db = FAISS.load_local("vectorstore", embedding,allow_dangerous_deserialization=True)
else:
    # Create a new vector store from the text chunks
    db = FAISS.from_texts(texts=chunks, embedding=embedding)
    # Save the newly created vector store locally
    db.save_local("vectorstore")

## Similarity Check

In [409]:
#query = "what are the Minimum Requirement for the Completion of the Bachelor of Computer Science Honours Degree ?"
query = "who are the lecturers in computer science department with their designation and their qualifications"
docs = db.similarity_search(query)

In [410]:
len(docs)

4

In [411]:
print(docs[0])

page_content='Lecturer  
 
 
 
 
 
 
 
Probationary 
Lecturer  Dr. W. A. Mohotti  
B.Sc.  IT (Mo ratuwa, SL) 
M.Sc.  in IT (Moratuwa,  SL) 
Ph.D.  (QUT,  Australia)  Data  Mining  and Machine  Learning,  
Text Clustering, Outlier Detection, and 
Cluster Evolution, Social Media Analytics  
Dr. D. W. C. P. Kumari  
B. Sc.  in Comp.  Sci. (UCSC, SL) 
M.Phil.  (UCSC, SL)  
PhD (QUT, Australia) 
(On Leave)  
Mr. K. D. C. G. Kapugama 
BCS (Ruhuna, SL)  
Reading  for PhD  (Monash,  Australia)  
(On Leave)  Information  Security,  Process  
Analytics, Mathematical 
Modelling,  Event  log analysis 
Data Structure & Algorithms 
Evolutionary Algorithms  
Data Mining, Text Mining  
Mr. P. D. T. Chathuranga  
BCS (Ruhuna,  SL) Natural  Language  Processing,  
Sentiment  Analysis,  Text  Mining,  
Machine  Learning  
Mr. L.L. Gihan  Chathuranga  
B.Sc  (Sabaragamuwa,  S.L) Machine  Learning,  Artificial  Neural  
Networks,  Artificia l Intelligence,  
Data  mining  
Ms. H.D.  Supuni  Shashikala  
B.

In [412]:
docs = db.similarity_search_with_score(query)

In [413]:
docs[0]

(Document(page_content='Lecturer  \n \n \n \n \n \n \n \nProbationary \nLecturer  Dr. W. A. Mohotti  \nB.Sc.  IT (Mo ratuwa, SL) \nM.Sc.  in IT (Moratuwa,  SL) \nPh.D.  (QUT,  Australia)  Data  Mining  and Machine  Learning,  \nText Clustering, Outlier Detection, and \nCluster Evolution, Social Media Analytics  \nDr. D. W. C. P. Kumari  \nB. Sc.  in Comp.  Sci. (UCSC, SL) \nM.Phil.  (UCSC, SL)  \nPhD (QUT, Australia) \n(On Leave)  \nMr. K. D. C. G. Kapugama \nBCS (Ruhuna, SL)  \nReading  for PhD  (Monash,  Australia)  \n(On Leave)  Information  Security,  Process  \nAnalytics, Mathematical \nModelling,  Event  log analysis \nData Structure & Algorithms \nEvolutionary Algorithms  \nData Mining, Text Mining  \nMr. P. D. T. Chathuranga  \nBCS (Ruhuna,  SL) Natural  Language  Processing,  \nSentiment  Analysis,  Text  Mining,  \nMachine  Learning  \nMr. L.L. Gihan  Chathuranga  \nB.Sc  (Sabaragamuwa,  S.L) Machine  Learning,  Artificial  Neural  \nNetworks,  Artificia l Intelligence,  \nDa

In [414]:
docs[1]

(Document(page_content='B.Sc.  (UWU,  S.L.),  \nReading  for M.Sc.  (Peradeniya,  S.L.)  Machine  Learning,  Feature  Extraction,  \nDeep  Learning  \n1.4 Members  of the Academic  Support  Staff  \n \nAcademic  Supportive  Staff  Members  \nDesignation  Name  \nProgrammer  Cum  Systems  Analyst  Mr. H. G. U. Harankahadeniya  \nMr. B. H. Saranapala  \nMr. A. P. Luwishewa  \nInstructor  in Computer  Technology  Ms. P. B. N. K. De Silva  \nMr. U. V. Malawara  Arachchi \nMs. W.P. Priyanthi  \nMr. R. Wickramaratne \nMr. C.L. Wimalaratne \nMs. G.K. Mabula  \nMr. L.W.  Wellakkage \nMr. G.M.T. Ranjana \nMrs.  W. K. Shajith  \nMiss.  H. G. S. Priyangani  \n \n \n1.5 Course  Units  in Computer  Science  for B.Sc.  (General)  Degree  \nLevel  I - Semester  I \nCOM1111:   Basic  Concepts  of Information  Technology  (15 lecture  hrs.)    Overv iew \nof Computer System, Function of Computer System, Input/Output Peripherals, Computer \nStorages, Systems Software, Data type and Data representation, 

In [415]:
docs[2]

(Document(page_content='Rule Modeling  \n• Network  Monitor ing and Acquiring  and Managing  Information  \n• E-commerce information systems development, Model -drivers design, Goal, Business  \n& service Modelling  \n• Computational Geometry, Computer Graphics programming, Design and Analysis of \nAlgorithms, Graph Theory  \n• Computati onal Systems Biology, Bioinformatics, Modelling and Simulation, Neural \nComputing, stochastic modelling  \n• Embedded  Systems,  reconfigurable  computing,  Bioinformatics  \n• Knowledge Representation, Ontology, Semantic Web, Ontology Engineering, Mobile \nApplications  4  1.2 Head of the Department  \nDr. W.A.  Indika  \nB.Sc.  (Kelaniya,  S.L.),  M.Sc.  (Kelaniya,  S.L.),  PhD  (UCSC,  S.L.)  \n \n1.3 Members  of Academic  Staff  \n \n \nDesignation  Name  Specialization  \nSenior  \nLecturer  Mr. S. A. S. Lorensuhewa  \nB.Sc.  (Colombo, SL) \nM.Sc.  (Zhejiang,  China)  Comput er Applications,  Text  Mining  and \nText Classification Data Mining, \

In [416]:
docs[3]

(Document(page_content='the Bachelor of Computer Science (Honours) Degree courses.  \nThe Honours degree consists of examinations at the end of each semester or at the end  \nof course unit for research project.  \n \n1.9.1  Attendance  \nTo be eligible to sit for an examination of a theory or practical course unit, there should \nminimum attendance of 80%.  If a student fails to meet this requirement for a particular \ncourse  unit, he  or she will be considered  to have  failed  in that  course  unit (see  section  14.3.1 \nfor further details).  \n \n1.9.2  Examination Criteria \nEvaluation Methods  \n• Theory  examination  (written/oral)  \n• Practical  examination  \n• Continuous  assessment  \n• Assignment  \n• Report  \n• Presentation  \n \nGrading  Syst em \nAwarding of grades for course units will be done according to the grading system given in  \nthe Table in section 14.2  \n \nPass  in a Course  Unit  \n• A candidate  who obtains grade  C or  better  for a Course  Unit  wil

## Similarity Check by vector


In [417]:
embedding_vector = embedding.embed_query(query)
docs = db.similarity_search_by_vector(embedding_vector)
print(docs[0].page_content)

Lecturer  
 
 
 
 
 
 
 
Probationary 
Lecturer  Dr. W. A. Mohotti  
B.Sc.  IT (Mo ratuwa, SL) 
M.Sc.  in IT (Moratuwa,  SL) 
Ph.D.  (QUT,  Australia)  Data  Mining  and Machine  Learning,  
Text Clustering, Outlier Detection, and 
Cluster Evolution, Social Media Analytics  
Dr. D. W. C. P. Kumari  
B. Sc.  in Comp.  Sci. (UCSC, SL) 
M.Phil.  (UCSC, SL)  
PhD (QUT, Australia) 
(On Leave)  
Mr. K. D. C. G. Kapugama 
BCS (Ruhuna, SL)  
Reading  for PhD  (Monash,  Australia)  
(On Leave)  Information  Security,  Process  
Analytics, Mathematical 
Modelling,  Event  log analysis 
Data Structure & Algorithms 
Evolutionary Algorithms  
Data Mining, Text Mining  
Mr. P. D. T. Chathuranga  
BCS (Ruhuna,  SL) Natural  Language  Processing,  
Sentiment  Analysis,  Text  Mining,  
Machine  Learning  
Mr. L.L. Gihan  Chathuranga  
B.Sc  (Sabaragamuwa,  S.L) Machine  Learning,  Artificial  Neural  
Networks,  Artificia l Intelligence,  
Data  mining  
Ms. H.D.  Supuni  Shashikala  
B.Sc.  (UWU,  S.

In [418]:
print(docs[1])

page_content='B.Sc.  (UWU,  S.L.),  
Reading  for M.Sc.  (Peradeniya,  S.L.)  Machine  Learning,  Feature  Extraction,  
Deep  Learning  
1.4 Members  of the Academic  Support  Staff  
 
Academic  Supportive  Staff  Members  
Designation  Name  
Programmer  Cum  Systems  Analyst  Mr. H. G. U. Harankahadeniya  
Mr. B. H. Saranapala  
Mr. A. P. Luwishewa  
Instructor  in Computer  Technology  Ms. P. B. N. K. De Silva  
Mr. U. V. Malawara  Arachchi 
Ms. W.P. Priyanthi  
Mr. R. Wickramaratne 
Mr. C.L. Wimalaratne 
Ms. G.K. Mabula  
Mr. L.W.  Wellakkage 
Mr. G.M.T. Ranjana 
Mrs.  W. K. Shajith  
Miss.  H. G. S. Priyangani  
 
 
1.5 Course  Units  in Computer  Science  for B.Sc.  (General)  Degree  
Level  I - Semester  I 
COM1111:   Basic  Concepts  of Information  Technology  (15 lecture  hrs.)    Overv iew 
of Computer System, Function of Computer System, Input/Output Peripherals, Computer 
Storages, Systems Software, Data type and Data representation, Computer Arithmetic, Ap - 
plication

In [419]:
print(docs[2])

page_content='Rule Modeling  
• Network  Monitor ing and Acquiring  and Managing  Information  
• E-commerce information systems development, Model -drivers design, Goal, Business  
& service Modelling  
• Computational Geometry, Computer Graphics programming, Design and Analysis of 
Algorithms, Graph Theory  
• Computati onal Systems Biology, Bioinformatics, Modelling and Simulation, Neural 
Computing, stochastic modelling  
• Embedded  Systems,  reconfigurable  computing,  Bioinformatics  
• Knowledge Representation, Ontology, Semantic Web, Ontology Engineering, Mobile 
Applications  4  1.2 Head of the Department  
Dr. W.A.  Indika  
B.Sc.  (Kelaniya,  S.L.),  M.Sc.  (Kelaniya,  S.L.),  PhD  (UCSC,  S.L.)  
 
1.3 Members  of Academic  Staff  
 
 
Designation  Name  Specialization  
Senior  
Lecturer  Mr. S. A. S. Lorensuhewa  
B.Sc.  (Colombo, SL) 
M.Sc.  (Zhejiang,  China)  Comput er Applications,  Text  Mining  and 
Text Classification Data Mining, 
Rule Extraction and  
Knowledge 

In [420]:
print(docs[3])

page_content='the Bachelor of Computer Science (Honours) Degree courses.  
The Honours degree consists of examinations at the end of each semester or at the end  
of course unit for research project.  
 
1.9.1  Attendance  
To be eligible to sit for an examination of a theory or practical course unit, there should 
minimum attendance of 80%.  If a student fails to meet this requirement for a particular 
course  unit, he  or she will be considered  to have  failed  in that  course  unit (see  section  14.3.1 
for further details).  
 
1.9.2  Examination Criteria 
Evaluation Methods  
• Theory  examination  (written/oral)  
• Practical  examination  
• Continuous  assessment  
• Assignment  
• Report  
• Presentation  
 
Grading  Syst em 
Awarding of grades for course units will be done according to the grading system given in  
the Table in section 14.2  
 
Pass  in a Course  Unit  
• A candidate  who obtains grade  C or  better  for a Course  Unit  will be considered  to 
have passed i

In [421]:
most_similar_doc = docs[0]


## generate the answer

In [422]:
from langchain.llms import OpenAI
llm = OpenAI(api_key=api_key)

In [423]:
from langchain.chains.question_answering import load_qa_chain
chain=load_qa_chain(llm=llm,chain_type='stuff')

In [424]:
response= chain.run(input_documents=docs,question=query)

In [425]:
response

' \n1. Dr. W. A. Mohotti - Probationary Lecturer \nB.Sc. IT (Moratuwa, SL) \nM.Sc. in IT (Moratuwa, SL) \nPh.D. (QUT, Australia) \nSpecializations: Data Mining and Machine Learning, Text Clustering, Outlier Detection, Cluster Evolution, Social Media Analytics \n\n2. Dr. D. W. C. P. Kumari - Lecturer \nB.Sc. in Comp. Sci. (UCSC, SL) \nM.Phil. (UCSC, SL) \nPhD (QUT, Australia) \nOn Leave \nSpecializations: Information Security, Process Analytics, Mathematical Modelling, Event log analysis, Data Structure & Algorithms, Evolutionary Algorithms, Data Mining, Text Mining \n\n3. Mr. K. D. C. G. Kapugama - Lecturer \nBCS (Ruhuna, SL) \nReading for PhD (Monash, Australia) \nOn Leave \nSpecializations: Information Security, Process Analytics, Mathematical Modelling, Event log analysis, Data Structure & Algorithms, Evolutionary Algorithms, Data Mining, Text Mining \n\n4. Mr. P. D. T. Chathuranga - Lecturer'

In [426]:
print(response)

 
1. Dr. W. A. Mohotti - Probationary Lecturer 
B.Sc. IT (Moratuwa, SL) 
M.Sc. in IT (Moratuwa, SL) 
Ph.D. (QUT, Australia) 
Specializations: Data Mining and Machine Learning, Text Clustering, Outlier Detection, Cluster Evolution, Social Media Analytics 

2. Dr. D. W. C. P. Kumari - Lecturer 
B.Sc. in Comp. Sci. (UCSC, SL) 
M.Phil. (UCSC, SL) 
PhD (QUT, Australia) 
On Leave 
Specializations: Information Security, Process Analytics, Mathematical Modelling, Event log analysis, Data Structure & Algorithms, Evolutionary Algorithms, Data Mining, Text Mining 

3. Mr. K. D. C. G. Kapugama - Lecturer 
BCS (Ruhuna, SL) 
Reading for PhD (Monash, Australia) 
On Leave 
Specializations: Information Security, Process Analytics, Mathematical Modelling, Event log analysis, Data Structure & Algorithms, Evolutionary Algorithms, Data Mining, Text Mining 

4. Mr. P. D. T. Chathuranga - Lecturer
