## API and LLM

In [1]:
from Keys import my_google_api_key
import os
os.environ["GOOGLE_API_KEY"] = my_google_api_key

In [3]:
from langchain_google_genai import ChatGoogleGenerativeAI

In [4]:
import langchain

In [5]:
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.5,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # other params...
)

## Text Loaders

In [6]:
from langchain_community.document_loaders import PyPDFLoader 

In [7]:
from langchain_community.document_loaders.word_document import UnstructuredWordDocumentLoader

In [8]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [9]:
CL_pdf=PyPDFLoader("Ankon Bhowmick Cover Letter.pdf").load()

In [10]:
CL_pdf

[Document(metadata={'source': 'Ankon Bhowmick Cover Letter.pdf', 'page': 0}, page_content='Dear Hiring Manager, \nI am writing to express my interest in the Data Scientist at Gainwell Company. I hold an MSc in \nData Science and Analytics from the University of Leeds and a BTech in Mathematics and \nComputing from DTU. I bring strong technical skills in data analysis, machine learning, and NLP , \nsupported by hands-on experience in Python, R, and SQL. \nDuring my internship at PwC, I conducted a comprehensive analysis of over 50,000 SAP incident \nrecords, identifying patterns and deploying a decision tree model that achieved 97.7% \naccuracy. I modularized the code for team-wide use, reﬂecting my attention to both model \nperformance and collaboration. \nMy academic background includes diverse machine learning projects, such as building a BERT-\nbased NLP classiﬁer on 32,000+ war-related news articles, creating CNN-RNN pipelines for \nimage captioning, and training Random Forest and 

In [11]:
loader = UnstructuredWordDocumentLoader("Ankon Bhowmick Resume.docx") 
doc = loader.load()

In [12]:
doc

[Document(metadata={'source': 'Ankon Bhowmick Resume.docx'}, page_content="Ankon Bhowmick\n\nankonbh@gmail.com \t\t\t\t\t\t\t\t\t \n\n+447769472138\n\n+919667055306\n\nEducation       \n\nMSc Data Science and Analytics, University of Leeds, (2023 - 2024), Upper Second Class Honours\n\nB.Tech Mathematics and Computing, Delhi Technological University, (2019 - 2023), CGPA – 8.33 (83.3%)\n\nInternships\n\nPwC, Gurgaon, India, (June 2022 - August 2022)\n\nIntern/Trainee, Advisory- Technology Consulting\n\nCompleted company training programs to gain a basic understanding in SAP ERP technology.\n\nCollaborated with manager to perform an in-depth analysis of a substantial data dump containing over 50,000 SAP incident reports from a client company.\n\nImplemented advanced data analysis techniques and Python programming to explore the data which contained 30 features, identifying both positive trends and potential areas of concern.\n\nLeveraged newfound expertise to construct a decision tree mod

## Text Splitting

In [13]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [14]:
r_splitter = RecursiveCharacterTextSplitter(
    separators = ["\n\n", "\n", " "],  
    chunk_size = 1000, 
    chunk_overlap  = 100,
    length_function = len)

In [15]:
chunks_tx=r_splitter.split_text(doc[0].page_content)

In [18]:
len(chunks_tx)

12

In [19]:
from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings

In [20]:
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

In [22]:
import numpy as np

In [23]:
my_emb=np.asarray(embeddings.embed_documents(texts=chunks_tx))

In [24]:
my_emb.shape

(12, 768)

In [25]:
import faiss                   # make faiss available
index = faiss.IndexFlatL2(768)   # build the index
print(index.is_trained)
index.add(my_emb)                 # add vectors to the index
print(index.ntotal)

True
12


In [26]:
my_query="What did he do at PwC?"
q_emb=np.asarray(embeddings.embed_documents(texts=[my_query]))

In [27]:
D, I = index.search(q_emb,3)

In [28]:
D

array([[0.49475658, 0.5495293 , 0.5527823 ]], dtype=float32)

In [29]:
I

array([[ 1,  7, 11]], dtype=int64)

In [30]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.vectorstores import FAISS

## Using Langchain FAISS

In [38]:
chunks_doc=r_splitter.split_documents(doc)+r_splitter.split_documents(CL_pdf)

In [40]:
my_vecInd=FAISS.from_documents(chunks_doc, embeddings)

In [60]:
my_vecInd

<langchain_community.vectorstores.faiss.FAISS at 0x1b376fe36d0>

In [41]:
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=my_vecInd.as_retriever())

In [61]:
my_query="What did he do at PwC?"

In [62]:
langchain.debug=True

answer=chain({"question": my_query}, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "What did he do at PwC?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "Ankon Bhowmick\n\nankonbh@gmail.com \t\t\t\t\t\t\t\t\t \n\n+447769472138\n\n+919667055306\n\nEducation       \n\nMSc Data Science and Analytics, University of Leeds, (2023 - 2024), Upper Second Class Honours\n\nB.Tech Mathematics and Computing, Delhi Technological University, (2019 - 2023), CGPA – 8.33 (83.3%)\n\nInternships\n\nPwC, Gurgaon, India, (June 2022 - August 2022)\n\nIntern/Trainee, Advisory- Technology Consulting\n\nCompleted company training programs to gain a basic understanding in SAP E

In [63]:
from IPython.display import Markdown, display

print(answer['answer'])

During his internship at PwC, he conducted a comprehensive analysis of over 50,000 SAP incident records, identifying patterns and deploying a decision tree model that achieved 97.7% accuracy. He modularized the code for team-wide use. He also completed company training programs to gain a basic understanding in SAP ERP technology and implemented advanced data analysis techniques and Python programming to explore the data which contained 30 features, identifying both positive trends and potential areas of concern.



In [66]:
dir_list = os.listdir("C://Users//ASUS//Documents//Python Scripts//LnE Langchain")

In [67]:
dir_list

['.ipynb_checkpoints',
 'Ankon Bhowmick Cover Letter.pdf',
 'Ankon Bhowmick Resume.docx',
 'Learning.ipynb',
 'Project.ipynb']

In [68]:
for x in dir_list:
    if x.endswith(".pdf"):
        print(x)

Ankon Bhowmick Cover Letter.pdf
