In [1]:
import os
import chromadb
from langchain.vectorstores import FAISS
from langchain.text_splitter import SpacyTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

from langchain.chains import (
    StuffDocumentsChain, LLMChain, ConversationalRetrievalChain
)
from langchain_community.document_transformers import (
    LongContextReorder,
)

from langchain_openai import OpenAIEmbeddings

# from langchain_community.embeddings import OpenAIEmbeddings

from langchain_community.vectorstores import chroma
from langchain_community.vectorstores import Chroma
from langchain.document_transformers import (
    LongContextReorder,
)
from langchain_text_splitters import CharacterTextSplitter

from langchain.retrievers import MergerRetriever, ContextualCompressionRetriever


In [2]:
embeddings = OpenAIEmbeddings()
print("Embeddings loaded")

Embeddings loaded


# Data Presets using Chroma DB

In [7]:
# Loading Resume
resume = PyPDFLoader("Testing/resume/VishnuPrakash_Resume.pdf")
resume_load = resume.load()

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap=0)
resume_splitted = text_splitter.split_documents(resume_load)

print(resume_splitted)

[Document(page_content='VISHNU PRAKASH\nData Scientist\nvishnucheppanam@gmail.com — +91 80 780 43398\nlinkedin.com/in/vishnuprksh — github.com/vishnuprksh — vishnuprakash.online\nPROFILE\nData Science aspirant with hands-on experience in machine learning and deep learning, seeking a\nchallenging role as a Data Scientist/Machine Learning Engineer. Eager to leverage skills in Artificial\nIntelligence, Neural Networks, and Algorithms to contribute to the companies and personal growth.\nPROJECTS\nGemInsights Git Link , Automating Exploratory Data Analysis (EDA) and Insight generation with\nthe help of latest Gemini engine by Google\n•EDA generation with the help of AutoViz and Image analysis with Gemini-Pro-Vision.\n•Hallucination check with Trulense-Eval.\n•UI with Streamlit.\nTradePilot Git Link , Empowering stock market price prediction with sentiment analysis from News,\nTwitter and Reddit\n•Stock price history collected from Yahoo Finance and Text data scraped from Economic Times,\nRe

In [8]:
# loading job description
path = "Testing/resume/job_desc.txt"
with open(path, "r") as file:
    job_desc = file.read()

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap=0)
jd_splitted = text_splitter.split_text(job_desc)

print(jd_splitted[0]) 

As a Junior Machine Learning Engineer at our cutting-edge company, your role is crucial in developing the future of AI chatbots and Large Language Models (LLMs) like Llama2, with a special emphasis on integrating these innovations with WIX platforms. Your work will involve a deep dive into AWS services, Data ETL, GitHub for code collaboration, and the hands-on building of models, all while leveraging your strong quantitative and programming background. This role demands not only technical expertise but also a creative approach to problem-solving and code development. You will work in close partnership with the CTO, ensuring that your contributions leave a lasting impact on our AI-driven solutions. Additionally, your collaboration with front-end developers is key to weaving AI functionalities into WIX-based applications seamlessly, enhancing user experiences and pushing the boundaries of what our digital platforms can achieve.


## Create Vector Storage

In [15]:
# vectorDB for resume
chroma_resume = Chroma.from_documents(
        resume_splitted,embeddings,
        collection_metadata={"hnsw:space": "cosine"},persist_directory="vector_storage/resume_store" # l2 is the default
    )
# vectorDB for JD
chroma_jd = Chroma.from_texts(
    jd_splitted,embeddings,collection_metadata = {"hnsw:space": "cosine"},
    persist_directory="vector_storage/jd_store" 
)

In [16]:
chroma_resume

<langchain_community.vectorstores.chroma.Chroma at 0x14b6ce410>

## Load Vector stores


In [17]:
load_chroma_resume = Chroma(persist_directory="vector_storage/resume_store",embedding_function=embeddings)
load_chroma_jd = Chroma(persist_directory="vector_storage/jd_store",embedding_function=embeddings)

## Initialize Merge Retriever and Perform semantic search

In [18]:
load_chroma_resume

<langchain_community.vectorstores.chroma.Chroma at 0x14b7a9710>

In [19]:
retriever_resume = load_chroma_resume.as_retriever(search_type = "similarity", search_kwargs = {"k":1})

retriever_jd = load_chroma_jd.as_retriever(search_type = "similarity", search_kwargs = {"k":1})


In [29]:
print(retriever_resume)

tags=['Chroma', 'OpenAIEmbeddings'] vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x14b7a9710> search_kwargs={'k': 1}


In [20]:
merged = MergerRetriever(retrievers=[retriever_resume, retriever_jd])


In [21]:
for chunks in merged.get_relevant_documents("Machine Learning"):
    print(chunks.page_content)

VISHNU PRAKASH
Data Scientist
vishnucheppanam@gmail.com — +91 80 780 43398
linkedin.com/in/vishnuprksh — github.com/vishnuprksh — vishnuprakash.online
PROFILE
Data Science aspirant with hands-on experience in machine learning and deep learning, seeking a
challenging role as a Data Scientist/Machine Learning Engineer. Eager to leverage skills in Artificial
Intelligence, Neural Networks, and Algorithms to contribute to the companies and personal growth.
PROJECTS
GemInsights Git Link , Automating Exploratory Data Analysis (EDA) and Insight generation with
the help of latest Gemini engine by Google
•EDA generation with the help of AutoViz and Image analysis with Gemini-Pro-Vision.
•Hallucination check with Trulense-Eval.
•UI with Streamlit.
TradePilot Git Link , Empowering stock market price prediction with sentiment analysis from News,
Twitter and Reddit
•Stock price history collected from Yahoo Finance and Text data scraped from Economic Times,
Reddit, and Twitter.
•Sentiment analysis wi

In [22]:
retriever_resumes = chroma_resume.as_retriever(search_type = "similarity", search_kwargs = {"k":1})

retriever_jds = chroma_jd.as_retriever(search_type = "similarity", search_kwargs = {"k":1})

In [27]:
print(retriever_jds)

tags=['Chroma', 'OpenAIEmbeddings'] vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x14b1aac10> search_kwargs={'k': 1}


In [23]:
mergeds = MergerRetriever(retrievers=[retriever_resumes, retriever_jds])

In [28]:
print(mergeds)

retrievers=[VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x14b6ce410>, search_kwargs={'k': 1}), VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x14b1aac10>, search_kwargs={'k': 1})]


In [24]:
for chunks in mergeds.get_relevant_documents("Machine Learning"):
    print(chunks.page_content)

VISHNU PRAKASH
Data Scientist
vishnucheppanam@gmail.com — +91 80 780 43398
linkedin.com/in/vishnuprksh — github.com/vishnuprksh — vishnuprakash.online
PROFILE
Data Science aspirant with hands-on experience in machine learning and deep learning, seeking a
challenging role as a Data Scientist/Machine Learning Engineer. Eager to leverage skills in Artificial
Intelligence, Neural Networks, and Algorithms to contribute to the companies and personal growth.
PROJECTS
GemInsights Git Link , Automating Exploratory Data Analysis (EDA) and Insight generation with
the help of latest Gemini engine by Google
•EDA generation with the help of AutoViz and Image analysis with Gemini-Pro-Vision.
•Hallucination check with Trulense-Eval.
•UI with Streamlit.
TradePilot Git Link , Empowering stock market price prediction with sentiment analysis from News,
Twitter and Reddit
•Stock price history collected from Yahoo Finance and Text data scraped from Economic Times,
Reddit, and Twitter.
•Sentiment analysis wi

In [None]:
for chunks in merged.get_relevant_documents("Machine Learning"):
    print(chunks.page_content)

# Data Presets using FAISS Index

In [28]:
# Loading Resume
resume = PyPDFLoader("Testing/resume/VishnuPrakash_Resume.pdf")
resume_load = resume.load()

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap=0)
resume_splitted = text_splitter.split_documents(resume_load)

print(resume_splitted[0])

page_content='VISHNU PRAKASH\nData Scientist\nvishnucheppanam@gmail.com — +91 80 780 43398\nlinkedin.com/in/vishnuprksh — github.com/vishnuprksh — vishnuprakash.online\nPROFILE\nData Science aspirant with hands-on experience in machine learning and deep learning, seeking a\nchallenging role as a Data Scientist/Machine Learning Engineer. Eager to leverage skills in Artificial\nIntelligence, Neural Networks, and Algorithms to contribute to the companies and personal growth.\nPROJECTS\nGemInsights Git Link , Automating Exploratory Data Analysis (EDA) and Insight generation with\nthe help of latest Gemini engine by Google\n•EDA generation with the help of AutoViz and Image analysis with Gemini-Pro-Vision.\n•Hallucination check with Trulense-Eval.\n•UI with Streamlit.\nTradePilot Git Link , Empowering stock market price prediction with sentiment analysis from News,\nTwitter and Reddit\n•Stock price history collected from Yahoo Finance and Text data scraped from Economic Times,\nReddit, and 

In [26]:
path = "Testing/resume/job_desc.txt"
with open(path, "r") as file:
    job_desc = file.read()

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap=0)
jd_splitted = text_splitter.split_text(job_desc)

print(jd_splitted[0]) 

As a Junior Machine Learning Engineer at our cutting-edge company, your role is crucial in developing the future of AI chatbots and Large Language Models (LLMs) like Llama2, with a special emphasis on integrating these innovations with WIX platforms. Your work will involve a deep dive into AWS services, Data ETL, GitHub for code collaboration, and the hands-on building of models, all while leveraging your strong quantitative and programming background. This role demands not only technical expertise but also a creative approach to problem-solving and code development. You will work in close partnership with the CTO, ensuring that your contributions leave a lasting impact on our AI-driven solutions. Additionally, your collaboration with front-end developers is key to weaving AI functionalities into WIX-based applications seamlessly, enhancing user experiences and pushing the boundaries of what our digital platforms can achieve.


## Create vectore storage

In [32]:
# vectorDB for resume
chroma_resume = FAISS.from_documents(
        resume_splitted,embeddings,
        collection_metadata={"hnsw:space": "cosine"},persist_directory="vector_storage/resume_store" # l2 is the default
    )
# vectorDB for JD
chroma_jd = FAISS.from_texts(
    jd_splitted,embeddings,collection_metadata = {"hnsw:space": "cosine"},
    persist_directory="vector_storage/jd_store" 
)

TypeError: FAISS.__init__() got an unexpected keyword argument 'persist_directory'

In [24]:
# from langchain_openai import OpenAIEmbeddings

# from langchain_openai import ChatOpenAI

# from langchain_community.vectorstores import FAISS

# from langchain_text_splitters import CharacterTextSplitter

# from PyPDF2 import PdfReader

# def embeddings(text):
# 	text_splitter = SpacyTextSplitter()
# 	text_splitter = SpacyTextSplitter(pipeline="en_core_web_sm")
# 	texts = text_splitter.split_text(text)
# 	embeddings = OpenAIEmbeddings()
# 	docsearch = FAISS.from_texts(texts, embeddings)
# 	retriever = docsearch.similarity_search(text)
# 	return retriever
# def save_vector(resume):
#     """create embeddings for Resume """
#     with open(resume,'rb') as f:
#         pdf_reader = PdfReader(resume)
#         text = ""
#         for page in pdf_reader.pages:
#             text += page.extract_text()
#     # Split the document into chunks
#     text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
#         encoding="cl100k_base", chunk_size=100, chunk_overlap=0
#         )
#     texts = text_splitter.split_text(text)

#     embeddings = OpenAIEmbeddings()
#     docsearch = FAISS.from_texts(texts, embeddings)
#     retriever = docsearch.as_retriever(search_type='similarity search')
#     return retriever

In [25]:

# loader = PyPDFLoader("Testing/resume/VishnuPrakash_Resume.pdf")
# # pages = loader.load_and_split()
# val = save_vector(loader)
# print(val)
