In [1]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm


In [2]:
from langchain.llms import HuggingFaceHub

In [3]:

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [4]:
model.save("D:\HR-bot\HR-BOT\models\embedding")

In [4]:
PINECONE_API_KEY = "765789ec-4ce4-4858-be09-31128f07c5f5"
PINECONE_API_ENV = "gcp-starter"

In [13]:
#Extract data from the PDF
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="WFO-FAQs_June'23.pdf",
                    loader_cls=PyPDFLoader)
    
    documents = loader.load()

    return documents

In [14]:
extracted_data = load_pdf(r"D:\kanini-hackathon\hr-bot\HR-BOT\data")

In [15]:
len(extracted_data)

2

In [9]:
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks

In [10]:
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 72


In [11]:
text_chunks[0]

Document(page_content='Table of Contents  \n1. Office Decorum  \n1.1 Office Timing  \n1.2 Leave Policy                                             \n1.3 Dress Code  \n1.4 Cleanliness  \n1.5 Work from home policy  \n1.6 ID Card / Access card  \n1.7 Unauthorized Access  \n2. Disaster recovery P olicy    \n3. Clean Desk & Clear screen Policy    \n4. Open Door Policy                                                \n5. Confidentiality and security policy                    \n5.1 Company and Personnel Information', metadata={'source': 'D:\\kanini-hackathon\\hr-bot\\HR-BOT\\data\\document.pdf', 'page': 0})

In [12]:
#download embedding model
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="D:\HR-bot\HR-BOT\models\embedding")
    return embeddings

In [13]:
embeddings = download_hugging_face_embeddings()

In [15]:
#Initializing the Pinecone
pinecone.init(api_key=PINECONE_API_KEY,
              environment=PINECONE_API_ENV)

index_name="hr-bot"

In [31]:


#Creating Embeddings for Each of The Text Chunks & storing
docsearch=Pinecone.from_texts([t.page_content for t in text_chunks], embeddings,metadatas=[{'pdf_name': p.metadata["source"].split("\\")[-1].lower()} for p in text_chunks], index_name=index_name)

In [22]:
#If we already have an index we can load it like this
docsearch=Pinecone.from_existing_index(index_name, embeddings)

query = "tell me about dress code ?"

docs=docsearch.similarity_search(query, k=10)

print("Result", docs)

Result [Document(page_content='• Earned leave will not be compensated for notice period.  \n \nH. Trainees  \n• One leave per month can be availed during the training period.  \n \nI. Late In/ Early out with Prior Permission  \nEmployee s can avail monthly two permissions ie. Two hours each.  Exceeding two hours \nwould be considered as half day leave/month.  \n \n1.3  Dress Code  \nAll employees are expected to be properly groom ed and wear proper formal dress.   \n• Mon -Thur – Formal wear / KANINI T-shirts  \n• Friday – Casuals (No shots(3/4ths), No torn jeans, No skirts, No sleeveless) + \nKANINI T -shirts  \n1.4 Cleanliness  \nEmployees are expected to keep their surroundings neat, clean, and tidy. Use \nprovided dustbins to dispose of any wastes. Each employee must take responsibility for \nthe workplace, dining hall, and of the office in general.  \n \n1.5 Work from home policy  \nA. Policy Statement  \nWork at home was brought in for the situation where employee has to complete

In [17]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [23]:
PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs={"prompt": PROMPT}

In [24]:
# llm=CTransformers(model=r"D:\HR-bot\HR-BOT\models\llm\llama-2-7b-chat.ggmlv3.q4_0.bin",
#                   model_type="llama",
#                   config={'max_new_tokens':512,
#                           'temperature':0.8})

In [28]:
from langchain.llms import ctransformers

In [3]:
from ctransformers import AutoModelForCausalLM

In [4]:
llm = AutoModelForCausalLM.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.1-GGUF", model_file="mistral-7b-instruct-v0.1.Q4_K_M.gguf", model_type="mistral", gpu_layers=50)

Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 1002.46it/s]


FileNotFoundError: Could not find module 'C:\Users\Administrator\anaconda3\envs\hrbots\Lib\site-packages\ctransformers\lib\cuda\ctransformers.dll' (or one of its dependencies). Try using the full path with constructor syntax.

In [5]:
llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512},huggingfacehub_api_token="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")

NameError: name 'HuggingFaceHub' is not defined

In [25]:
qa=RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(search_kwargs={'k': 4}),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)

In [26]:

user_input="tell me about dress code ?"
result=qa({"query": user_input})
print("Response : ", result["result"])

Response :  All employees are expected to be properly groom ed and wear proper formal dress.


In [27]:
text_chunks[0].metadata["source"].split("\\")[-1].lower()

'document.pdf'

In [27]:
# Using list comprehension to create a list of dictionaries
list_of_dicts = [
    {'pdf_name': t.metadata["source"].split("\\")[-1].lower()}
    for t in text_chunks
]

# Printing the list of dictionaries
for person in list_of_dicts:
    print(person)


{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': 'document.pdf'}
{'pdf_name': '