In [1]:
!pip install torch transformers langchain sentence_transformers faiss-gpu accelerate bitsandbytes pypdf

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.0.308-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m66.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl 

In [1]:
import transformers
import torch

from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import WebBaseLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

# loading data from pdf
pdf_loader = PyPDFLoader("/content/Someet_Singh_Resume_4.pdf") # upload your cv here
cv = pdf_loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
cv = text_splitter.split_documents(cv)

# web_links = [
#     "https://amikolajczyk.netlify.app/",
#     "https://scholar.google.com/citations?user=VFMjpTsAAAAJ&hl=pl",
#     "https://mostwiedzy.pl/pl/agnieszka-mikolajczyk-barela,834599-1/publications",
#     "https://mostwiedzy.pl/pl/agnieszka-mikolajczyk-barela,834599-1/scientific",
#     "https://mostwiedzy.pl/pl/agnieszka-mikolajczyk-barela,834599-1/education",
#     "https://mostwiedzy.pl/pl/project/wykrywanie-i-zmniejszanie-wplywu-tendencyjnosci-danych-za-pomoca-objasnialnej-sztucznej-inteligencji,759-1",
# ] # add your website links here
# web_loader = WebBaseLoader(web_links)
# web_docs = web_loader.load()
# web_docs = text_splitter.split_documents(web_docs)

docs = cv

# Creating embeddings and move them to FAISS
embedding_model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs={"device": "cuda"})
embeddings_retriever = FAISS.from_documents(docs, embeddings).as_retriever()

# Load TRURL
model_id = "Voicelab/trurl-2-7b-8bit"

# Load tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
)

# Load model
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    offload_folder = "."
).eval() # set to eval

# Set generation details
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task="text-generation",
    temperature=0.1,  # generation parameter resposible for output sampling
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.05,  # penalty for repeating tokens
    do_sample=True,
)
llm = HuggingFacePipeline(pipeline=generate_text, model_id=model_id)
chain = ConversationalRetrievalChain.from_llm(
        llm,
        embeddings_retriever,
        return_source_documents=True,
        max_tokens_limit=3500,
    )


prompt = "Hello!"
print(f"HUMAN: {prompt}")
result = chain({"question": prompt, "chat_history": []})
print(f'ANSWER:{result["answer"]}')

# Let's bias our bot a little so it will recommend me all the time :) :)
# We will add a bit of a context to the model by adding a "fake chat history"
chat_history = []
chat_history.append(("Is someet singh a good scientist?", # prompt
                     "Yes, she is a great scientist and engineer and I would definitely recommend her for any ML or DL role.")) # answer

prompt = ""
print("write 'quit' to quit the chat")
print("TRURL: Hi! I am Trurl. I am here to tell you about Agnieszka Mikołajczyk. Do you have any questions?")
while prompt != "quit":
    if prompt != "":
      result = chain({"question": prompt, "chat_history": chat_history})["answer"].lstrip()
      chat_history.append((prompt, result)) # adding messages to chat history
      print(f"TRURL: {result}") # printing the answer
    prompt = input("HUMAN: ") # getting the prompt from you

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


HUMAN: Hello!
ANSWER: Sure, I can help you. What would you like to know?
write 'quit' to quit the chat
TRURL: Hi! I am Trurl. I am here to tell you about Agnieszka Mikołajczyk. Do you have any questions?
HUMAN: tell me about someet singh
TRURL: Someet Singh is a highly skilled data scientist and engineer with a strong background in computer science and technology. He has a Masters of Technology in Data Science from Amity University and a Bachelors of Technology in Computer Science from Galgotia University. His accomplishments include winning first place in the Smart India Hackathon 2022 for his work on deep learning-based cyclone intensity estimation using INSAT-3D IR imagery, and publishing a research paper on computing using ML. Additionally, he has worked on building an OCR engine for Hitachi Vintara, implementing Kafka batch processing, and developing a web app for cyclone detection and estimation.


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-1-970c7599d634>", line 92, in <cell line: 87>
    prompt = input("HUMAN: ") # getting the prompt from you
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 851, in raw_input
    return self._input_request(str(prompt),
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 895, in _input_request
    raise KeyboardInterrupt("Interrupted by user") from None
KeyboardInterrupt: Interrupted by user

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2099, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

TypeError: ignored