# Anurag Velekat
## NLP Project
## Implementing a RAG-based chatbot that answers questions about chess

In [None]:
!pip install --upgrade --quiet pinecone-client pinecone-text pinecone-notebooks

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.6/67.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for wget (setup.py) ... [?25l[?25hdone


In [None]:
!pip install langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.7-py3-none-any.whl.metadata (2.9 kB)
Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain-community)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.23.1-py3-none-any.whl.metadata (7.5 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadat

In [None]:
!pip install langchain-huggingface

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Downloading langchain_huggingface-0.1.2-py3-none-any.whl (21 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.1.2


In [None]:
import os

# Prefer the PINECONE_API_KEY environment variable so the real key never has to
# be pasted into (and saved with) the notebook. The placeholder is only a
# fallback that keeps the previous behaviour when the variable is not set.
api_key = os.environ.get("PINECONE_API_KEY", "your-pinecone-api-key")

In [None]:
from langchain_community.retrievers import PineconeHybridSearchRetriever


In [None]:
import os
import time
from pinecone import Pinecone, ServerlessSpec
index_name = "hybrid-search-langchain-pinecone"

# Client for the Pinecone control plane, authenticated with `api_key`.
pc = Pinecone(api_key=api_key)

# Create the index only when it does not exist yet, so the cell can be re-run safely.
if index_name not in pc.list_indexes().names():
  pc.create_index(
      name = index_name,
      dimension = 384, # must equal the dense embedding size (all-MiniLM-L6-v2 produces 384-d vectors)
      metric = "dotproduct", # hybrid (sparse + dense) search requires the dotproduct metric
      spec = ServerlessSpec(cloud='aws', region="us-east-1"),
  )
  # Index creation is not instantaneous: wait until Pinecone reports the index
  # as ready before later cells try to write to it.
  while not pc.describe_index(index_name).status["ready"]:
    time.sleep(1)

In [None]:
# Open a handle to the Pinecone index named `index_name`; it is handed to the
# hybrid retriever later. The bare `index` on the last line displays its repr.
index = pc.Index(index_name)
index

<pinecone.data.index.Index at 0x7f099f2d6440>

In [None]:
import os
# Only fall back to the placeholder when no token is configured: assigning
# unconditionally would overwrite a real HF_TOKEN already present in the
# environment with a dummy value.
if 'HF_TOKEN' not in os.environ:
  os.environ['HF_TOKEN'] = "your-huggingface-api-token"

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
# Dense embedding model used by the hybrid retriever. Its output size has to
# match the `dimension` the Pinecone index was created with (384).
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
embeddings

HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [None]:
from pinecone_text.sparse import BM25Encoder

# Sparse (keyword) encoder for the hybrid search.
# NOTE(review): `default()` presumably loads pre-fitted parameters; the encoder
# is fitted on the project's own files in a later cell, so confirm whether the
# default parameters are needed at all.
bm25_encoder = BM25Encoder().default()
bm25_encoder

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x7f0887b4b370>

In [None]:
file_paths = ["/content/caro_kann.txt", "/content/hikaru.txt", "/content/kings_indian.txt", "/content/magnus.txt"]

# Read every source document fully into memory, one string per file.
files = []
for file_name in file_paths:
  # Decode explicitly as UTF-8: the texts contain non-ASCII characters and the
  # platform's default encoding is not guaranteed to be UTF-8.
  # The `with` block closes the file on exit, so no separate close() is needed.
  with open(file_name, 'r', encoding='utf-8') as f:
    files.append(f.read())


In [None]:
import nltk

# Download only the tokenizer and stop-word data instead of the complete NLTK
# collection ("all"), which is very large and slow to fetch.
# NOTE(review): this assumes the BM25 encoder only needs these resources;
# add further package names here if a LookupError names a missing one.
for resource in ("punkt", "punkt_tab", "stopwords"):
  nltk.download(resource, quiet=True)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

In [None]:
# Fit the BM25 sparse encoder on the chess documents loaded into `files`
bm25_encoder.fit(files)

# Persist the fitted parameters to a JSON file, then reload them into a fresh
# encoder so later runs could load "bm25_values.json" instead of refitting
bm25_encoder.dump("bm25_values.json")
bm25_encoder = BM25Encoder().load("bm25_values.json")

  0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Build the hybrid retriever from the dense embedding model, the fitted BM25
# sparse encoder and the Pinecone index handle; the bare name displays its repr.
retriever = PineconeHybridSearchRetriever(embeddings=embeddings, sparse_encoder=bm25_encoder, index=index)
retriever

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x7f088754d000>, index=<pinecone.data.index.Index object at 0x7f099f2d6440>)

In [None]:
# Add the documents to the retriever's index. Each entry of `files` is the full
# text of one file, so every file is indexed as a single document.
# NOTE(review): whole-file documents may exceed what the dense model can embed
# in one pass - consider splitting into smaller chunks; confirm.
retriever.add_texts(files)

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Smoke test: run one query and display every document the retriever returns.
retriever.invoke("When was Hikaru Nakamura born?")

[Document(metadata={'score': 0.677066565}, page_content='Hikaru Nakamura: The Speed Chess King and Modern Chess Icon\nHikaru Nakamura, born on December 9, 1987, in Hirakata, Japan, is one of the most influential and celebrated chess players of the 21st century. As a five-time U.S. Chess Champion, reigning Speed Chess Champion, and a pioneer in the online chess boom, Nakamura has left an indelible mark on the game. Known for his creative playing style, lightning-fast calculation, and engaging personality, he has successfully bridged the gap between traditional chess and modern digital platforms.\n\nThis essay delves into Nakamura’s career, achievements, playing style, and his role in popularizing chess for a global audience.\n\nEarly Life and Rise to Prominence\nHikaru Nakamura’s journey into chess began at the age of seven when he was introduced to the game by his stepfather, FIDE Master Sunil Weeramantry. His talent was evident from a young age, and he quickly rose through the ranks o

In [None]:
# Same query, but display only the first returned document.
retriever.invoke("When was Hikaru Nakamura born?")[0]

Document(metadata={'score': 0.677066565}, page_content='Hikaru Nakamura: The Speed Chess King and Modern Chess Icon\nHikaru Nakamura, born on December 9, 1987, in Hirakata, Japan, is one of the most influential and celebrated chess players of the 21st century. As a five-time U.S. Chess Champion, reigning Speed Chess Champion, and a pioneer in the online chess boom, Nakamura has left an indelible mark on the game. Known for his creative playing style, lightning-fast calculation, and engaging personality, he has successfully bridged the gap between traditional chess and modern digital platforms.\n\nThis essay delves into Nakamura’s career, achievements, playing style, and his role in popularizing chess for a global audience.\n\nEarly Life and Rise to Prominence\nHikaru Nakamura’s journey into chess began at the age of seven when he was introduced to the game by his stepfather, FIDE Master Sunil Weeramantry. His talent was evident from a young age, and he quickly rose through the ranks of

In [None]:
# Check that a question about a different player returns that player's document first.
retriever.invoke("When was Magnus Carlsen born?")[0]

Document(metadata={'score': 0.667161345}, page_content='Magnus Carlsen: The Reigning King of Chess\nMagnus Carlsen, born on November 30, 1990, in Tønsberg, Norway, is widely regarded as one of the greatest chess players of all time. A chess prodigy who rose to international prominence as a teenager, Carlsen has dominated the chess world with his exceptional talent, strategic brilliance, and versatile style of play. He has held the title of World Chess Champion since 2013 and is also a reigning champion in rapid and blitz formats, solidifying his reputation as the undisputed king of modern chess.\n\nIn this essay, we will explore Carlsen’s early life and rise to fame, his remarkable achievements, his unique playing style, and his contributions to the global chess community.\n\nEarly Life and Chess Beginnings\nSven Magnus Øen Carlsen was introduced to chess at the age of five by his father, Henrik Carlsen. His early interest in logical games and puzzles, combined with a prodigious memory

In [None]:
import os
import google.generativeai as genai
# Read the Gemini key from the GOOGLE_API_KEY environment variable so the real
# key never has to be stored in the notebook; the placeholder is only a
# fallback that keeps the previous behaviour when the variable is not set.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", "your-gemini-api-key"))

In [None]:
def implement_rag(query, k=2, model_name='gemini-1.5-flash'):
  """Answer `query` with a Gemini model, grounded in retrieved documents.

  The module-level `retriever` is queried, the first `k` results are kept and
  their text is placed in the prompt as context for the model.

  Args:
    query: The user's question.
    k: Maximum number of retrieved documents to include in the prompt. If the
      retriever returns fewer than `k` documents, all of them are used.
    model_name: Name of the Gemini model to call.

  Returns:
    The text of the model's response.

  Raises:
    ValueError: If `k` is negative.
  """
  # A negative k would slice from the end of the result list and silently keep
  # an unintended subset of documents.
  if k < 0:
    raise ValueError(f"k must be non-negative, got {k}")

  top_k_relevant_documents = retriever.invoke(query)[:k]

  # Put the documents' text into the prompt rather than the Python repr of the
  # list, which would add `Document(metadata=...)` wrappers and escaped "\n"
  # sequences around the actual content.
  context = "\n\n".join(
      f"[Document {position}]\n{document.page_content}"
      for position, document in enumerate(top_k_relevant_documents, start=1)
  )

  prompt = f'''You are a highly knowledgeable assistant that answers questions based on provided context.
  Use the context below to generate an accurate and concise response to the query.
  If the context does not contain enough information, respond with "I don't know" or suggest a follow-up.
  ### Query:
  {query}

  ### Relevant Documents:
  {context}

  ### Instructions:
  - Base your answer only on the relevant documents provided.
  - If there are conflicting pieces of information, prioritize the most detailed and credible one.
  - Provide a detailed explanation if required.
  - Do not include information outside the context.

  ### Response:
  '''
  model = genai.GenerativeModel(model_name)
  response = model.generate_content(prompt)
  generated_content = response.text

  return generated_content

In [None]:
# Read a question from the user and run it through the RAG pipeline; the
# returned answer string is shown as the cell output.
query = input("Enter your question: ")
implement_rag(query)

Enter your question: When was Hikaru born?


ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 2706.62ms
ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 1847.62ms
ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 1491.26ms
ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 2525.66ms
ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 3489.91ms
ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 1340.18ms
ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 2732.01ms
ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:

'Hikaru Nakamura was born on December 9, 1987, in Hirakata, Japan.\n'

In [None]:
# Read a question from the user and run it through the RAG pipeline; the
# returned answer string is shown as the cell output.
query = input("Enter your question: ")
implement_rag(query)

Enter your question: What is the most common follow up in the Caro Kann Defense played by white?


"The provided text describes several variations of the Caro-Kann Defense, but it does not state which follow-up move by White is most common.  Therefore, I don't know.\n"

In [None]:
# Read a question from the user and run it through the RAG pipeline; the
# returned answer string is shown as the cell output.
query = input("Enter your question: ")
implement_rag(query)

Enter your question: How does the Caro Kann defense opening begin?


ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 2223.40ms


'The Caro-Kann Defense begins with the moves 1. e4 c6.\n'

In [None]:
# Read a question from the user and run it through the RAG pipeline; the
# returned answer string is shown as the cell output.
query = input("Enter your question: ")
implement_rag(query)

Enter your question: what are the first 4 moves played in the King's Indian Defense?


ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 2322.66ms
ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 1566.17ms


"The first four moves of the King's Indian Defense are 1. d4 Nf6 2. c4 g6.\n"