In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
import chromadb
from PyPDF2 import PdfReader
import re, uuid, time, random
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from IPython.display import Image

In [10]:
# rename data to be descriptive and strict
INPUT_FILE = "reference_document.pdf"
reader = PdfReader(INPUT_FILE)
data = ""

for i in range(len(reader.pages)):
    data += reader.pages[i].extract_text()

In [50]:
CHUNK_SIZE = 400
CHUNK_OVERLAP = 30

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
texts = text_splitter.split_text(data)

In [51]:
client = chromadb.Client()
vstore = client.get_or_create_collection(name="football", metadata={"hnsw:space": "cosine"})
start = time.time()
vstore.add(documents= texts, metadatas=[None]*len(texts), ids=[str(uuid.uuid1()) for _ in texts])
end = time.time()

In [49]:
client.delete_collection("football")

In [48]:
minilm_results = {}

search_k = 5
docs = vstore.query(query_texts=["Who won the 2024 super bowl?"], n_results=search_k, include=['documents'])
strings = ''
for i, doc in enumerate(docs['documents'][0]):
    strings += f"-Document {str(i)}:"
    strings += doc
    strings += "\n\n"
minilm_results["Who won the 2024 super bowl?"] = strings

In [42]:
for pair in minilm_results.items():
    print(pair[0])
    print('\n')
    print(pair[1])
    print('\n')

Who won the 2024 super bowl?


-Document 0:marked the third straight year that the Super Bowl had been played in the  Western United States , 
following host cities  Inglewood, California , in 2022 and  Glendale, Arizona , in 2023.

-Document 1:City Chiefs  defeated the  National Football Conference  (NFC) champion  San Francisco 49ers  25–22 
in overtime. The Chiefs became the first team to win back -to-back Super Bowls since the  New

-Document 2:Super Bowl LVIII  was an  American football  game played to determine the champion of the  National

-Document 3:this game established them as a  dynasty .[9] It was the second Super Bowl to be decided in  overtime ,

-Document 4:the first being  Super Bowl LI , seven years earlier.[10][11] Chiefs quarterback  Patrick Mahomes  was 
named  Super Bowl Most Valuable Player  (MVP),  completing  34 of 46 passes for 333 yards,






# Use RAG with a LLM

In [1]:
import os
from dotenv import load_dotenv
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams

In [2]:
#config Watsonx.ai environment
load_dotenv()
api_key = os.getenv("WXAI_APIKEY", None)
ibm_cloud_url = os.getenv("WXAI_URL", None)
project_id = os.getenv("WXAI_PROJECT_ID", None)

creds = {"url":ibm_cloud_url,
         "apikey": api_key}

In [5]:
params = {
        GenParams.DECODING_METHOD: 'greedy',
        GenParams.MIN_NEW_TOKENS: 40,
        GenParams.MAX_NEW_TOKENS: 75,
#        GenParams.RANDOM_SEED: 42,
#        GenParams.TEMPERATURE: .8,
        GenParams.REPETITION_PENALTY: 1.0,
    }

model = Model(model_id='ibm/granite-13b-chat-v2', params=params,
              credentials=creds, project_id=project_id)

### Generation without RAG

In [8]:
print(model.generate_text("Who won the 2024 super bowl?"))


The Super Bowl LVI is scheduled to take place on February 13, 2022, at SoFi Stadium in Inglewood, California. As of now, the teams have not been determined, but the American Football Conference (AFC) and the National Football Conference (NFC) champions will face off in this annual championship game. The winner of the Super Bowl L


### Generation with RAG

In [53]:
def rag_function(query, vstore, llm, doc_k=3):
    """
    Perform RAG on a user Query
    """
    docs = vstore.query(query_texts=[query],
                        n_results=doc_k, include=['documents'])
    strings = ''
    for i, doc in enumerate(docs['documents'][0]):
        strings += f"-Document {str(i)}:"
        strings += doc
        strings += "\n\n"
    prompt = f"""<|system|>
    You are Granite Chat, an AI language model developed by IBM. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior. Use the provided context to answer a user question.
    Context: {strings}
    <|user|>
    Query: {query}
    <|assistant|>"""
    return llm.generate_text(prompt)

In [33]:
q = "Who won the 2024 super bowl?"

In [54]:
print(rag_function(q, vstore, model))



Response: The Kansas City Chiefs won the 2024 Super Bowl. This information can be inferred from the context provided in Document 0, which states that the Kansas City Chiefs defeated the San Francisco 49ers in Super Bowl LVIII (2024) with a score of 25-22.
