In [1]:
import ollama
import time
import os
import json
import numpy as np
from numpy.linalg import norm
from openai import OpenAI
import time

In [3]:
def parse_file(filename):
    """Split a text file into paragraphs separated by blank lines.

    The file is opened with the "utf-8-sig" codec, so a leading BOM is
    dropped. Every line is stripped of surrounding whitespace; consecutive
    non-empty lines are joined with a single space to form one paragraph.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of paragraph strings in file order (empty when the file
        contains no non-blank lines).
    """
    paragraphs = []
    pending = []
    with open(filename, encoding="utf-8-sig") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text:
                pending.append(text)
                continue
            # Blank line: close the paragraph collected so far, if any.
            if pending:
                paragraphs.append(" ".join(pending))
                pending = []
    # Flush the last paragraph when the file does not end with a blank line.
    if pending:
        paragraphs.append(" ".join(pending))
    return paragraphs


def save_embeddings(filename, embeddings):
    """Write embeddings as JSON to ``embeddings/<filename>.json``.

    Args:
        filename: Cache key used to build the output path. It may contain
            directory components (e.g. ``"docs/notes.txt"``); the matching
            sub-directories under ``embeddings/`` are created as needed.
        embeddings: JSON-serialisable object to store (the caller in this
            notebook passes a list of float lists).

    Raises:
        OSError: If the directory or file cannot be created.
        TypeError: If ``embeddings`` is not JSON-serialisable.
    """
    path = f"embeddings/{filename}.json"
    # Create the directory that will actually hold the file. exist_ok=True
    # replaces the racy "check, then create" sequence and makes repeated
    # calls safe.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # dump embeddings to json
    with open(path, "w") as f:
        json.dump(embeddings, f)




def load_embeddings(filename):
    """Read cached embeddings from ``embeddings/<filename>.json``.

    Args:
        filename: Cache key used to build the path of the JSON file.

    Returns:
        The decoded JSON content when the file exists, otherwise the
        sentinel ``False``.
    """
    cache_path = f"embeddings/{filename}.json"
    if os.path.exists(cache_path):
        with open(cache_path, "r") as cache_file:
            return json.load(cache_file)
    # Nothing cached under this name: report a miss with False.
    return False



def get_embeddings(filename, modelname, chunks):
    """Return one embedding per chunk, reusing the on-disk cache when valid.

    The cache is looked up by ``filename`` via ``load_embeddings``. A cached
    result is reused only when it holds exactly one entry per chunk;
    otherwise the embeddings are recomputed with ``ollama.embeddings`` and
    written back with ``save_embeddings``.

    Args:
        filename: Cache key passed to ``load_embeddings``/``save_embeddings``.
        modelname: Name of the ollama embedding model.
        chunks: Iterable of text chunks to embed.

    Returns:
        A list with one embedding per chunk, in chunk order.
    """
    # Materialise once so the length check also works for generators.
    chunks = list(chunks)

    # check if embeddings are already saved
    cached = load_embeddings(filename)
    # A cache with a different number of entries was built from different
    # text; using it would pair similarity scores with the wrong (or
    # non-existent) chunk index, so treat it as a miss.
    # NOTE(review): the cache key does not include `modelname`, so switching
    # models with the same chunk count still reuses old vectors.
    if cached is not False and len(cached) == len(chunks):
        return cached

    # get embeddings from ollama
    embeddings = [
        ollama.embeddings(model=modelname, prompt=chunk)["embedding"]
        for chunk in chunks
    ]
    # save embeddings
    save_embeddings(filename, embeddings)
    return embeddings

def find_most_similar(needle, haystack):
    """Rank the vectors in ``haystack`` by cosine similarity to ``needle``.

    Args:
        needle: Query vector (sequence of numbers).
        haystack: Sequence of vectors with the same length as ``needle``.

    Returns:
        A list of ``(score, index)`` tuples sorted in descending order, where
        ``index`` is the position of the vector in ``haystack``. Equal scores
        are ordered by descending index. A pair in which either vector has
        zero norm is given a score of ``0.0``.
    """
    needle_norm = norm(needle)

    similarity_scores = []
    for item in haystack:
        denominator = needle_norm * norm(item)
        if denominator == 0:
            # Cosine similarity is undefined for a zero vector; dividing
            # would produce NaN, which makes the sort order below undefined.
            similarity_scores.append(0.0)
        else:
            similarity_scores.append(np.dot(needle, item) / denominator)

    return sorted(zip(similarity_scores, range(len(haystack))), reverse=True)


def rag_function(
    prompt,
    filename="data.txt",
    embed_model="nomic-embed-text",
    chat_model="mistral-nemo:latest",
    top_k=5,
):
    """Answer ``prompt`` from the most similar paragraphs of a text file.

    The file is split into paragraphs, each paragraph is embedded (or loaded
    from the cache), the ``top_k`` paragraphs most similar to the prompt are
    appended to the system prompt, and the chat model is asked the question.
    The answer and the elapsed time are printed.

    Args:
        prompt: The user question.
        filename: Text file to read; also used as the embeddings cache key.
        embed_model: ollama model used for paragraph and prompt embeddings.
        chat_model: ollama model used to generate the answer.
        top_k: Number of most similar paragraphs passed as context.

    Returns:
        None. Results are written to standard output.
    """

    SYSTEM_PROMPT = """You are a helpful reading assistant who answers questions
        based on snippets of text provided in context. Answer only using the context provided,
        being as concise as possible. If you're unsure, just say that you don't know.
        Do not give answers that are outside the context given.
        Answer in about 150 words for each prompt.
        Context:
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()

    paragraphs = parse_file(filename)
    embeddings = get_embeddings(filename, embed_model, paragraphs)

    # Embed the question with the same model so the vectors are comparable.
    prompt_embedding = ollama.embeddings(model=embed_model, prompt=prompt)["embedding"]
    most_similar_chunks = find_most_similar(prompt_embedding, embeddings)[:top_k]

    # Each item is (score, paragraph_index); only the index is needed here.
    context = "\n".join(paragraphs[item[1]] for item in most_similar_chunks)

    response = ollama.chat(
        model=chat_model,
        messages=[
            {
                "role": "system",
                "content": SYSTEM_PROMPT + context,
            },
            {"role": "user", "content": prompt},
        ],
    )

    end_time = time.perf_counter()
    print("\n")
    print(response['message']['content'])
    print("\n")
    print(f"Execution Time {end_time-start_time} ")


In [4]:
# Straightforward question, with 27B model.
# NOTE(review): rag_function is set up with "mistral-nemo:latest"; confirm the "27B" label.
prompt="Enlist the achievements of Sachin Tendulkar"
rag_function(prompt)



**Sachin Tendulkar's Achievements:**

- **International Runs:**
  - Test: Over 15,000 runs (Highest all-time)
  - ODI: Over 18,000 runs (Highest all-time)

- **Player of the Match Awards:** Most in international cricket

- **ICC Awards:**
  - Sir Garfield Sobers Trophy for Cricketer of the Year (2010)
  - Player of the Tournament (2003 Cricket World Cup)

- **Team trophies:**
  - ICC Champions Trophy (2002, joint winner)
  - Cricket World Cup (2011)

- **Indian Government Awards:**
  - Arjuna Award (1994)
  - Khel Ratna Award (1997)
  - Padma Shri (1998)
  - Padma Vibhushan (2008)
  - Bharat Ratna (2013)

- **Other Recognitions:**
  - Wisden Cricketers' Almanack ranked him second-greatest Test and ODI batsman (2002)
  - ICC Cricket Hall of Fame inductee (2019)


Execution Time 12.142996788024902 


In [5]:
# Reasoning RAG: comparative question about the batting of two players.

prompt="Who is better batsman Sachin or Sourav?"
rag_function(prompt)



Based on the context provided, Tendulkar is regarded as one of the greatest batsmen in cricket history. He holds the record for the highest run-scorer in both ODI and Test cricket. On the other hand, Ganguly is popularly called the 'Maharaja of Indian Cricket' and is considered one of India's most successful captains. However, Tendulkar's batting records make him a better batsman compared to Ganguly.


Execution Time 4.8565287590026855 


In [11]:
# Reasoning RAG: comparative question about captaincy, constrained to a one-word answer.

prompt="Who is better Captain Dhoni or Sourav? Answer in one word only"
rag_function(prompt)



Dhoni


Execution Time 1.456542730331421 


In [8]:
# Analytical RAG: asks for a single number (difference between two run totals).
# NOTE(review): the saved output below lists both totals rather than the difference.

prompt="Difference between total runs scored by Sachin and Dhoni? Just give the number, No explanation needed"
rag_function(prompt)



Sachin Tendulkar: 34,369
MS Dhoni: 25,875


Execution Time 1.394803762435913 


In [9]:
# Analytical RAG: compares the accolades of two players.

prompt="Who has most accolades Dhoni or sachin?"
rag_function(prompt)



Dhoni has received the Khel Ratna Award (2008), Padma Shri (2009), and Padma Bhushan (2018). Sachin Tendulkar has been nominated to Rajya Sabha (2012-2018) and is the recipient of Bharat Ratna (2013). Dhoni has more awards, but Tendulkar's Bharat Ratna is India's highest civilian award.


Execution Time 5.043278694152832 


In [10]:
# Analytical RAG: three-way comparison with a one-word answer.
# The prompt does not name the three players; it relies on the retrieved context.

prompt="Amongst three, who is the best player in World cups? Answer in one word only"
rag_function(prompt)



Tendulkar


Execution Time 1.6475763320922852 


In [12]:
# Analytical RAG: three-way comparison of awards, free-form answer.

prompt="Amongst three, who has most awards?"
rag_function(prompt)



Sachin Tendulkar has received the most awards among the three: Arjuna Award (1994), Khel Ratna Award (1997), Padma Shri (1998), Bharat Ratna (2013), Sir Garfield Sobers Trophy (2010).


Execution Time 3.7773056030273438 


In [13]:
# Analytical RAG: same awards question, with the answer restricted to a name and a count.

prompt="Amongst three, who has most awards? Give name of player and number only"
rag_function(prompt)



Sachin Tendulkar: 7


Execution Time 1.2189795970916748 


In [15]:
# Summary RAG: career summary with a word limit and a bullet-point format.

prompt="Give me summary of Dhoni's career, in less than 200 words? Answer in bullet points"
rag_function(prompt)



- **Playing Career:**
  - Debuted for Bihar in first-class cricket (1999)
  - Made ODI debut for India against Bangladesh (2004), Test debut against Sri Lanka (2005)
  - Retired from Tests (2014), limited overs career ended in 2019
  - Scored over 17,000 international runs; 10,000+ ODI runs at an average above 50

- **Captaincy:**
  - Led India to victory in ICC tournaments: WT20 (2007), CWC (2011), CT (2013)
  - Led Chennai Super Kings (CSK) to IPL final 10 times, winning 5 titles (2010, 2011, 2018, 2021, 2023)

- **Awards and Recognitions:**
  - Major Dhyan Chand Khel Ratna Award (2007)
  - Padma Shri (2008), Padma Bhushan (2018)
  - Honorary rank of Lieutenant Colonel in Indian Territorial Army (2011)


Execution Time 10.450191259384155 


In [16]:
# Summary RAG: career overview with a word target and numbered bullet points.
# NOTE(review): the saved output below uses unnumbered bullets.

prompt="In about 150 words, give an overview of Sourav's career, answer in numbered bullet points"
rag_function(prompt)



- **Early Career**: Sourav Ganguly started his international cricket career in 1992 against England.
- **Captaincy Debut**: He became the captain of the Indian national team in 2000, replacing Mohammad Azharuddin.
- **Major Achievements**:
  - Led India to win the 2002 ICC Champions Trophy.
  - Reached the final of the 2003 Cricket World Cup and was named the Man of the Series.
  - Also reached the finals of the 2000 ICC Champions Trophy and the 2004 Asia Cup under his captaincy.


Execution Time 6.488038063049316 


In [17]:
# Analytical RAG: asks for the run totals of all three players, sorted in decreasing order.
# NOTE(review): the saved output below is not in decreasing order.

prompt="Enlist the total runs scored by all three players. Present the answer in ordered list in decreasing order"
rag_function(prompt)



- Sachin Tendulkar: 34,357 runs
- MS Dhoni: >5000 runs (IPL)
- Sourav Ganguly: 11363 runs


Execution Time 3.186295747756958 
