<a href="https://colab.research.google.com/github/Darshit2003/RAG_Learning/blob/main/RAG_implementation_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
corpus_document = [
    "The quick brown fox jumps over the lazy dog.",
    "Natural language processing is a field of artificial intelligence.",
    "AI and machine learning are revolutionizing data analysis.",
    "A dog is a man's best friend.",
    "Data science students learn Python and R.",
    "The fox lives in a den in the forest."
]

In [2]:
corpus_document

['The quick brown fox jumps over the lazy dog.',
 'Natural language processing is a field of artificial intelligence.',
 'AI and machine learning are revolutionizing data analysis.',
 "A dog is a man's best friend.",
 'Data science students learn Python and R.',
 'The fox lives in a den in the forest.']

In [3]:
user_query = "I am an indian and i live in india"

In [4]:
document = "India is a country for indians and for everyone"

In [5]:
from collections import Counter
import math

In [6]:
query_tokens = user_query.lower().split(" ")
query_tokens

['i', 'am', 'an', 'indian', 'and', 'i', 'live', 'in', 'india']

In [7]:
document_tokens = document.lower().split(" ")
document_tokens

['india', 'is', 'a', 'country', 'for', 'indians', 'and', 'for', 'everyone']

In [8]:
query_counter = Counter(query_tokens)
query_counter

Counter({'i': 2,
         'am': 1,
         'an': 1,
         'indian': 1,
         'and': 1,
         'live': 1,
         'in': 1,
         'india': 1})

In [9]:
document_counter = Counter(document_tokens)
document_counter

Counter({'india': 1,
         'is': 1,
         'a': 1,
         'country': 1,
         'for': 2,
         'indians': 1,
         'and': 1,
         'everyone': 1})

In [10]:
my_list = []
for tokens in query_counter.keys():
    my_list.append(query_counter[tokens])

In [11]:
my_list

[2, 1, 1, 1, 1, 1, 1, 1]

In [12]:
for tokens in query_counter.keys() & document_counter.keys():
    print(tokens)

and
india


In [13]:
my_list2 = []
for tokens in query_counter.keys() & document_counter.keys():
    my_list2.append(query_counter[tokens] * document_counter[tokens])
my_list2

[1, 1]

In [14]:
dot_prod = sum(my_list2)
dot_prod

2

In [15]:
query_magnitude = math.sqrt(sum(query_counter[tokens] ** 2 for tokens in query_counter))


In [16]:
query_magnitude

3.3166247903554

In [17]:
document_magnitude = math.sqrt(sum(document_counter[tokens] ** 2 for tokens in document_counter))
document_magnitude

3.3166247903554

In [18]:
similarity_score = (dot_prod / (query_magnitude * document_magnitude))
similarity_score

0.18181818181818182

In [19]:
def cosine_similarity(query, document):
    # Tokenize and convert to lowercase
    query_tokens = query.lower().split(" ")
    document_tokens = document.lower().split(" ")

    # Create Counters for query and document
    query_counter = Counter(query_tokens)
    document_counter = Counter(document_tokens)

    # Calculate dot product
    dot_product = sum(query_counter[token] * document_counter[token] for token in query_counter.keys() & document_counter.keys())

    # Calculate magnitudes
    query_magnitude = math.sqrt(sum(query_counter[token] ** 2 for token in query_counter))
    document_magnitude = math.sqrt(sum(document_counter[token] ** 2 for token in document_counter))

    # Calculate cosine similarity
    similarity = dot_product / (query_magnitude * document_magnitude) if query_magnitude * document_magnitude != 0 else 0

    return similarity

In [20]:
user_query="is yoga good for health?"
document="yoga is bad for health"
cosine_similarity(user_query, document)

0.5999999999999999

In [21]:
def return_response(query, corpus):
    similarities = []
    for doc in corpus:
        similarity = cosine_similarity(query, doc)
        similarities.append(similarity)
    return corpus_document[similarities.index(max(similarities))]

In [22]:
corpus_document

['The quick brown fox jumps over the lazy dog.',
 'Natural language processing is a field of artificial intelligence.',
 'AI and machine learning are revolutionizing data analysis.',
 "A dog is a man's best friend.",
 'Data science students learn Python and R.',
 'The fox lives in a den in the forest.']

In [23]:
user_input = "NLP and AI?"

In [24]:
relevant_document=return_response(user_input,corpus_document)

In [25]:
relevant_document

'Data science students learn Python and R.'

In [26]:
import requests
import json
full_response = []

#Here to run this Augmentation code download ollama then do setup and then on cmd run - ollama run llama2. By default ollama will run on 'http://localhost:11434/api/generate'.

In [27]:

full_response = []
prompt = """
You are a bot that makes recommendations for activities. You answer in very short sentences and do not include extra information.
This is the recommended activity: {relevant_document}
The user input is: {user_input}
Compile a recommendation to the user based on the recommended activity and the user input.
"""

url = 'http://localhost:11434/api/generate'


data = {
    "model": "llama2",
    "prompt": prompt.format(user_input=user_input, relevant_document=relevant_document)
}

headers = {'Content-Type': 'application/json'}

response = requests.post(url, data=json.dumps(data), headers=headers, stream=True)


try:
    for line in response.iter_lines():
        # filter out keep-alive new lines
        if line:
            decoded_line = json.loads(line.decode('utf-8'))
            # print(decoded_line['response'])  # uncomment to results, token by token
            full_response.append(decoded_line['response'])
finally:
    response.close()


print(''.join(full_response))