In [6]:
import pandas as pd
import numpy as np
import csv
import sys

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.vectorstores import Chroma 
from langchain.document_loaders.csv_loader import CSVLoader

In [None]:
# Read the merged business/review table from disk and show its (rows, columns) size.
merged_csv_path = r'C:\Users\daksh\Downloads\merged_sorted_data.csv'
df = pd.read_csv(merged_csv_path)
df.shape

In [None]:
%%time
# Build a dict keyed by business_id that aggregates every review row of a
# business: the business-level columns are copied from the first row seen,
# while the ratings and review texts of all rows are collected into lists.
business_data = {}
for _, row in df.iterrows():
    business_id = row['business_id']

    entry = business_data.get(business_id)
    if entry is None:
        # First row for this business. The rating accumulators always start
        # as lists (never the scalar 0) so that later rows can append to them,
        # and each rating column is null-checked against itself.
        entry = {
            'name': row['name'],
            'address': row['address'],
            'city': row['city'],
            'state': row['state'],
            'postal_code': row['postal_code'],
            'latitude': row['latitude'],
            'longitude': row['longitude'],
            'stars_x': [],
            'review_count': row['review_count'],
            'is_open': row['is_open'],
            'attributes': row['attributes'],
            'categories': row['categories'],
            'hours': row['hours'],
            'user_reviews': [],
            'stars_y': [],
        }
        business_data[business_id] = entry

    # Same accumulation for the first and every following row of a business;
    # missing ratings are skipped so they cannot poison the mean later on.
    if pd.notnull(row['stars_x']):
        entry['stars_x'].append(row['stars_x'])

    if pd.notnull(row['stars_y']):
        entry['stars_y'].append(row['stars_y'])

    entry['user_reviews'].append(row['text'])

In [None]:
# Turn the per-business dictionary into a table: each dictionary key becomes
# one row (the key is the row index) and the inner keys become the columns.
aggregated_df = pd.DataFrame.from_dict(
    data=business_data,
    orient='index',
)

In [None]:
# calculates the mean for business ratings
def calculate_mean(stars_list):
    """Return the arithmetic mean of ``stars_list`` rounded to two decimals.

    Any falsy argument (for example an empty list or ``0``) yields ``None``
    instead of a number.
    """
    if not stars_list:
        return None

    total = sum(stars_list)
    count = len(stars_list)
    return round(total / count, 2)
# Replace each collected list of ratings with its rounded mean, for both
# rating columns.
for rating_column in ('stars_x', 'stars_y'):
    aggregated_df[rating_column] = aggregated_df[rating_column].apply(calculate_mean)

In [None]:
# Write the aggregated table to 'aggregated_data.csv' in the current working
# directory. index=False means the row index (the business_id keys) is not
# written to the file.
aggregated_df.to_csv(
    path_or_buf='aggregated_data.csv',
    index=False,
)

In [None]:
# loading csv file and splitting it in to chunks
# Raise the csv module's per-field size limit as far as the platform allows.
# The limit is stored in a C long; where that type is 32-bit (e.g. Windows)
# sys.maxsize does not fit and csv.field_size_limit raises OverflowError, so
# fall back to the largest value a 32-bit signed long can hold.
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    csv.field_size_limit(2**31 - 1)

# Load the aggregated CSV as LangChain documents, then cut them into chunks of
# at most 1000 characters with 150 characters of overlap.
aggregated_csv_path = r"C:\Users\daksh\Downloads\Recsys_using_RAG_ChatBot-main\aggregated_data.csv"
loader = CSVLoader(
    file_path=aggregated_csv_path,
    encoding='UTF-8',
)
data = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
)
docs = text_splitter.split_documents(data)

In [None]:
# Sentence-transformer embedding model used to vectorise the CSV chunks.
# It is configured to run on the CPU and to leave the vectors un-normalised.
modelPath = "sentence-transformers/all-MiniLM-l6-v2"
model_args = dict(device='cpu')
encode_args = dict(normalize_embeddings=False)

embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,
    model_kwargs=model_args,
    encode_kwargs=encode_args,
)

In [None]:
%%time
# Embed every chunk and index it in a Chroma vector store (no persist
# directory is given here).
db = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
)

In [None]:
# Natural-language request used as the search text by the retrieval cells
# that follow (MMR retriever and both similarity searches).
query = "I want to have a vegan food around santa Barbara"

In [None]:
# Retrieve chunks for `query` using maximal marginal relevance (MMR).
# The number of documents to return is a retriever setting and must be passed
# as search_kwargs={"k": ...}; `num_results` is not a parameter of
# get_relevant_documents, so passing it there never changed the result count.
retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 7})
results = retriever.get_relevant_documents(query)
results

In [None]:
%%time
# Build the vector store again, this time backed by the ./chroma_db_1
# directory, and persist it so it can be reopened later.
db2 = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory="./chroma_db_1",
)
db2.persist()

In [None]:
# Reopen the store saved under ./chroma_db_1 and fetch the 5 best
# (document, score) matches for `query`, stored in reverse of the returned order.
db3 = Chroma(
    persist_directory="./chroma_db_1",
    embedding_function=embeddings,
)
result = list(reversed(db3.similarity_search_with_score(query, k=5)))

In [None]:
# Fetch the 5 best-matching documents for `query`, in reverse of the returned
# order. NOTE: this rebinds `docs`, which previously held the split chunks.
docs = list(reversed(db3.similarity_search(query, k=5)))
docs

In [None]:
# Load the aggregated table used to look up business details for retrieved rows.
# The path must be a raw string: in a normal string literal "\U" starts an
# 8-digit unicode escape, which makes this Windows path a SyntaxError.
df_ret = pd.read_csv(r'C:\Users\daksh\Downloads\Recsys_using_RAG_ChatBot-main\aggregated_data.csv')

In [None]:
from langchain import PromptTemplate

# Template that renders one business (name, address, hours, rating) as a
# short text block for the prompt.
prompt_template = PromptTemplate(
    template="""Business Review:\n"""
              """Name: {Name}\n"""
              """Address: {Address}, {City}, {State}, {PostalCode}\n"""
              """Hours: {Hours}\n"""
              """Rating: {Stars} stars\n""",
    input_variables=["Name", "Address", "City", "State", "PostalCode", "Hours", "Stars"]
)

combined_reviews = ""

# Several retrieved chunks can carry the same 'row' metadata value; remember
# the rows already rendered so each business appears only once in the prompt.
seen_rows = set()

for doc in docs:
    row_value = doc.metadata.get('row')

    # Skip documents without a row pointer and rows that were already added.
    if row_value is None or row_value in seen_rows:
        continue
    seen_rows.add(row_value)

    # One positional lookup per business instead of one per field.
    business = df_ret.iloc[row_value]
    business_fields = {
        "Name": business['name'],
        "Address": business['address'],
        "City": business['city'],
        "State": business['state'],
        "PostalCode": business['postal_code'],
        "Hours": business['hours'],
        "Stars": business['stars_y'],
    }

    # Render the entry and append it, separated by a blank line.
    combined_reviews += prompt_template.format(**business_fields) + "\n"

# Append the instruction at the end of the combined reviews
final_prompt = combined_reviews + "You are a smart recommender system, Please provide a recommendation based on this business information.\nRecommend places from suggested additional context only and from file aggregated_data.csv \nDo not suggest places on your own\n Do not mention aggregated_data.csv file in your response and your response must suggest all Business Reviews included in prompt"

print(final_prompt)


In [None]:
import requests
def nvidia_api_call(query, api_key, invoke_url, fetch_url_format, timeout=120):
    """Send ``query`` as a single user message and return the model's reply text.

    The payload is POSTed to ``invoke_url``. While the service answers with
    HTTP 202, the request id found in the ``NVCF-REQID`` response header is
    appended to ``fetch_url_format`` and that URL is fetched with GET until a
    different status code comes back.

    Args:
        query: Text placed in the ``content`` field of the user message.
        api_key: Token sent as ``Authorization: Bearer <api_key>``.
        invoke_url: URL that receives the initial POST request.
        fetch_url_format: URL prefix; the request id is appended to it to
            build the polling URL.
        timeout: Timeout in seconds handed to every HTTP request so a stalled
            connection cannot hang the notebook forever.

    Returns:
        The ``choices[0]['message']['content']`` value of the JSON response.

    Raises:
        RuntimeError: If a 202 response does not carry an ``NVCF-REQID`` header.
        Exception: Whatever ``response.raise_for_status()`` raises for an
            error status on the final response.
    """
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Accept": "application/json",
    }

    payload = {
        "messages": [
            {
                "content": query,
                "role": "user"
            }
        ],
        "temperature": 0.2,
        "top_p": 0.7,
        "max_tokens": 1024,
        "stream": False
    }

    # The context manager closes the session (and its pooled connections)
    # even when a request or the response parsing raises.
    with requests.Session() as session:
        response = session.post(invoke_url, headers=headers, json=payload, timeout=timeout)

        # 202 means the result is not ready yet: poll the status URL.
        while response.status_code == 202:
            request_id = response.headers.get("NVCF-REQID")
            if not request_id:
                # Without the id no polling URL can be built; fail clearly
                # instead of with a TypeError from str + None.
                raise RuntimeError("202 response did not include an NVCF-REQID header")
            fetch_url = fetch_url_format + request_id
            response = session.get(fetch_url, headers=headers, timeout=timeout)

        response.raise_for_status()
        response_body = response.json()

    return response_body['choices'][0]['message']['content']

In [None]:
import os
from getpass import getpass

# SECURITY: never store the API key in the notebook. It is read from the
# NVIDIA_API_KEY environment variable; when that is unset (or empty) the user
# is prompted for it without echoing. The key that used to be hard-coded here
# must be treated as leaked and revoked.
api_key = os.environ.get("NVIDIA_API_KEY") or getpass("NVIDIA API key: ")

# Endpoint that receives the chat request, and the URL prefix to which a
# request id is appended when polling for a pending result.
invoke_url = "https://api.nvcf.nvidia.com/v2/nvcf/pexec/functions/0e349b44-440a-44e1-93e9-abe8dcb27158"
fetch_url_format = "https://api.nvcf.nvidia.com/v2/nvcf/pexec/status/"

In [None]:
# Send the assembled prompt to the model endpoint and print the returned text.
recommendations = nvidia_api_call(
    query=final_prompt,
    api_key=api_key,
    invoke_url=invoke_url,
    fetch_url_format=fetch_url_format,
)
print(recommendations)