# Loading Data

Reference - https://python.langchain.com/docs/tutorials/rag/

In [31]:
!git clone https://github.com/CalvQ/RAGnaRec.git
import os

fatal: destination path 'RAGnaRec' already exists and is not an empty directory.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [2]:
!pip install --quiet --upgrade langchain langchain-community langchain-chroma
!pip install -U langchain langchain-openai
!pip install sentence-transformers langchain chromadb

Collecting langchain-openai
  Downloading langchain_openai-0.2.9-py3-none-any.whl.metadata (2.6 kB)
Collecting openai<2.0.0,>=1.54.0 (from langchain-openai)
  Downloading openai-1.55.0-py3-none-any.whl.metadata (24 kB)
Collecting tiktoken<1,>=0.7 (from langchain-openai)
  Downloading tiktoken-0.8.0-cp310-cp310-macosx_10_9_x86_64.whl.metadata (6.6 kB)
Collecting distro<2,>=1.7.0 (from openai<2.0.0,>=1.54.0->langchain-openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from openai<2.0.0,>=1.54.0->langchain-openai)
  Downloading jiter-0.7.1-cp310-cp310-macosx_10_12_x86_64.whl.metadata (5.2 kB)
Downloading langchain_openai-0.2.9-py3-none-any.whl (50 kB)
Downloading openai-1.55.0-py3-none-any.whl (389 kB)
Downloading tiktoken-0.8.0-cp310-cp310-macosx_10_9_x86_64.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading distro-1.9.0-py3-none-any.whl (20 kB)






Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-3.3.1


In [4]:
!pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2


In [11]:

# Import packages
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from sklearn.preprocessing import normalize
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import random

import getpass
import os

# os.environ["LANGCHAIN_TRACING_V2"] = "true"
# os.environ["OPENAI_API_KEY"] = getpass.getpass()

from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DataFrameLoader
from transformers import pipeline

In [6]:
!pip install pyarrow fastparquet huggingface_hub matplotlib seaborn



In [8]:
splits = {'train': 'yelp_review_full/train-00000-of-00001.parquet', 'test': 'yelp_review_full/test-00000-of-00001.parquet'}
df_train = pd.read_parquet("hf://datasets/Yelp/yelp_review_full/" + splits["train"])
df_test = pd.read_parquet("hf://datasets/Yelp/yelp_review_full/" + splits["test"])

In [16]:
df = pd.concat([df_train, df_test], ignore_index=True)
index_list = range(len(df))
random.seed(10701)
indices = random.sample(index_list, 500)
df = df.iloc[indices] # using only 10,000 rows for PoC to make training quicker

# Retrieval Component

In [17]:
loader = DataFrameLoader(df, page_content_column = "text")

In [18]:
# load documents
docs = loader.load()
# split text
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
splits = text_splitter.split_documents(docs)

In [19]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(documents = splits, embedding = embeddings)
retriever = vectorstore.as_retriever(search_type = 'similarity')

In [20]:
# HuggingFace pipline
llm_pipeline = pipeline("text-generation", model = "distilgpt2")

In [21]:
prompt = "Great Italian restaurant with excellent pasta"
retrieved_docs = retriever.get_relevant_documents(prompt)
user_review = prompt

  retrieved_docs = retriever.get_relevant_documents(prompt)


In [22]:
# Display retrieved results
context = ''
print("Retrieved reviews:")
for i, doc in enumerate(retrieved_docs[:5]):
    print(f"Review {i+1}:\n{doc.page_content}\n")
    context += f"Review {i+1}:\n{doc.page_content}\n"

Retrieved reviews:
Review 1:
Consequently, one MUST sit in the dining area for the corresponding restaurant. \n\nNow onto the food.  \n\nThe wine list is extensive but only includes Italian selections.  Most of the servers are knowledgeable and can recommend pairings. \n\nThe appetizers are decent but are not worth the ~$10-$15 cost.  \n\nI've sampled both the pastas and the pizzas.  The pastas are underwhelming in terms of originality, taste, and size. Entrees come with bread and oil for dipping, but the bread is too salty for my enjoyment.  I will never understand why carb-loaded restaurants serve bread with their offerings.  It seems redundant. \n\nPizzas are the restaurant's specialty (at a place called Il Pizzaiolo? No way!).  The wood-fired pies are good, and vary from the standard Margherita  to the more loaded Santa Lucia.  But to be honest, all of these pizzas lack imagination. I understand that Pizzaiolo is going for more \"classic\" recipes but for the prices why not throw i

In [23]:
response_prompt = (
    "Based on the following reviews, generate a concise recommendation for a great Italian restaurant with excellent pasta. "
    "The recommendation should mention one restaurant name, the highlights, and why it stands out:\n\n"
    f"{retrieved_docs}\n\n"  # Include retrieved reviews for context
    "Please generate a clear and friendly recommendation."
)

In [24]:
generated_response = llm_pipeline(response_prompt, max_new_tokens=50, num_return_sequences=1)
print("Generated recommendation:")
print('Generated response: ', generated_response[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated recommendation:
Generated response:  Based on the following reviews, generate a concise recommendation for a great Italian restaurant with excellent pasta. The recommendation should mention one restaurant name, the highlights, and why it stands out:

[Document(metadata={'label': 1}, page_content='Consequently, one MUST sit in the dining area for the corresponding restaurant. \\n\\nNow onto the food.  \\n\\nThe wine list is extensive but only includes Italian selections.  Most of the servers are knowledgeable and can recommend pairings. \\n\\nThe appetizers are decent but are not worth the ~$10-$15 cost.  \\n\\nI\'ve sampled both the pastas and the pizzas.  The pastas are underwhelming in terms of originality, taste, and size. Entrees come with bread and oil for dipping, but the bread is too salty for my enjoyment.  I will never understand why carb-loaded restaurants serve bread with their offerings.  It seems redundant. \\n\\nPizzas are the restaurant\'s specialty (at a place

In [25]:
print((generated_response)[0])

{'generated_text': 'Based on the following reviews, generate a concise recommendation for a great Italian restaurant with excellent pasta. The recommendation should mention one restaurant name, the highlights, and why it stands out:\n\n[Document(metadata={\'label\': 1}, page_content=\'Consequently, one MUST sit in the dining area for the corresponding restaurant. \\\\n\\\\nNow onto the food.  \\\\n\\\\nThe wine list is extensive but only includes Italian selections.  Most of the servers are knowledgeable and can recommend pairings. \\\\n\\\\nThe appetizers are decent but are not worth the ~$10-$15 cost.  \\\\n\\\\nI\\\'ve sampled both the pastas and the pizzas.  The pastas are underwhelming in terms of originality, taste, and size. Entrees come with bread and oil for dipping, but the bread is too salty for my enjoyment.  I will never understand why carb-loaded restaurants serve bread with their offerings.  It seems redundant. \\\\n\\\\nPizzas are the restaurant\\\'s specialty (at a pla

# Generation

Look into ChatPrompTemplate to make a custom prompt https://api.python.langchain.com/en/latest/prompts/langchain_core.prompts.chat.ChatPromptTemplate.html

Models:
Google Gemma [google/gemma-2-2b-it](https://huggingface.co/google/gemma-2-2b-it)

In [26]:
YELP_PROMPT_TEMPLATE = f"""The following are customer reviews of a business:

BEGIN REVIEWS

{context}

END REVIEWS

Based on the above reviews, provide a personalized recommendation to a user who wrote this review: "{user_review}".

Follow the guidelines below when answering the question:
1. If something is mentioned in multiple reviews, it is important and more likely to be preferred in the answer.
2. If the reviews are of a restaurant, bar, or any food establishment, use specific names of dishes, drinks, and desserts.
"""

print(YELP_PROMPT_TEMPLATE)


def get_template(user_review, context):

  YELP_PROMPT_TEMPLATE = f"""The following are customer reviews of a business:

  BEGIN REVIEWS

  {context}

  END REVIEWS

  Based on the above reviews, provide a personalized recommendation to a user who wrote this review: "{user_review}".

  Follow the guidelines below when answering the question:
  1. If something is mentioned in multiple reviews, it is important and more likely to be preferred in the answer.
  2. If the reviews are of a restaurant, bar, or any food establishment, use specific names of dishes, drinks, and desserts.
  """


  return (YELP_PROMPT_TEMPLATE)


The following are customer reviews of a business:

BEGIN REVIEWS

Review 1:
Consequently, one MUST sit in the dining area for the corresponding restaurant. \n\nNow onto the food.  \n\nThe wine list is extensive but only includes Italian selections.  Most of the servers are knowledgeable and can recommend pairings. \n\nThe appetizers are decent but are not worth the ~$10-$15 cost.  \n\nI've sampled both the pastas and the pizzas.  The pastas are underwhelming in terms of originality, taste, and size. Entrees come with bread and oil for dipping, but the bread is too salty for my enjoyment.  I will never understand why carb-loaded restaurants serve bread with their offerings.  It seems redundant. \n\nPizzas are the restaurant's specialty (at a place called Il Pizzaiolo? No way!).  The wood-fired pies are good, and vary from the standard Margherita  to the more loaded Santa Lucia.  But to be honest, all of these pizzas lack imagination. I understand that Pizzaiolo is going for more \"class

In [32]:
os.chdir(f'./RAGnaREC')
!git pull

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


remote: Enumerating objects: 18, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 14 (delta 9), reused 14 (delta 9), pack-reused 0 (from 0)[K
Unpacking objects: 100% (14/14), 1.30 MiB | 7.82 MiB/s, done.
From https://github.com/CalvQ/RAGnaRec
   01e13b6..f7d6d9e  master     -> origin/master
Updating 01e13b6..f7d6d9e
Fast-forward
 clustering/GuidedLDA.ipynb                      | 297 [32m+++++++++[m[31m---------------[m
 clustering/results/model_7_1000_7_20_0.5.pickle | Bin [31m0[m -> [32m2346999[m bytes
 clustering/results/result_15_500_7_20_0.15.txt  | 122 [32m++++++++++[m
 clustering/results/result_15_500_7_20_0.5.txt   | 128 [32m++++++++++[m
 clustering/results/result_15_500_7_20_0.85.txt  | 128 [32m++++++++++[m
 clustering/results/result_25_500_7_20_0.15.txt  | 202 [32m++++++++++++++++[m
 6 files changed, 692 insertions(+), 185 deletions(-)
 create mode 100644 clustering/results/model_7_1000_7

In [35]:
import pickle as pkl

In [36]:
with open('clustering/results/model_7_1000_7_20_0.5.pickle', 'rb') as file:
    model = pkl.load(file)

In [38]:
def create_matrix(processed_text, vocabulary=vocab_list):
    vectorizer = CountVectorizer(vocabulary=vocabulary)
    dtm = vectorizer.fit_transform(processed_text)
    return dtm, vectorizer.vocabulary_

NameError: name 'vocab_list' is not defined

In [37]:
model.transform(create_matrix("great italian restaurant excellent pasta"))

NameError: name 'create_matrix' is not defined

In [30]:
!huggingface-cli login

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): Traceback (most recent call last):
  File "/Users/pranav/opt/miniconda3/envs/ragnarec/bin/huggingface-cli", line 8, in <module>
    sys.exit(main())
  File "/Users/pranav/opt/miniconda3/envs/ragnarec/lib/python3.10/site-packages/huggingface_hub/commands/huggingface_cli.py", 

In [28]:
# from langchain import HuggingFaceHub
# from langchain import PromptTemplate, LLMChain


# prompt = PromptTemplate(template=YELP_PROMPT_TEMPLATE, input_variables=["context", "user_review"])

# os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass.getpass()
# llm_chain = LLMChain(prompt=prompt,
#                      llm=HuggingFaceHub(repo_id="google/flan-t5-xl", model_kwargs={"temperature":1e-10, "max_length": 100}))

# question = "Great Italian restaurant with excellent pasta"
# response = llm_chain.run(user_review=user_review, context=context)
# print(response)

In [29]:
import torch
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="google/gemma-2-2b-it",
    model_kwargs={"torch_dtype": torch.bfloat16},
    # device="cuda",  # replace with "mps" to run on a Mac device
)



OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/google/gemma-2-2b-it.
401 Client Error. (Request ID: Root=1-673e62a0-41c977851927a4c628ce8c4a;e17ffb78-2889-4c4f-b5cb-d1dff9be289e)

Cannot access gated repo for url https://huggingface.co/google/gemma-2-2b-it/resolve/main/config.json.
Access to model google/gemma-2-2b-it is restricted. You must have access to it and be authenticated to access it. Please log in.

In [None]:
messages = [
    {"role": "user", "content": YELP_PROMPT_TEMPLATE},
]

outputs = pipe(messages, max_new_tokens=256)
assistant_response = outputs[0]["generated_text"][-1]["content"].strip()
print(assistant_response)


# **Testing End-to-End Conversational Review Generation**

User review --> Find similar reviews (context) --> Fill in template with review, context --> Pass to model

In [None]:
def get_prompt(user_review):
  retrieved_docs = retriever.get_relevant_documents(user_review)
  context = ''
  for i, doc in enumerate(retrieved_docs[:5]):
      context += f"Review {i+1}:\n{doc.page_content}\n"
  return get_template(user_review, context)

good_review = 'Great Italian restaurant with excellent pasta'
# bad_review = "I hate Italian food. I hate pizza. Cheese stinks. Pasta is disgusting. The service at the Italian restaurant sucked. I wish I never have to eat Italian food in my life again."
messages = [
    {"role": "user", "content": get_prompt(bad_review)},
]

outputs = pipe(messages, max_new_tokens=256)

assistant_response = outputs[0]["generated_text"][-1]["content"].strip()
print(assistant_response)
