# Loading Data

Reference - https://python.langchain.com/docs/tutorials/rag/

In [None]:
!pip install --quiet --upgrade langchain langchain-community langchain-chroma
!pip install -U langchain langchain-openai
!pip install sentence-transformers langchain chromadb



In [None]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from sklearn.preprocessing import normalize
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

import getpass
import os

# os.environ["LANGCHAIN_TRACING_V2"] = "true"
# os.environ["OPENAI_API_KEY"] = getpass.getpass()

from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DataFrameLoader
from transformers import pipeline

In [None]:
!pip install pyarrow fastparquet huggingface_hub matplotlib seaborn



In [None]:
# Location of the Yelp review dataset on the Hugging Face Hub and the parquet
# file backing each split.
# The mapping is called `parquet_files` rather than `splits` because a later
# cell stores the chunked documents in `splits`; sharing the name meant that
# re-running this cell silently broke the vector-store cell.
HF_DATASET_ROOT = "hf://datasets/Yelp/yelp_review_full/"
parquet_files = {
    "train": "yelp_review_full/train-00000-of-00001.parquet",
    "test": "yelp_review_full/test-00000-of-00001.parquet",
}

# Each call reads one remote parquet file into a DataFrame.
df_train = pd.read_parquet(HF_DATASET_ROOT + parquet_files["train"])
df_test = pd.read_parquet(HF_DATASET_ROOT + parquet_files["test"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Stack the train and test frames with a fresh index, then keep a reproducible
# random subset of 1000 rows (the fixed random_state pins which rows are drawn).
df = (
    pd.concat([df_train, df_test], ignore_index=True)
    .sample(n=1000, random_state=43)
)

# Retrieval Component

In [None]:
# Wrap the sampled frame in a loader; the "text" column supplies each document's content.
loader = DataFrameLoader(data_frame=df, page_content_column="text")

In [None]:
# Turn the DataFrame rows into documents.
docs = loader.load()

# Cut the documents into chunks (chunk_size=1000, chunk_overlap=200) so that
# long reviews are embedded piece by piece.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
splits = text_splitter.split_documents(docs)

In [None]:
# Embedding model used to vectorise the review chunks.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Index the chunks in a Chroma vector store and expose it as a
# similarity-search retriever.
vectorstore = Chroma.from_documents(
    embedding=embeddings,
    documents=splits,
)
retriever = vectorstore.as_retriever(search_type="similarity")

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Hugging Face text-generation pipeline using the distilgpt2 model.
llm_pipeline = pipeline(task="text-generation", model="distilgpt2")



In [None]:
# Free-text query describing what the user is looking for.
prompt = "Great Italian restaurant with excellent pasta"

# `get_relevant_documents` is deprecated (it triggers the warning captured
# below this cell); `invoke` is the supported retriever entry point and
# returns the matching documents for the query.
retrieved_docs = retriever.invoke(prompt)


  retrieved_docs = retriever.get_relevant_documents(prompt)


In [None]:
# Print up to the first five retrieved reviews, numbered from 1.
print("Retrieved reviews:")
for rank, review in enumerate(retrieved_docs[:5], start=1):
    print(f"Review {rank}:\n{review.page_content}\n")

Retrieved reviews:
Review 1:
Sauce by Chef Boyardee. And that is NOT a compliment. The evening started okay. We did not have reservations, but we were early enough and they found a spot for us. The service was impeccable which is why this restaurant has been given 2 stars instead of none. \n\nWe started with champagne. We did not want a full bottle as we were headed to the theater.   The house brand was on the sweet side but okay. Then they brought out the stale bread with the good white beans and olive mixture. The bean stuff was okay but it didn't make up for the lackluster dry bread. I expect so much more from a good Italian. Then we ordered the cheese plate and a sausage plate.   We only raved about one cheese. All of the hard salami choices were good, but it is hard to mess-up that. \n\nThen came the worst main courses ever!  We ordered the pasta because we had heard such great reviews. I had the gnocchi and my friend had the ravioli. The pasta itself was good, but the sauce ruine

In [None]:
# Plain-text context for the generator: the body of every retrieved review,
# separated by blank lines.
# Interpolating `retrieved_docs` directly would embed the Python repr of the
# list ("[Document(metadata=..., page_content=...") with escaped newlines,
# which wastes prompt space and feeds the model object syntax instead of prose.
reviews_text = "\n\n".join(doc.page_content for doc in retrieved_docs)

response_prompt = (
    "Based on the following reviews, generate a concise recommendation for a great Italian restaurant with excellent pasta. "
    "The recommendation should mention one restaurant name, the highlights, and why it stands out:\n\n"
    f"{reviews_text}\n\n"  # Include retrieved reviews for context
    "Please generate a clear and friendly recommendation."
)

In [None]:
# Generate one continuation of at most 50 new tokens.
generated_response = llm_pipeline(
    response_prompt,
    max_new_tokens=50,
    num_return_sequences=1,
    # Return only the newly generated text; by default the pipeline echoes the
    # whole prompt, so the printed "recommendation" was mostly the prompt itself.
    return_full_text=False,
    # Set the pad token explicitly (to the EOS token, which is what the
    # library falls back to anyway) to avoid the per-call pad_token_id notice.
    pad_token_id=llm_pipeline.tokenizer.eos_token_id,
)
print("Generated recommendation:")
print(generated_response[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated recommendation:
Based on the following reviews, generate a concise recommendation for a great Italian restaurant with excellent pasta. The recommendation should mention one restaurant name, the highlights, and why it stands out:

[Document(metadata={'label': 1}, page_content="Sauce by Chef Boyardee. And that is NOT a compliment. The evening started okay. We did not have reservations, but we were early enough and they found a spot for us. The service was impeccable which is why this restaurant has been given 2 stars instead of none. \\n\\nWe started with champagne. We did not want a full bottle as we were headed to the theater.   The house brand was on the sweet side but okay. Then they brought out the stale bread with the good white beans and olive mixture. The bean stuff was okay but it didn't make up for the lackluster dry bread. I expect so much more from a good Italian. Then we ordered the cheese plate and a sausage plate.   We only raved about one cheese. All of the hard

# Generation

Look into ChatPromptTemplate to make a custom prompt: https://api.python.langchain.com/en/latest/prompts/langchain_core.prompts.chat.ChatPromptTemplate.html
