In [1]:
import pandas as pd

# Load the Yelp reviews CSV; the path is relative to this notebook's directory.
df = pd.read_csv("../data/yelp.csv")
# Preview the first rows of the loaded frame.
df.head()


Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [2]:
# List the column labels of the loaded frame.
df.columns


Index(['business_id', 'date', 'review_id', 'stars', 'text', 'type', 'user_id',
       'cool', 'useful', 'funny'],
      dtype='object')

In [3]:
# Draw a reproducible 200-row sample of the review text and its star label,
# then renumber the sampled rows from zero.
df_sample = (
    df[['text', 'stars']]
    .sample(n=200, random_state=42)
    .reset_index(drop=True)
)
df_sample.head()


Unnamed: 0,text,stars
0,We got here around midnight last Friday... the...,4
1,Brought a friend from Louisiana here. She say...,5
2,"Every friday, my dad and I eat here. We order ...",3
3,"My husband and I were really, really disappoin...",1
4,Love this place! Was in phoenix 3 weeks for w...,5


In [4]:
import json
from jsonschema import validate, ValidationError

# JSON Schema for the object the model is asked to return.
_stars_field = {"type": "integer", "minimum": 1, "maximum": 5}  # whole number from 1 through 5
_explanation_field = {"type": "string"}

rating_schema = {
    "type": "object",
    "properties": {
        "predicted_stars": _stars_field,
        "explanation": _explanation_field,
    },
    # Both fields must be present for a reply to validate.
    "required": ["predicted_stars", "explanation"],
}

def validate_json_response(response_str):
    """Parse a JSON string and check it against ``rating_schema``.

    Args:
        response_str: Text expected to contain a JSON document.

    Returns:
        ``(True, parsed_object)`` when the text parses and passes validation,
        otherwise ``(False, message)`` where ``message`` is the text of the
        exception that was raised.
    """
    try:
        candidate = json.loads(response_str)
        validate(instance=candidate, schema=rating_schema)
    except Exception as problem:
        # Every failure (bad JSON, schema violation, wrong input type) is
        # reported through the return value rather than raised.
        return False, str(problem)
    return True, candidate


In [5]:
# Zero-shot prompt template. The literal marker {review_text} is filled in
# later with str.replace, so this is a plain string rather than an f-string.
prompt_zero_shot = """
You are a sentiment and rating classification model.

Given a Yelp review, identify the most likely star rating from 1 to 5.

Return ONLY a JSON object in this format:
{
  "predicted_stars": <integer between 1 and 5>,
  "explanation": "Short explanation here"
}

Review:
<<<{review_text}>>>

Respond ONLY with valid JSON.
"""




In [6]:
import google.generativeai as genai
# Reached only if the import above succeeded.
print("Gemini SDK imported!")



Gemini SDK imported!


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import google.generativeai as genai
from dotenv import load_dotenv
import os

# Read variables from a local .env file into the environment, so the key is
# kept out of the notebook itself (the .env file is not uploaded to GitHub).
load_dotenv()

# os.getenv returns None when GEMINI_API_KEY is unset; the value is passed to
# the SDK without being checked here.
api_key = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=api_key)

def call_gemini(prompt):
    """Send ``prompt`` to the ``models/gemini-2.5-flash`` model and return the reply text.

    No error handling is done here: whatever the SDK raises reaches the caller.

    Note: a later cell in this notebook defines ``call_gemini`` again; once that
    cell has run, its definition replaces this one.
    """
    return genai.GenerativeModel("models/gemini-2.5-flash").generate_content(prompt).text



In [None]:
# One-off connectivity check: send a trivial prompt and print the reply.
# Note this uses "models/gemini-flash-latest", not the "models/gemini-2.5-flash"
# model named in the call_gemini definition in the previous cell.
model = genai.GenerativeModel("models/gemini-flash-latest")
response = model.generate_content("Hello!")
print(response.text)

In [None]:
# Try the zero-shot prompt on the first sampled review.
sample_review = df_sample.loc[0, "text"]
prompt = prompt_zero_shot.replace("{review_text}", sample_review)
response = call_gemini(prompt)

# Show the untouched reply between two marker lines.
print(
    "RAW OUTPUT START -------------------",
    response,
    "RAW OUTPUT END ---------------------",
    sep="\n",
)

In [11]:
def clean_llm_output(text):
    """Strip Markdown code fences from a model reply so the JSON can be parsed.

    Handling, in order:
      * ``None`` or empty input yields ``""``.
      * Text that already starts with ``{`` or ``[`` is treated as bare JSON;
        only stray fence markers are removed.
      * Otherwise, if the text contains a complete fenced block, the content of
        the first block is returned. The language tag may be anything
        alphabetic and in any case (``json``, ``JSON``, none at all), and
        prose before or after the block is dropped.
      * Otherwise (for example an opening fence with no closing fence) the
        opening fence and its tag are removed, along with any other fence
        markers.

    Args:
        text: Raw reply text from the model, or ``None``.

    Returns:
        The cleaned text with surrounding whitespace removed.
    """
    import re  # local import: this cell does not import re itself

    if not text:
        return ""

    cleaned = text.strip()

    if not cleaned.startswith(("{", "[")):
        fenced = re.search(r"```[A-Za-z]*\s*(.*?)\s*```", cleaned, flags=re.DOTALL)
        if fenced:
            return fenced.group(1).strip()
        # Unterminated fence: drop the opening marker together with its tag,
        # so a tag such as "JSON" is not left in front of the payload.
        cleaned = re.sub(r"```[A-Za-z]*", "", cleaned, count=1)

    # Remove any fence markers that remain.
    return cleaned.replace("```json", "").replace("```", "").strip()


In [None]:


# Clean the reply currently held in `response`, then parse and schema-check it.
cleaned = clean_llm_output(response)
is_valid, parsed = validate_json_response(cleaned)

print(f"Cleaned Output:\n {cleaned}")
print(f"Valid JSON: {is_valid}")
print(f"Parsed Output: {parsed}")


In [40]:
# Few-shot prompt template with three labeled examples. The literal marker
# {review_text} is filled in later with str.replace, so this is a plain string
# rather than an f-string.
prompt_few_shot = """
You are a sentiment and rating classifier for Yelp reviews.

Below are example labeled reviews:

Example 1:
Review: "The food was cold and the service was rude. I won't come back."
Label: 1

Example 2:
Review: "It was okay. Not great, not terrible."
Label: 3

Example 3:
Review: "Absolutely amazing! The staff was friendly and the food was perfect."
Label: 5

Now classify this new review. Return ONLY valid JSON:
{
  "predicted_stars": <integer>,
  "explanation": "Short explanation"
}

Review:
<<<{review_text}>>>

Respond ONLY in JSON.
"""


In [41]:
# Try the few-shot prompt on the second sampled review.
sample_review = df_sample.loc[1, "text"]
prompt = prompt_few_shot.replace("{review_text}", sample_review)
response = call_gemini(prompt)

# Show the untouched reply between two marker lines.
print(
    "RAW OUTPUT ----------------",
    response,
    "---------------------------",
    sep="\n",
)


RAW OUTPUT ----------------
{
  "predicted_stars": 5,
  "explanation": "The review is overwhelmingly positive, citing extremely high praise for the dish from someone with specialized knowledge of the cuisine."
}
---------------------------


In [42]:
# Clean the reply currently held in `response`, then parse and schema-check it.
cleaned = clean_llm_output(response)
is_valid, parsed = validate_json_response(cleaned)

print(f"Valid JSON: {is_valid}", f"Parsed Output: {parsed}", sep="\n")


Valid JSON: True
Parsed Output: {'predicted_stars': 5, 'explanation': 'The review is overwhelmingly positive, citing extremely high praise for the dish from someone with specialized knowledge of the cuisine.'}


In [43]:
# Step-by-step reasoning prompt template. The literal marker {review_text} is
# filled in later with str.replace, so this is a plain string rather than an
# f-string.
prompt_reasoning = """
You are an assistant that classifies Yelp reviews into a star rating (1 to 5).

Think step by step:
1. Identify whether the review sentiment is negative, neutral, or positive.
2. Identify important clues (food, service, experience).
3. Estimate how strong the sentiment is (weak, moderate, strong).
4. Choose the best rating from 1–5.

Then return ONLY a JSON object:
{
  "predicted_stars": <integer>,
  "explanation": "Short explanation summarizing your reasoning"
}

Do NOT include markdown or code blocks. Respond with JSON only.

Review:
<<<{review_text}>>>
"""


In [44]:
# Try the step-by-step prompt on the third sampled review.
sample_review = df_sample.loc[2, "text"]
prompt = prompt_reasoning.replace("{review_text}", sample_review)
response = call_gemini(prompt)

# Show the untouched reply between two marker lines.
print(
    "RAW OUTPUT ----------------",
    response,
    "---------------------------",
    sep="\n",
)


RAW OUTPUT ----------------
{"predicted_stars": 4, "explanation": "The review is strongly positive, emphasizing that the food is 'good and filling' and 'hits the spot.' The fact that the patrons return every week demonstrates strong customer loyalty and consistent satisfaction, classifying the experience as very good (4 stars), despite the quality being described as 'Typical strip mall pizza' (suggesting good reliability rather than gourmet excellence)."}
---------------------------


In [45]:
# Clean the reply currently held in `response`, then parse and schema-check it.
cleaned = clean_llm_output(response)
is_valid, parsed = validate_json_response(cleaned)

print(f"Valid JSON: {is_valid}", f"Parsed Output: {parsed}", sep="\n")


Valid JSON: True
Parsed Output: {'predicted_stars': 4, 'explanation': "The review is strongly positive, emphasizing that the food is 'good and filling' and 'hits the spot.' The fact that the patrons return every week demonstrates strong customer loyalty and consistent satisfaction, classifying the experience as very good (4 stars), despite the quality being described as 'Typical strip mall pizza' (suggesting good reliability rather than gourmet excellence)."}


In [46]:
def evaluate_model(df, prompt_template):
    """Query the model once per row of ``df`` and tabulate its predictions.

    Note: a later cell in this notebook defines ``evaluate_model`` again; once
    that cell has run, its definition replaces this one.

    Args:
        df: Frame with a ``text`` column (the review) and a ``stars`` column
            (the true rating).
        prompt_template: Prompt text containing the literal marker
            ``{review_text}``, which is replaced by each review.

    Returns:
        A DataFrame with columns ``review``, ``true_rating``,
        ``predicted_rating`` and ``explanation``. When a reply fails
        validation, ``predicted_rating`` is ``None`` and ``explanation`` is
        ``"Invalid JSON"``.
    """
    records = []
    for _, record in df.iterrows():
        review_text = record["text"]
        true_rating = record["stars"]

        # Fill the template, query the model, and strip fences from the reply.
        filled_prompt = prompt_template.replace("{review_text}", review_text)
        reply = clean_llm_output(call_gemini(filled_prompt))

        # Start from the failure outcome and overwrite it when the reply validates.
        outcome = {
            "review": review_text,
            "true_rating": true_rating,
            "predicted_rating": None,
            "explanation": "Invalid JSON",
        }
        ok, payload = validate_json_response(reply)
        if ok:
            outcome["predicted_rating"] = payload["predicted_stars"]
            outcome["explanation"] = payload["explanation"]

        records.append(outcome)

    return pd.DataFrame(records)


In [54]:
def evaluate_in_batches(df, prompt_template, batch_size=10, pause_seconds=3):
    """Evaluate ``df`` in consecutive slices, pausing between slices.

    Each slice is passed to ``evaluate_model`` and the per-slice results are
    concatenated into one frame with a fresh 0..n-1 index.

    Args:
        df: Frame of reviews to evaluate; forwarded slice by slice to
            ``evaluate_model``.
        prompt_template: Prompt template forwarded to ``evaluate_model``.
        batch_size: Number of rows per slice; must be a positive integer.
        pause_seconds: Seconds to wait between slices to stay under API rate
            limits. No wait happens after the final slice. Values <= 0 disable
            the pause.

    Returns:
        The concatenated results. For an empty ``df`` this is whatever
        ``evaluate_model`` returns for an empty frame.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    import time  # local import: this cell does not import time itself

    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    total = len(df)
    print(f"Evaluating {total} rows in batches of {batch_size}...")

    if total == 0:
        # pd.concat([]) raises on an empty list, so delegate directly: with no
        # rows this makes no model calls and yields the matching empty result.
        return evaluate_model(df, prompt_template)

    all_results = []
    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        print(f"\nProcessing rows {start} to {end-1} ...")

        all_results.append(evaluate_model(df.iloc[start:end], prompt_template))

        # Pause only when another batch follows.
        if end < total and pause_seconds > 0:
            time.sleep(pause_seconds)

    return pd.concat(all_results, ignore_index=True)


In [48]:
# Keep only the first 50 sampled reviews for a cheaper evaluation run,
# and confirm the resulting (rows, columns) shape.
df_small = df_sample.head(n=50)
df_small.shape


(50, 2)

In [52]:
import time

def evaluate_model(df, prompt_template, max_retries=5, retry_wait=30, request_delay=1.5):
    """Query the model once per row of ``df`` and tabulate its predictions.

    A reply is retried only when it fails validation AND its text looks like a
    rate-limit message (contains ``ResourceExhausted`` or ``quota``). A reply
    that validates is always accepted, even if its explanation happens to
    mention the word "quota". Retries are capped so a persistent rate limit
    cannot loop forever.

    Args:
        df: Frame with a ``text`` column (the review) and a ``stars`` column
            (the true rating).
        prompt_template: Prompt text containing the literal marker
            ``{review_text}``, which is replaced by each review.
        max_retries: Maximum number of extra attempts per row after a
            rate-limit-looking reply. Negative values are treated as 0.
        retry_wait: Seconds to wait before each retry.
        request_delay: Seconds to wait after each row to slow the request
            rate. Values <= 0 disable the delay.

    Returns:
        A DataFrame with columns ``review``, ``true_rating``,
        ``predicted_rating`` and ``explanation``. When no attempt produced a
        reply that validates, ``predicted_rating`` is ``None`` and
        ``explanation`` is ``"Invalid JSON"``.
    """
    attempts_allowed = max(0, max_retries) + 1
    results = []

    for i, row in enumerate(df.itertuples(), start=1):
        review_text = row.text
        true_rating = row.stars

        prompt = prompt_template.replace("{review_text}", review_text)

        is_valid, parsed = False, None
        for attempt in range(1, attempts_allowed + 1):
            response = call_gemini(prompt)
            cleaned = clean_llm_output(response)
            is_valid, parsed = validate_json_response(cleaned)
            if is_valid:
                break

            # Only an unusable reply is inspected for rate-limit wording, so
            # legitimate output is never mistaken for an error message.
            looks_rate_limited = "ResourceExhausted" in response or "quota" in response.lower()
            if not looks_rate_limited or attempt == attempts_allowed:
                break

            print(f"Rate limit hit at row {i}. Waiting {retry_wait} seconds...")
            time.sleep(retry_wait)

        pred = parsed["predicted_stars"] if is_valid else None
        explanation = parsed["explanation"] if is_valid else "Invalid JSON"

        results.append({
            "review": review_text,
            "true_rating": true_rating,
            "predicted_rating": pred,
            "explanation": explanation
        })

        # Slow down requests between rows.
        if request_delay > 0:
            time.sleep(request_delay)

    return pd.DataFrame(results)


In [56]:
def call_gemini(prompt, max_retries=5, wait_seconds=25, model_name="models/gemini-flash-latest"):
    """Send ``prompt`` to a Gemini model and return the reply text.

    Rate-limit failures are retried after a pause, up to ``max_retries`` extra
    attempts, so a permanently exhausted quota cannot loop forever. Any other
    failure, or a rate limit that outlasts the retries, is reported as a
    string beginning with ``"Error: "`` instead of being raised.

    Args:
        prompt: Prompt text to send.
        max_retries: Maximum number of extra attempts after a rate-limit
            failure. Negative values are treated as 0.
        wait_seconds: Seconds to wait before each retry.
        model_name: Name of the model to query.

    Returns:
        The reply text, or ``"Error: <message>"`` when the call failed.
    """
    import time  # local import: this cell does not import time itself

    model = genai.GenerativeModel(model_name)
    attempts_allowed = max(0, max_retries) + 1

    for attempt in range(1, attempts_allowed + 1):
        try:
            return model.generate_content(prompt).text
        except Exception as e:
            err = str(e)

            # Handle rate limits / Resource Exhausted. The exception class name
            # is checked as well as the message, since the message text may not
            # contain the class name.
            rate_limited = (
                type(e).__name__ == "ResourceExhausted"
                or "ResourceExhausted" in err
                or "quota" in err.lower()
            )
            if rate_limited and attempt < attempts_allowed:
                print(f"⚠️ Rate limit hit. Waiting {wait_seconds} seconds...")
                time.sleep(wait_seconds)
                continue  # retry

            # Other errors, or retries used up: report instead of raising.
            return f"Error: {err}"
