In [1]:
import pandas as pd
import google.generativeai as genai
import json
import time
import re
from sklearn.metrics import accuracy_score, classification_report

# --- CONFIGURATION ---
# Replace with your actual Gemini API Key from Google AI Studio
GOOGLE_API_KEY = "AIzaSyAE-Stt-y-c3up-GAg5YE0F2og2x-eYZks" 

genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel('gemini-flash-lite-latest')

print("Libraries imported and API configured.")

Libraries imported and API configured.


In [2]:
# 1. Load the dataset
try:
    # Attempt to read the CSV. 
    # Note: If your file is named something else, change 'yelp.csv' below.
    df = pd.read_csv('yelp.csv')
    print(f"Dataset loaded. Total rows: {len(df)}")
except FileNotFoundError:
    print("‚ùå ERROR: 'yelp.csv' not found. Please download the dataset and place it in this folder.")
    df = pd.DataFrame() # Create empty to prevent crash in next steps

# 2. Sample 200 rows (Requirement: ~200 rows)
if not df.empty:
    # We use .copy() to ensure we have a clean standalone dataframe
    df_sampled = df.sample(n=50, random_state=42).reset_index(drop=True)
    
    # 3. Verify column names (Yelp datasets usually have 'stars' and 'text')
    print("Columns found:", df_sampled.columns.tolist())
    
    # Show first 3 rows to check data
    print("\nSample Data:")
    print(df_sampled[['stars', 'text']].head(3))
else:
    print("Cannot proceed without data.")

Dataset loaded. Total rows: 10000
Columns found: ['business_id', 'date', 'review_id', 'stars', 'text', 'type', 'user_id', 'cool', 'useful', 'funny']

Sample Data:
   stars                                               text
0      4  We got here around midnight last Friday... the...
1      5  Brought a friend from Louisiana here.  She say...
2      3  Every friday, my dad and I eat here. We order ...


In [3]:
def extract_json(text):
    """
    Finds a JSON object inside a string using Regex.
    Solves the issue where LLM adds markdown like ```json ... ```
    """
    try:
        # Regex to find text between { and } across multiple lines
        match = re.search(r'\{.*\}', text, re.DOTALL)
        if match:
            json_str = match.group(0)
            return json.loads(json_str)
    except Exception as e:
        return None
    return None

def get_prediction(review_text, strategy):
    """
    Sends the review to Gemini using one of 3 strategies.
    Returns: JSON object with 'predicted_stars' and 'explanation'
    """
    
    base_instruction = """
    Analyze the sentiment of this Yelp review.
    Determine the rating (1 to 5 stars).
    Output strictly VALID JSON in this format:
    {"predicted_stars": <int>, "explanation": "<short string>"}
    """
    
    # --- STRATEGY 1: ZERO-SHOT ---
    if strategy == "zero_shot":
        prompt = f"{base_instruction}\n\nReview: {review_text}"

    # --- STRATEGY 2: FEW-SHOT (Providing examples) ---
    elif strategy == "few_shot":
        examples = """
        Examples:
        Review: "The service was slow and food was cold." -> {"predicted_stars": 1, "explanation": "Negative sentiment regarding service and food."}
        Review: "Absolutely delicious! Best pizza in town." -> {"predicted_stars": 5, "explanation": "Highly positive review."}
        """
        prompt = f"{base_instruction}\n{examples}\n\nReview: {review_text}"

    # --- STRATEGY 3: CHAIN-OF-THOUGHT (Step-by-step reasoning) ---
    elif strategy == "chain_of_thought":
        prompt = f"""
        You are an expert critic. Follow these steps:
        1. Identify the key adjectives in the text.
        2. Determine if the tone is angry, neutral, or happy.
        3. Assign a score from 1-5.
        
        {base_instruction}
        
        Review: {review_text}
        """
    
    # Call the API
    try:
        response = model.generate_content(prompt)
        result = extract_json(response.text)
        
        # Validation: Check if we got a star rating
        if result and 'predicted_stars' in result:
            return result
        else:
            return {"predicted_stars": 0, "explanation": "Error: JSON parsing failed"}
            
    except Exception as e:
        return {"predicted_stars": 0, "explanation": f"API Error: {str(e)}"}

print("Function defined successfully.")

Function defined successfully.


In [4]:
results = []
# Ensure df_sampled is actually defined (use the 50 rows version if possible)
print(f"Starting processing of {len(df_sampled)} rows...") 

if not df_sampled.empty:
    for index, row in df_sampled.iterrows():
        actual_stars = row['stars']
        review_text = row['text']
        
        # Test all 3 strategies for this single review
        for strategy in ["zero_shot", "few_shot", "chain_of_thought"]:
            
            # Get AI prediction
            ai_output = get_prediction(review_text, strategy)
            
            # Store result
            results.append({
                "review_id": index,
                "strategy": strategy,
                "actual_stars": actual_stars,
                "predicted_stars": ai_output['predicted_stars'],
                "explanation": ai_output['explanation'],
                "is_valid": 1 if (1 <= ai_output['predicted_stars'] <= 5) else 0
            })
            
            # === CRITICAL CHANGE ===
            # Sleep 4 seconds between EACH call. 
            # 3 calls per row * 4s = 12s per row.
            # 60s / 12s = 5 rows per minute (15 requests/min).
            # This is safe for the Free Tier.
            time.sleep(4) 
            
        # Progress indicator
        if (index + 1) % 5 == 0:
            print(f"Processed {index + 1} rows...")

    print("Processing complete!")
    
    # Convert list to DataFrame
    results_df = pd.DataFrame(results)
    
else:
    print("No data to process.")

Starting processing of 50 rows...
Processed 5 rows...
Processed 10 rows...
Processed 15 rows...
Processed 20 rows...
Processed 25 rows...
Processed 30 rows...
Processed 35 rows...
Processed 40 rows...
Processed 45 rows...
Processed 50 rows...
Processing complete!


In [5]:
if not results_df.empty:
    # 1. Calculate Accuracy and JSON Validity
    # We use .agg() to avoid the Pandas Future Warning
    metrics = results_df.groupby('strategy').agg(
        Accuracy=('predicted_stars', lambda x: accuracy_score(results_df.loc[x.index, 'actual_stars'], x)),
        JSON_Validity=('is_valid', 'mean') # Percentage of valid JSONs
    )
    
    print("=== FINAL RESULTS ===")
    print(metrics)
    
    # 2. Show a few mismatch examples (Good for the report)
    print("\n=== Mismatch Examples (Zero-Shot) ===")
    mismatches = results_df[
        (results_df['strategy'] == 'zero_shot') & 
        (results_df['actual_stars'] != results_df['predicted_stars'])
    ].head(3)
    
    for i, row in mismatches.iterrows():
        print(f"Actual: {row['actual_stars']} | Predicted: {row['predicted_stars']}")
        print(f"Reasoning: {row['explanation']}\n")

else:
    print("No results to show.")

=== FINAL RESULTS ===
                  Accuracy  JSON_Validity
strategy                                 
chain_of_thought      0.10           0.12
few_shot              0.12           0.16
zero_shot             0.10           0.14

=== Mismatch Examples (Zero-Shot) ===
Actual: 5 | Predicted: 0
Reasoning: API Error: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 10, model: gemini-2.5-flash-lite
Please retry in 34.343814345s. [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPe