In [None]:
!pip install -q google-genai pandas pydantic scikit-learn

In [None]:
import os
import json
import pandas as pd
from pydantic import BaseModel, Field, ValidationError
from typing import Literal

from google import genai
from google.genai import types
import google.api_core.exceptions # Import for API exceptions
from sklearn.metrics import accuracy_score
import time
from google.colab import userdata

In [None]:
# Notebook-wide settings used by the cells below.
CSV_FILE = '/content/yelp.csv'   # path the review CSV is read from
SAMPLE_SIZE = 200                # number of rows drawn from the CSV for evaluation
MODEL = 'gemini-2.5-flash'       # Gemini model identifier (not referenced in this cell)
# ---

# Copy the Gemini API key from the Colab Secrets store into the process
# environment. Any failure is reported but not re-raised, so the cell keeps
# running.
try:
    os.environ['GEMINI_API_KEY'] = userdata.get('GEMINI_API_KEY')
    print("API Key loaded successfully.")
except Exception as secret_error:
    print(f"Error loading API key from secrets: {secret_error}")
    print("ACTION REQUIRED: Check Colab Secrets panel.")

# Build the Gemini client with default arguments (presumably it picks the key
# up from the environment variable set above — confirm against the SDK docs).
# On failure the error is printed and `client` is left unbound, so later cells
# that use `client` will fail.
try:
    client = genai.Client()
    print("Gemini client initialized successfully.")
except Exception as client_error:
    print(f"Error initializing client: {client_error}")

API Key loaded successfully.
Gemini client initialized successfully.


In [None]:
class RatingPrediction(BaseModel):
    # Schema for one model prediction: a star label plus a short justification.
    # The evaluation loops pass this class to get_prediction(...) as the
    # response schema argument.

    # Star rating restricted to the five string labels "1".."5". The values are
    # strings (not ints) for schema compatibility; downstream analysis converts
    # them with .astype(int).
    predicted_stars: Literal["1", "2", "3", "4", "5"] = Field( # required field (`...` = no default)
        ...,
        description="The predicted star rating for the review, must be an integer from 1 to 5, represented as a string."
    )
    # Free-text justification for the rating; required, any string is accepted.
    explanation: str = Field(
        ...,
        description="A brief, one-sentence reasoning that justifies the assigned rating."
    )

In [None]:
# Load the review CSV, keep only the columns needed for evaluation, and draw a
# reproducible random sample into `df_sample`.
# Errors are reported with an actionable message rather than re-raised; in that
# case `df_sample` is not (re)defined and later cells cannot run.
try:
    # Load the dataset
    df = pd.read_csv(CSV_FILE)

    # Select relevant columns: the review text (input) and actual stars (ground truth)
    df = df[['text', 'stars']]

    # DataFrame.sample raises if asked for more rows than exist (sampling is
    # without replacement), so clamp the request to the available row count
    # instead of failing on a small CSV.
    sample_size = min(SAMPLE_SIZE, len(df))
    if sample_size < SAMPLE_SIZE:
        print(f"Warning: only {len(df)} rows available; sampling {sample_size} instead of {SAMPLE_SIZE}.")

    # Sample the data down to the requested size.
    # A fixed random_state (42) ensures the sample is reproducible.
    df_sample = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

    print(f"Dataset loaded and sampled to {len(df_sample)} rows. Data preparation complete.")

except FileNotFoundError:
    print(f"Error: The file '{CSV_FILE}' was not found.")
    print("ACTION REQUIRED: Ensure you have uploaded 'yelp.csv' to your Colab session.")
except Exception as e:
    print(f"An error occurred during data loading: {e}")



Dataset loaded and sampled to 200 rows. Data preparation complete.


In [None]:
# --------------------------------------------------------------------------
# 1. DEFINE PROMPTS
# --------------------------------------------------------------------------
# Each template below is prepended to the review text (joined with a blank
# line) to build the full prompt sent for one review. The template text is
# runtime input to the model: editing any character changes what is evaluated.

# --- Prompt 1: Zero-Shot (ZS) Baseline ---
# Task description only, no examples.
# NOTE(review): the text asks for an *integer* 1-5, while RatingPrediction
# declares predicted_stars as the string literals "1".."5" — confirm that the
# wording is intentional.
SYSTEM_INSTRUCTION_1 = """
You are an expert sentiment analysis model. Your task is to analyze a single Yelp review and accurately predict the star rating (1-5) the user originally assigned.

- Analyze the sentiment, tone, and specific keywords in the review.
- Strictly adhere to the provided JSON schema for your response.
- The 'predicted_stars' field MUST be an integer between 1 and 5.

Review to analyze:
"""

# --- Prompt 2: Few-Shot (FS) Improvement ---
# Same task, plus three worked examples covering 1-, 3- and 5-star reviews.
SYSTEM_INSTRUCTION_2 = """
You are an expert sentiment analysis model specializing in classifying Yelp reviews. Your task is to accurately predict the star rating (1-5) for a given review. Use the provided examples to understand the rating standards.

- Analyze the sentiment, tone, and specific keywords.
- Strictly adhere to the provided JSON schema for your response.

--- Examples (Review Text | Rating | Explanation) ---
1. Review: "Absolutely terrible. The manager was rude and my food was cold. Never again."
   Rating: 1 Star. Explanation: Extreme negative sentiment, poor service and quality.
2. Review: "The server was great, but the coffee tasted burnt. Everything else was fine."
   Rating: 3 Stars. Explanation: Mixed sentiment; positive service balances negative product quality.
3. Review: "Best burger ever! Fast service and great atmosphere. A truly perfect experience."
   Rating: 5 Stars. Explanation: Overwhelming positive sentiment across all major categories (food, service, atmosphere).
------------------------------------------------------

Review to analyze:
"""

# --- Prompt 3: Chain-of-Thought (CoT) ---
# Asks the model to reason in explicit steps but to keep that reasoning out of
# the JSON answer.
# NOTE(review): whether the model can actually emit reasoning outside the
# schema-constrained response depends on how get_prediction configures the
# call — confirm.
SYSTEM_INSTRUCTION_3 = """
You are a highly analytical review classifier. Your process requires a two-step analysis before outputting the final rating.

STEP 1: REASONING (DO NOT include this in the final JSON output):
1. Identify the core subject of the review (e.g., food, service, price).
2. Determine the overall sentiment (Negative, Mixed, Positive) and its intensity.
3. Determine the final star rating (1-5) based on the analysis.

STEP 2: OUTPUT (MUST adhere to the JSON schema):
Provide the final prediction and explanation based on your reasoning.

Review to analyze:
"""

# --------------------------------------------------------------------------
# 2. EXECUTION LOOPS (All 3 Approaches)
# --------------------------------------------------------------------------

# Pacing between API calls. A run of this cell with a 0.1 s pause was rejected
# with "429 RESOURCE_EXHAUSTED" against a free-tier limit of 10 requests per
# minute for this model, so requests are spaced to stay at or under that rate.
# Raise REQUESTS_PER_MINUTE if your quota allows more.
# With 200 sampled reviews and 3 approaches this is roughly one hour of wall time.
# NOTE(review): per-day quotas may also apply — check the rate-limit docs.
REQUESTS_PER_MINUTE = 10
REQUEST_INTERVAL_SECONDS = 60.0 / REQUESTS_PER_MINUTE

# (display label, result-column suffix, prompt template) in run order.
PROMPT_APPROACHES = [
    ('Zero-Shot (ZS)', 'ZS', SYSTEM_INSTRUCTION_1),
    ('Few-Shot (FS)', 'FS', SYSTEM_INSTRUCTION_2),
    ('Chain-of-Thought (CoT)', 'CoT', SYSTEM_INSTRUCTION_3),
]

# Initialize every result column up front so the analysis section finds all of
# them even if a run is interrupted part-way through.
for label, suffix, system_instruction in PROMPT_APPROACHES:
    df_sample[f'predicted_stars_{suffix}'] = None
    df_sample[f'json_valid_{suffix}'] = False

for position, (label, suffix, system_instruction) in enumerate(PROMPT_APPROACHES):
    stars_col = f'predicted_stars_{suffix}'
    validity_col = f'json_valid_{suffix}'

    # Every header after the first is preceded by a blank line.
    header_prefix = "" if position == 0 else "\n"
    print(f"{header_prefix}--- Running Evaluation for {label} ---")

    for index, row in df_sample.iterrows():
        # Prompt = template + blank line + review text.
        full_prompt = system_instruction + f"\n\n{row['text']}"
        pred_star, explanation, is_valid, raw_resp = get_prediction(full_prompt, RatingPrediction, client)
        df_sample.loc[index, stars_col] = pred_star
        df_sample.loc[index, validity_col] = is_valid
        # Throttle to respect the per-minute request quota.
        time.sleep(REQUEST_INTERVAL_SECONDS)

    completion_message = f"Evaluation for {suffix} complete."
    if position == len(PROMPT_APPROACHES) - 1:
        completion_message += " All three approaches have been tested."
    print(completion_message)

# --------------------------------------------------------------------------
# 3. FINAL ANALYSIS AND COMPARISON TABLE
# --------------------------------------------------------------------------
# For each approach: share of rows flagged as valid JSON, and accuracy of the
# predicted stars against the true stars computed on those valid rows only.

# (display label, prediction column, validity-flag column) per approach.
evaluation_data = [
    ('Zero-Shot (ZS)', 'predicted_stars_ZS', 'json_valid_ZS'),
    ('Few-Shot (FS)', 'predicted_stars_FS', 'json_valid_FS'),
    ('CoT (Structured)', 'predicted_stars_CoT', 'json_valid_CoT'),
]
results = {}

for approach, predicted_col, valid_col in evaluation_data:
    # Keep only the rows whose validity flag equals True.
    is_valid_row = df_sample[valid_col] == True
    valid_df = df_sample[is_valid_row].copy()

    total_samples = len(df_sample)
    num_valid = len(valid_df)

    # Percentage of rows with valid JSON; 0 for an empty sample.
    json_validity_rate = 0
    if total_samples > 0:
        json_validity_rate = (num_valid / total_samples) * 100

    # Accuracy in percent over valid rows; stays 0.0 when there are none.
    accuracy = 0.0
    if num_valid > 0:
        y_pred_valid = valid_df[predicted_col].astype(int).tolist()
        y_true_valid = valid_df['stars'].astype(int).tolist()
        accuracy = accuracy_score(y_true_valid, y_pred_valid) * 100

    summary_row = {}
    summary_row["Accuracy (on valid JSON)"] = f"{accuracy:.2f}%"
    summary_row["JSON Validity Rate"] = f"{json_validity_rate:.2f}%"
    summary_row["Total Valid Predictions"] = num_valid
    results[approach] = summary_row

# One row per approach, indexed by the approach label.
comparison_df = pd.DataFrame.from_dict(results, orient='index')
comparison_df.index.name = 'Prompting Approach'

# Print the table framed by a banner, one item per line.
for report_line in (
    "\n\n####################################################################",
    "                          FINAL COMPARISON TABLE                      ",
    "####################################################################",
    comparison_df,
    "####################################################################",
):
    print(report_line)

--- Running Evaluation for Zero-Shot (ZS) ---
API Error during call: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 10, model: gemini-2.5-flash\nPlease retry in 45.642488396s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'q