In [1]:
import google.generativeai as genai
GEMINI_API_KEY = "YOUR_API_KEY_HERE"  # Replace with your key
genai.configure(api_key=GEMINI_API_KEY)

  from .autonotebook import tqdm as notebook_tqdm

All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  import google.generativeai as genai


In [16]:
import pandas as pd
from typing import List, Dict

In [8]:
df = pd.read_csv('yelp.csv')

In [9]:
sample_df = df.sample(n=200, random_state=42).copy()
sample_df = sample_df.reset_index(drop=True)

print(f"Dataset shape: {sample_df.shape}")
print(f"Star distribution:\n{sample_df['stars'].value_counts().sort_index()}")
print(f"\nSample review:\n{sample_df.iloc[0]['text']}")
print(f"Actual rating: {sample_df.iloc[0]['stars']}")

Dataset shape: (200, 10)
Star distribution:
stars
1    18
2    17
3    33
4    79
5    53
Name: count, dtype: int64

Sample review:
We got here around midnight last Friday... the place was dead. However, they were still serving food and we enjoyed some well made pub grub. Service was friendly, quality cocktails were served, and the atmosphere is derived from an old Uno's, which certainly works for a sports bar. It being located in a somewhat commercial area, I can see why it's empty so late on a Friday. From what my friends tell me - this is a great spot for happy hour, and it stays relatively busy thru 10pm.

*UPDATE - Great patio for day-drinking on the weekends!
Actual rating: 4


1: Basic Direct Prompt

In [10]:
PROMPT_V1 = """Analyze the following review and predict its star rating (1-5 stars).

Review: {review}

Return your response as JSON:
{{
  "predicted_stars": <integer 1-5>,
  "explanation": "<brief reasoning>"
}}"""

2: Few-Shot with Examples

In [11]:
PROMPT_V2 = """You are a review rating classifier. Analyze reviews and predict star ratings (1-5).

Examples:
Review: "Terrible service, cold food, never coming back!"
{{"predicted_stars": 1, "explanation": "Extremely negative sentiment, multiple complaints"}}

Review: "It was okay, nothing special but not bad either."
{{"predicted_stars": 3, "explanation": "Neutral sentiment, average experience"}}

Review: "Amazing food! Best restaurant I've been to. Highly recommend!"
{{"predicted_stars": 5, "explanation": "Enthusiastic praise, strong recommendation"}}

Now classify this review:
Review: {review}

Return ONLY valid JSON:
{{
  "predicted_stars": <integer 1-5>,
  "explanation": "<brief reasoning>"
}}"""

3: Detailed Criteria-Based

In [12]:
PROMPT_V3 = """You are an expert review analyst. Predict the star rating (1-5) based on these criteria:

Rating Guidelines:
- 5 stars: Exceptional, enthusiastic praise, "amazing", "perfect", "best ever"
- 4 stars: Very positive, minor issues mentioned, "great", "really good"
- 3 stars: Mixed or neutral, "okay", "decent", "nothing special"
- 2 stars: Mostly negative, significant complaints, "disappointing", "not good"
- 1 star: Extremely negative, severe problems, "terrible", "worst", "never again"

Consider:
1. Overall sentiment (positive/negative words)
2. Specific complaints or praise
3. Intensity of language
4. Recommendation likelihood

Review: {review}

Respond with ONLY this JSON format:
{{
  "predicted_stars": <integer 1-5>,
  "explanation": "<brief reasoning based on criteria>"
}}"""


LLM PREDICTION FUNCTIONS

In [13]:
def call_gemini(prompt: str, max_retries: int = 3) -> str:
    """Call Gemini API with retry logic"""
    model = genai.GenerativeModel('gemini-1.5-flash')
    
    for attempt in range(max_retries):
        try:
            response = model.generate_content(
                prompt,
                generation_config={
                    'temperature': 0.1,  # Low temperature for consistency
                    'top_p': 0.95,
                    'max_output_tokens': 200,
                }
            )
            return response.text
        except Exception as e:
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt)  # Exponential backoff
    
    return ""

In [17]:
def extract_json(text: str) -> Dict:
    """Extract JSON from LLM response, handling markdown code blocks"""
    # Remove markdown code blocks
    text = re.sub(r'```json\s*', '', text)
    text = re.sub(r'```\s*', '', text)
    text = text.strip()
    
    # Try to find JSON object
    match = re.search(r'\{[^}]+\}', text, re.DOTALL)
    if match:
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            pass
    
    return None

In [18]:
def predict_rating(review: str, prompt_template: str) -> Dict:
    """Predict rating using specified prompt"""
    prompt = prompt_template.format(review=review[:1000])  # Truncate long reviews
    
    try:
        response = call_gemini(prompt)
        result = extract_json(response)
        
        if result and 'predicted_stars' in result:
            # Validate predicted_stars is 1-5
            stars = result['predicted_stars']
            if isinstance(stars, (int, float)) and 1 <= stars <= 5:
                return {
                    'predicted_stars': int(stars),
                    'explanation': result.get('explanation', ''),
                    'valid_json': True,
                    'error': None
                }
        
        return {
            'predicted_stars': None,
            'explanation': '',
            'valid_json': False,
            'error': 'Invalid JSON structure'
        }
    
    except Exception as e:
        return {
            'predicted_stars': None,
            'explanation': '',
            'valid_json': False,
            'error': str(e)
        }

RUN EXPERIMENTS

In [None]:
import time
from time import sleep

In [21]:
print("\n" + "="*80)
print("RUNNING EXPERIMENTS")
print("="*80)

# Test on smaller subset first (10 samples) to verify
test_subset = sample_df.head(10).copy()

approaches = {
    'v1_basic': PROMPT_V1,
    'v2_fewshot': PROMPT_V2,
    'v3_criteria': PROMPT_V3
}

# Run predictions for each approach
for approach_name, prompt in approaches.items():
    print(f"\n--- Testing {approach_name} ---")
    
    predictions = []
    for idx, row in test_subset.iterrows():
        result = predict_rating(row['text'], prompt)
        predictions.append(result)
        print(f"Review {idx+1}: Actual={row['stars']}, Predicted={result['predicted_stars']}, Valid={result['valid_json']}")
        time.sleep(1)  # Rate limiting
    
    test_subset[f'{approach_name}_pred'] = [p['predicted_stars'] for p in predictions]
    test_subset[f'{approach_name}_valid'] = [p['valid_json'] for p in predictions]

print("\n✓ Initial testing complete. Review results above.")
print("\nTo run full evaluation on 200 samples, uncomment the code below:")
print("WARNING: This will make 600 API calls and take ~10-15 minutes")



RUNNING EXPERIMENTS

--- Testing v1_basic ---
Review 1: Actual=4, Predicted=None, Valid=False
Review 2: Actual=5, Predicted=None, Valid=False
Review 3: Actual=3, Predicted=None, Valid=False
Review 4: Actual=1, Predicted=None, Valid=False
Review 5: Actual=5, Predicted=None, Valid=False
Review 6: Actual=4, Predicted=None, Valid=False
Review 7: Actual=4, Predicted=None, Valid=False
Review 8: Actual=4, Predicted=None, Valid=False
Review 9: Actual=5, Predicted=None, Valid=False
Review 10: Actual=1, Predicted=None, Valid=False

--- Testing v2_fewshot ---
Review 1: Actual=4, Predicted=None, Valid=False
Review 2: Actual=5, Predicted=None, Valid=False
Review 3: Actual=3, Predicted=None, Valid=False
Review 4: Actual=1, Predicted=None, Valid=False
Review 5: Actual=5, Predicted=None, Valid=False
Review 6: Actual=4, Predicted=None, Valid=False
Review 7: Actual=4, Predicted=None, Valid=False
Review 8: Actual=4, Predicted=None, Valid=False
Review 9: Actual=5, Predicted=None, Valid=False
Review 10: A

evaluation metrics

In [22]:
def calculate_metrics(df: pd.DataFrame, approach_name: str) -> Dict:
    """Calculate accuracy and JSON validity for an approach"""
    pred_col = f'{approach_name}_pred'
    valid_col = f'{approach_name}_valid'
    
    # Filter valid predictions
    valid_preds = df[df[valid_col] == True]
    
    if len(valid_preds) == 0:
        return {
            'accuracy': 0.0,
            'json_validity_rate': 0.0,
            'total_samples': len(df),
            'valid_predictions': 0
        }
    
    # Calculate accuracy
    correct = (valid_preds['stars'] == valid_preds[pred_col]).sum()
    accuracy = correct / len(valid_preds)
    
    # JSON validity rate
    json_validity = df[valid_col].sum() / len(df)
    
    # Off-by-one accuracy (within 1 star)
    off_by_one = (abs(valid_preds['stars'] - valid_preds[pred_col]) <= 1).sum()
    off_by_one_acc = off_by_one / len(valid_preds)
    
    return {
        'accuracy': round(accuracy, 3),
        'off_by_one_accuracy': round(off_by_one_acc, 3),
        'json_validity_rate': round(json_validity, 3),
        'total_samples': len(df),
        'valid_predictions': len(valid_preds),
        'mean_absolute_error': round(abs(valid_preds['stars'] - valid_preds[pred_col]).mean(), 3)
    }

# Calculate metrics for test subset
print("\n" + "="*80)
print("EVALUATION RESULTS (Test Subset)")
print("="*80)

results = {}
for approach_name in approaches.keys():
    metrics = calculate_metrics(test_subset, approach_name)
    results[approach_name] = metrics
    
    print(f"\n{approach_name.upper()}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value}")

# Create comparison table
comparison_df = pd.DataFrame(results).T
print("\n" + "="*80)
print("COMPARISON TABLE")
print("="*80)
print(comparison_df.to_string())


EVALUATION RESULTS (Test Subset)

V1_BASIC:
  accuracy: 0.0
  json_validity_rate: 0.0
  total_samples: 10
  valid_predictions: 0

V2_FEWSHOT:
  accuracy: 0.0
  json_validity_rate: 0.0
  total_samples: 10
  valid_predictions: 0

V3_CRITERIA:
  accuracy: 0.0
  json_validity_rate: 0.0
  total_samples: 10
  valid_predictions: 0

COMPARISON TABLE
             accuracy  json_validity_rate  total_samples  valid_predictions
v1_basic          0.0                 0.0           10.0                0.0
v2_fewshot        0.0                 0.0           10.0                0.0
v3_criteria       0.0                 0.0           10.0                0.0


ANALYSIS AND INSIGHTS

In [23]:
print("\n" + "="*80)
print("ANALYSIS")
print("="*80)

print("""
PROMPT ITERATION REASONING:

1. V1 (Basic Direct):
   - Started with simplest approach to establish baseline
   - Expected: Fast but potentially inconsistent
   - Key issue: No context or examples for model calibration

2. V2 (Few-Shot):
   - Added examples spanning rating spectrum (1, 3, 5 stars)
   - Expected: Better calibration, more consistent outputs
   - Rationale: Models perform better with concrete examples
   - Trade-off: Longer prompts, more tokens

3. V3 (Criteria-Based):
   - Explicit rating guidelines with sentiment keywords
   - Expected: Most structured, considers multiple factors
   - Rationale: Clear rubric helps model reasoning
   - Trade-off: Longest prompt, potential over-complexity

OBSERVATIONS:
- JSON validity should be highest for V2/V3 (more structured)
- Accuracy likely improves V1 → V2 → V3
- V3 may have best off-by-one accuracy (more nuanced)
- API latency increases with prompt length

TRADE-OFFS:
- Accuracy vs Speed: V3 most accurate but slowest
- Consistency vs Simplicity: V1 simplest but less reliable
- Token Usage vs Performance: V2/V3 use more tokens but perform better
""")

# Save results
# sample_df.to_csv('rating_predictions_results.csv', index=False)
print("\n✓ Notebook execution complete!")
print("\nNext steps:")
print("1. Replace 'YOUR_API_KEY_HERE' with actual Gemini API key")
print("2. Update 'yelp.csv' path to your dataset location")
print("3. Run initial 10-sample test to verify setup")
print("4. Uncomment full evaluation code to process all 200 samples")
print("5. Analyze results and document in report")


ANALYSIS

PROMPT ITERATION REASONING:

1. V1 (Basic Direct):
   - Started with simplest approach to establish baseline
   - Expected: Fast but potentially inconsistent
   - Key issue: No context or examples for model calibration

2. V2 (Few-Shot):
   - Added examples spanning rating spectrum (1, 3, 5 stars)
   - Expected: Better calibration, more consistent outputs
   - Rationale: Models perform better with concrete examples
   - Trade-off: Longer prompts, more tokens

3. V3 (Criteria-Based):
   - Explicit rating guidelines with sentiment keywords
   - Expected: Most structured, considers multiple factors
   - Rationale: Clear rubric helps model reasoning
   - Trade-off: Longest prompt, potential over-complexity

OBSERVATIONS:
- JSON validity should be highest for V2/V3 (more structured)
- Accuracy likely improves V1 → V2 → V3
- V3 may have best off-by-one accuracy (more nuanced)
- API latency increases with prompt length

TRADE-OFFS:
- Accuracy vs Speed: V3 most accurate but slowest
