In [None]:
import pandas as pd
import sys
print(sys.executable)
import numpy as np
import json
import re
from tqdm import tqdm


In [None]:
# Path of the raw Yelp reviews CSV, relative to the notebook's working directory.
DATASET_PATH = "yelp_review.csv"

# Read the whole CSV into a DataFrame; later cells narrow it down.
df = pd.read_csv(filepath_or_buffer=DATASET_PATH)

print(f"Original shape: {df.shape}")
df.head(n=5)


In [None]:
# Keep only the review text and its star rating, drop rows missing either value,
# and switch to the column names used by the rest of the notebook.
df = (
    df[['text', 'stars']]
    .dropna()
    .rename(columns={'text': 'review', 'stars': 'actual_stars'})
)

print(f"After cleaning: {df.shape}")
df.head()


In [None]:
# Number of reviews drawn for the evaluation.
SAMPLE_SIZE = 200

# A fixed random_state makes the draw repeatable; the index is renumbered from 0.
df_sample = (
    df.sample(n=SAMPLE_SIZE, random_state=42)
    .reset_index(drop=True)
)

# Show how the sampled ratings are distributed.
df_sample['actual_stars'].value_counts()


In [None]:
# Print the first three sampled reviews (truncated to 300 characters) with their rating.
for review_no in range(1, 4):
    row_label = review_no - 1  # df_sample was re-indexed from 0
    print(f"\n--- Review {review_no} ---")
    print(df_sample.at[row_label, 'review'][:300])
    print("Actual stars:", df_sample.at[row_label, 'actual_stars'])


In [None]:
# Start with an empty collection that will hold one record per evaluated review.
evaluation_rows = list()
print("Evaluation structure ready.")


In [None]:
# OpenRouter LLM call setup (OpenAI-compatible)
import os
from openai import OpenAI

# The API key is read from the environment so that no credential is stored in
# the notebook. Set it before starting Jupyter, e.g.:
#   export OPENROUTER_API_KEY="<your OpenRouter key>"
# NOTE(review): keys that were previously hardcoded in this cell should be
# treated as leaked and revoked.
openrouter_key = os.environ.get("OPENROUTER_API_KEY", "").strip()
if not openrouter_key:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set; export it before running this notebook."
    )

# Initialize OpenAI client to point to OpenRouter endpoint
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=openrouter_key,
)

# Choose a model available via OpenRouter
MODEL_NAME = "openrouter/auto"  # let OpenRouter pick the model automatically
# Alternative explicit free models if you want:
# MODEL_NAME = "mistralai/mistral-7b-instruct:free"
# MODEL_NAME = "anthropic/claude-3-haiku:free"

def call_llm_openrouter(prompt_text):
    """
    Send a prompt to OpenRouter and return the raw text response.

    Parameters
    ----------
    prompt_text : str
        Text sent as the user message; a fixed system message is sent first.

    Returns
    -------
    str or None
        The reply text with surrounding whitespace stripped, or None when the
        call fails or the reply carries no text content.

    Raises
    ------
    SystemExit
        When the failure indicates exhausted credits (HTTP status 402 or an
        "Insufficient credits" message), to stop execution.
    """
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are an expert sentiment analyst."},
                {"role": "user", "content": prompt_text}
            ],
            temperature=0,
            max_tokens=150  # MUST be low for free/low credits
        )
        content = response.choices[0].message.content
        if content is None:
            # Report a reply without text as a failed call instead of
            # tripping over None.strip().
            print("LLM call failed:", "response contained no text content")
            return None
        return content.strip()

    except Exception as e:
        error_msg = str(e)

        # Prefer the HTTP status carried by the exception (the openai client's
        # status errors expose `status_code`). Searching the message for "402"
        # is only a fallback, because unrelated text such as a request id can
        # contain those digits.
        status_code = getattr(e, "status_code", None)
        if status_code is not None:
            payment_required = status_code == 402
        else:
            payment_required = "402" in error_msg

        # Stop execution if credits are exhausted
        if payment_required or "Insufficient credits" in error_msg:
            print("❌ OPENROUTER CREDITS EXHAUSTED — STOPPING EXECUTION")
            raise SystemExit()

        print("LLM call failed:", error_msg)
        return None


In [None]:
def parse_json_safe(llm_output):
    """
    Safely parse JSON from LLM output.

    The text is tried as strict JSON first. If that fails, the outermost
    ``{...}`` span is tried (covers replies wrapped in a code fence or in
    prose), and only then a variant with single quotes swapped for double
    quotes. Swapping quotes up front would corrupt valid JSON whose strings
    contain apostrophes (e.g. "didn't").

    Parameters
    ----------
    llm_output : str
        Raw text returned by the model. Any non-string value is rejected.

    Returns
    -------
    (parsed_dict, valid_flag)
        ``(dict, True)`` when a JSON object with ``predicted_stars`` in 1-5 is
        found; a rating given as a numeric string is stored back as an int.
        ``(None, False)`` otherwise. The function does not raise.
    """
    if not isinstance(llm_output, str):
        return None, False

    text = llm_output.strip()
    candidates = [text]

    # Isolate the outermost {...} span when the object is surrounded by other text.
    brace_match = re.search(r"\{.*\}", text, flags=re.DOTALL)
    if brace_match and brace_match.group(0) != text:
        candidates.append(brace_match.group(0))

    # Last resort: Python-style dicts written with single quotes.
    candidates.extend([candidate.replace("'", '"') for candidate in list(candidates)])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError:
            # Not valid JSON in this form; try the next candidate.
            continue

        if not isinstance(parsed, dict):
            continue

        # Ensure predicted_stars exists and is 1-5
        stars = parsed.get("predicted_stars", None)
        try:
            stars_value = int(stars)
        except (TypeError, ValueError, OverflowError):
            return None, False
        if not (1 <= stars_value <= 5):
            return None, False

        # Store string ratings such as "4" as int so that comparisons with
        # the numeric ground truth behave as expected.
        if isinstance(stars, str):
            parsed["predicted_stars"] = stars_value

        return parsed, True

    return None, False


In [None]:
# Load Prompt V1 text (explicit encoding so the template reads the same on every platform)
with open("prompts/prompt_v1.txt", "r", encoding="utf-8") as f:
    prompt_template = f.read()

evaluation_rows = []

for idx, row in tqdm(df_sample.iterrows(), total=len(df_sample)):
    review_text = row["review"]
    actual_stars = row["actual_stars"]

    prompt_filled = prompt_template.replace("{review_text}", review_text)

    # Call OpenRouter LLM
    llm_output = call_llm_openrouter(prompt_filled)

    # call_llm_openrouter already raises SystemExit when credits run out, so a
    # None here is an ordinary failed call. Record it as an invalid row and
    # continue, as the V2 and V3 loops do, instead of truncating the sample.
    parsed_json, valid_flag = parse_json_safe(llm_output) if llm_output else (None, False)

    evaluation_rows.append({
        "prompt_version": "v1",
        "review": review_text,
        "actual_stars": actual_stars,
        "predicted_stars": parsed_json.get("predicted_stars") if parsed_json else None,
        "explanation": parsed_json.get("explanation") if parsed_json else None,
        "json_valid": valid_flag,
        "llm_raw_output": llm_output
    })


In [86]:
# Create the output folder before writing: this is the first cell that saves
# into "results/", and to_csv does not create missing directories.
os.makedirs("results", exist_ok=True)

results_df = pd.DataFrame(evaluation_rows)
results_df.to_csv("results/evaluation_v1.csv", index=False)
results_df.head()


Unnamed: 0,prompt_version,review,actual_stars,predicted_stars,explanation,json_valid,llm_raw_output
0,v1,We got here around midnight last Friday... the...,4,,,False,
1,v1,Brought a friend from Louisiana here. She say...,5,,,False,
2,v1,"Every friday, my dad and I eat here. We order ...",3,,,False,
3,v1,"My husband and I were really, really disappoin...",1,,,False,
4,v1,Love this place! Was in phoenix 3 weeks for w...,5,,,False,


In [None]:
import pandas as pd

# Turn the collected V1 records into a DataFrame and report how many there are.
df_v1 = pd.DataFrame(data=evaluation_rows)
print(f"Rows collected: {len(df_v1)}")


In [None]:
import os

# Make sure the output folder exists, then write the V1 results without the index column.
os.makedirs(name="results", exist_ok=True)
df_v1.to_csv(path_or_buf="results/prompt_v1_results.csv", index=False)

print("Saved partial results successfully")


In [None]:
# Load Prompt V2 (explicit encoding so the template reads the same on every platform)
with open("prompts/prompt_v2.txt", "r", encoding="utf-8") as f:
    prompt_v2_template = f.read()

# Reset evaluation list for V2
evaluation_v2 = []

for idx, row in tqdm(df_sample.iterrows(), total=len(df_sample)):
    review_text = row['review']
    actual_stars = row['actual_stars']

    # Insert the review into the template's {review_text} placeholder.
    prompt_filled = prompt_v2_template.replace("{review_text}", review_text)

    llm_output = call_llm_openrouter(prompt_filled)
    # A failed or empty reply is recorded as an invalid row rather than skipped.
    parsed_json, valid_flag = parse_json_safe(llm_output) if llm_output else (None, False)

    evaluation_v2.append({
        "prompt_version": "v2",
        "review": review_text,
        "actual_stars": actual_stars,
        "predicted_stars": parsed_json.get("predicted_stars") if parsed_json else None,
        "explanation": parsed_json.get("explanation") if parsed_json else None,
        "json_valid": valid_flag,
        "llm_raw_output": llm_output
    })


In [None]:
# Tabulate the V2 records and write them to disk without the index column.
results_v2_df = pd.DataFrame(data=evaluation_v2)
results_v2_df.to_csv(path_or_buf="results/evaluation_v2.csv", index=False)
results_v2_df.head(n=5)


In [None]:
# Load Prompt V3 (explicit encoding so the template reads the same on every platform)
with open("prompts/prompt_v3.txt", "r", encoding="utf-8") as f:
    prompt_v3_template = f.read()

# Fresh list of per-review records for V3
evaluation_v3 = []

for idx, row in tqdm(df_sample.iterrows(), total=len(df_sample)):
    review_text = row['review']
    actual_stars = row['actual_stars']

    # Insert the review into the template's {review_text} placeholder.
    prompt_filled = prompt_v3_template.replace("{review_text}", review_text)

    llm_output = call_llm_openrouter(prompt_filled)
    # A failed or empty reply is recorded as an invalid row rather than skipped.
    parsed_json, valid_flag = parse_json_safe(llm_output) if llm_output else (None, False)

    evaluation_v3.append({
        "prompt_version": "v3",
        "review": review_text,
        "actual_stars": actual_stars,
        "predicted_stars": parsed_json.get("predicted_stars") if parsed_json else None,
        "explanation": parsed_json.get("explanation") if parsed_json else None,
        "json_valid": valid_flag,
        "llm_raw_output": llm_output
    })


In [None]:
# Tabulate the V3 records and write them to disk without the index column.
results_v3_df = pd.DataFrame(data=evaluation_v3)
results_v3_df.to_csv(path_or_buf="results/evaluation_v3.csv", index=False)
results_v3_df.head(n=5)

In [None]:
def compute_metrics(df):
    """
    Compute exact-match accuracy and JSON validity, both as percentages.

    Parameters
    ----------
    df : pandas.DataFrame
        Evaluation rows with the columns ``predicted_stars``,
        ``actual_stars`` and ``json_valid``.

    Returns
    -------
    dict
        ``{"accuracy": ..., "json_validity": ...}``, each rounded to two
        decimals. Both values are 0 when ``df`` is empty or lacks one of the
        required columns.
    """
    required_cols = {"predicted_stars", "actual_stars", "json_valid"}
    if df.empty or not required_cols.issubset(df.columns):
        return {
            "accuracy": 0,
            "json_validity": 0
        }

    total = len(df)

    # Compare ratings numerically: predictions may arrive as strings ("4") or
    # sit in an object column next to None, and a plain == would then never
    # match the numeric ground truth. Unparseable values become NaN, which
    # never compares equal.
    predicted = pd.to_numeric(df["predicted_stars"], errors="coerce")
    actual = pd.to_numeric(df["actual_stars"], errors="coerce")
    accuracy = (predicted == actual).sum()
    valid_json = df["json_valid"].sum()

    return {
        "accuracy": round((accuracy / total) * 100, 2),
        "json_validity": round((valid_json / total) * 100, 2)
    }


In [None]:
# Stack the V1, V2 and V3 evaluations into one frame with a fresh 0..n-1 index.
final_results_df = pd.concat(
    objs=[
        pd.DataFrame(evaluation_rows),
        results_v2_df,
        results_v3_df,
    ],
    ignore_index=True,
)

# Write the combined table to disk without the index column.
final_results_df.to_csv(path_or_buf="results/final_task1_results.csv", index=False)
print("Final Task 1 results saved: results/final_task1_results.csv")


In [None]:
# NOTE(review): no earlier visible cell defines `comparison`, so this cell
# raised NameError on a fresh kernel. The table is rebuilt here from
# compute_metrics, one row per prompt version. Confirm that these are the
# intended columns.
comparison = pd.DataFrame(
    [
        {"prompt_version": version, **compute_metrics(frame)}
        for version, frame in (
            ("v1", pd.DataFrame(evaluation_rows)),
            ("v2", results_v2_df),
            ("v3", results_v3_df),
        )
    ]
)

comparison.to_csv("results/comparison_table.csv", index=False)
print("Comparison table saved: results/comparison_table.csv")
comparison
