In [1]:
import pandas as pd
import json
import time
from openai import OpenAI

In [None]:
client = OpenAI(
    api_key="OPENROUTER_API_KEY",
    base_url="https://openrouter.ai/api/v1"
)

MODEL_NAME = "mistralai/mistral-7b-instruct"


In [3]:
df = pd.read_csv("yelp.csv")

df = df[["text", "stars"]].rename(
    columns={"text": "review_text", "stars": "true_rating"}
)

df = df.sample(200, random_state=42).reset_index(drop=True)
df.head()


Unnamed: 0,review_text,true_rating
0,We got here around midnight last Friday... the...,4
1,Brought a friend from Louisiana here. She say...,5
2,"Every friday, my dad and I eat here. We order ...",3
3,"My husband and I were really, really disappoin...",1
4,Love this place! Was in phoenix 3 weeks for w...,5


In [4]:
prompt_v1 = """
Given the following restaurant review, predict a rating from 1 to 5 stars.
Return JSON with predicted_stars and explanation.

Review:
{review_text}
"""
prompt_v2 = """
Classify the sentiment of the following restaurant review and predict a star rating from 1 to 5.

Respond with a single JSON object only.
Do not include any extra text.

Fields required:
- predicted_stars (integer 1–5)
- explanation (short reason)

Review:
{review_text}
"""

prompt_v3 = """
Rate the restaurant review using this rubric:

1 = very negative
2 = mostly negative
3 = mixed
4 = mostly positive
5 = very positive

Respond with a single JSON object only.
Do not include any extra text.

Fields required:
- predicted_stars (integer 1–5)
- explanation (short reason)

Review:
{review_text}
"""

In [5]:
import re

def run_llm(prompt_template, review_text):
    try:
        prompt = prompt_template.format(review_text=review_text)

        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3
        )

        raw_text = response.choices[0].message.content

        # ---- ROBUST JSON EXTRACTION ----
        match = re.search(r"\{[\s\S]*\}", raw_text)
        if not match:
            raise ValueError("No JSON found")

        json_str = match.group(0)
        parsed = json.loads(json_str)

        return {
            "predicted_stars": int(parsed["predicted_stars"]),
            "explanation": parsed.get("explanation", ""),
            "json_valid": True
        }

    except Exception as e:
        return {
            "predicted_stars": None,
            "explanation": None,
            "json_valid": False
        }


In [6]:
run_llm(prompt_v3, df.iloc[0]["review_text"])

{'predicted_stars': None, 'explanation': None, 'json_valid': False}

In [7]:
prompts = {
    "v1": prompt_v1,
    "v2": prompt_v2,
    "v3": prompt_v3
}

results = []

for _, row in df.iterrows():
    for name, prompt in prompts.items():
        res = run_llm(prompt, row["review_text"])

        results.append({
            "prompt": name,
            "true_rating": row["true_rating"],
            "predicted_rating": res["predicted_stars"],
            "json_valid": res["json_valid"]
        })

        time.sleep(0.1)  # safe for free tier

results_df = pd.DataFrame(results)
results_df.head()
# Keep only rows where prediction exists
valid_results = results_df.dropna(subset=["predicted_rating"])

valid_results.groupby("prompt").size()



prompt
v1    176
v2    189
v3    142
dtype: int64

In [8]:
uca = (
    valid_results
    .assign(correct=lambda x: x.predicted_rating == x.true_rating)
    .groupby("prompt")["correct"]
    .mean()
)

uca

prompt
v1    0.596591
v2    0.613757
v3    0.640845
Name: correct, dtype: float64

In [9]:
json_validity = results_df.groupby("prompt")["json_valid"].mean()
json_validity


prompt
v1    0.880
v2    0.945
v3    0.710
Name: json_valid, dtype: float64

In [10]:
def compute_consistency_safe(p1, p2):
    pairs = [(a, b) for a, b in zip(p1, p2) if a is not None and b is not None]
    if len(pairs) == 0:
        return 0.0
    return sum(a == b for a, b in pairs) / len(pairs)



# Consistency evaluation on repeated runs
sample_df = df.sample(20, random_state=1)

consistency_rows = []

for name, prompt in prompts.items():
    preds_1, preds_2 = [], []

    for _, row in sample_df.iterrows():
        preds_1.append(run_llm(prompt, row["review_text"])["predicted_stars"])
        preds_2.append(run_llm(prompt, row["review_text"])["predicted_stars"])
        time.sleep(0.1)

    consistency_rows.append({
        "prompt": name,
        "consistency": compute_consistency_safe(preds_1, preds_2)
    })

consistency_df = pd.DataFrame(consistency_rows)
consistency_df

Unnamed: 0,prompt,consistency
0,v1,0.823529
1,v2,0.85
2,v3,1.0


In [11]:
final_table = pd.concat(
    [
        uca.rename("accuracy"),
        json_validity.rename("json_validity")
    ],
    axis=1
).reset_index()

final_table = final_table.merge(consistency_df, on="prompt")
final_table


Unnamed: 0,prompt,accuracy,json_validity,consistency
0,v1,0.596591,0.88,0.823529
1,v2,0.613757,0.945,0.85
2,v3,0.640845,0.71,1.0
