In [1]:
import pandas as pd

df = pd.read_csv("data/yelp_reviews.csv")

df = df[["text", "stars"]]
df = df.sample(200, random_state=42)

df.head()


Unnamed: 0,text,stars
6252,We got here around midnight last Friday... the...,4
4684,Brought a friend from Louisiana here. She say...,5
1731,"Every friday, my dad and I eat here. We order ...",3
4742,"My husband and I were really, really disappoin...",1
4521,Love this place! Was in phoenix 3 weeks for w...,5


In [2]:
import os
import requests
from dotenv import load_dotenv

load_dotenv()

API_KEY = os.getenv("OPENROUTER_API_KEY")

def call_llm(prompt):
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json"
        },
        json={
            "model": "mistralai/mistral-7b-instruct",
            "messages": [{"role": "user", "content": prompt}]
        }
    )
    return response.json()["choices"][0]["message"]["content"]


In [6]:
import json
from tqdm import tqdm

results = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    review = row["text"]
    actual = row["stars"]

    try:
        r1 = json.loads(call_llm(prompt_v1.format(review=review)))
        r2 = json.loads(call_llm(prompt_v2.format(review=review)))
        r3 = json.loads(call_llm(prompt_v3.format(review=review)))

        results.append({
            "actual": actual,
            "v1": r1["predicted_stars"],
            "v2": r2["predicted_stars"],
            "v3": r3["predicted_stars"]
        })
    except:
        pass


100%|██████████| 200/200 [00:00<00:00, 16595.66it/s]


In [8]:
import json
from tqdm import tqdm

results = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    review = row["text"]
    actual = row["stars"]

    row_data = {"actual": actual}

    for name, prompt in zip(
        ["v1", "v2", "v3"],
        [prompt_v1, prompt_v2, prompt_v3]
    ):
        try:
            response = call_llm(prompt.format(review=review))
            parsed = json.loads(response)
            row_data[name] = parsed.get("predicted_stars", None)
        except:
            row_data[name] = None

    results.append(row_data)


100%|██████████| 200/200 [00:00<00:00, 17223.30it/s]


In [9]:
results_df = pd.DataFrame(results)
print(results_df.head())
print(results_df.columns)


   actual    v1    v2    v3
0       4  None  None  None
1       5  None  None  None
2       3  None  None  None
3       1  None  None  None
4       5  None  None  None
Index(['actual', 'v1', 'v2', 'v3'], dtype='object')


In [10]:
def accuracy(col):
    valid = results_df[col].notna()
    return (results_df.loc[valid, col] == results_df.loc[valid, "actual"]).mean()


In [11]:
print("V1 Accuracy:", accuracy("v1"))
print("V2 Accuracy:", accuracy("v2"))
print("V3 Accuracy:", accuracy("v3"))


V1 Accuracy: nan
V2 Accuracy: nan
V3 Accuracy: nan


In [14]:
prompt_v1 = """
Read the review and predict a star rating from 1 to 5.

Review:
{review}

Respond ONLY in JSON:
{{
  "predicted_stars": number,
  "explanation": "short reason"
}}
"""


In [15]:
prompt_v2 = """
You are a strict Yelp reviewer.

Rules:
- 1 = very bad
- 5 = excellent

Review:
{review}

Return valid JSON ONLY:
{{
  "predicted_stars": integer between 1 and 5,
  "explanation": "clear reason"
}}
"""


In [16]:
prompt_v3 = """
Analyze the sentiment carefully.

Steps:
1. Decide sentiment
2. Map sentiment to stars

Review:
{review}

JSON ONLY:
{{
  "predicted_stars": 1-5,
  "explanation": "reason"
}}
"""


In [17]:
test_review = df.iloc[0]["text"]
print(call_llm(prompt_v1.format(review=test_review)))


 ```json
{
  "predicted_stars": 4,
  "explanation": "The review highlights positive aspects like well-made food, friendly service, quality cocktails, and a good atmosphere. The mention of a great patio for day-drinking and the overall positive experience justify a 4-star rating."
}
```


In [18]:
import json
from tqdm import tqdm

results = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    review = row["text"]
    actual = int(row["stars"])

    row_data = {"actual": actual}

    for name, prompt in zip(
        ["v1", "v2", "v3"],
        [prompt_v1, prompt_v2, prompt_v3]
    ):
        try:
            response = call_llm(prompt.format(review=review))
            parsed = json.loads(response)
            row_data[name] = int(parsed["predicted_stars"])
        except:
            row_data[name] = None

    results.append(row_data)

results_df = pd.DataFrame(results)
results_df.head()


100%|██████████| 200/200 [30:14<00:00,  9.07s/it] 


Unnamed: 0,actual,v1,v2,v3
0,4,,,
1,5,,,5.0
2,3,,,
3,1,,,
4,5,5.0,,5.0


In [19]:
def accuracy(col):
    valid = results_df[col].notna()
    return round((results_df.loc[valid, col] == results_df.loc[valid, "actual"]).mean(), 3)

print("Prompt V1 Accuracy:", accuracy("v1"))
print("Prompt V2 Accuracy:", accuracy("v2"))
print("Prompt V3 Accuracy:", accuracy("v3"))


Prompt V1 Accuracy: 0.759
Prompt V2 Accuracy: 0.667
Prompt V3 Accuracy: 0.565


In [20]:
def json_validity(col):
    return round(results_df[col].notna().mean(), 3)

print("V1 JSON Validity:", json_validity("v1"))
print("V2 JSON Validity:", json_validity("v2"))
print("V3 JSON Validity:", json_validity("v3"))


V1 JSON Validity: 0.145
V2 JSON Validity: 0.075
V3 JSON Validity: 0.23


In [21]:
comparison = pd.DataFrame({
    "Prompt Version": ["V1", "V2", "V3"],
    "Accuracy": [
        accuracy("v1"),
        accuracy("v2"),
        accuracy("v3")
    ],
    "JSON Validity Rate": [
        json_validity("v1"),
        json_validity("v2"),
        json_validity("v3")
    ]
})

comparison


Unnamed: 0,Prompt Version,Accuracy,JSON Validity Rate
0,V1,0.759,0.145
1,V2,0.667,0.075
2,V3,0.565,0.23
