In [1]:
import pandas as pd

# Read the Yelp review CSV from the local data/ directory into a DataFrame
df = pd.read_csv("data/yelp.csv")

# Print the column index to see which fields are available before selecting any
print(df.columns)


Index(['business_id', 'date', 'review_id', 'stars', 'text', 'type', 'user_id',
       'cool', 'useful', 'funny'],
      dtype='object')


In [2]:
# Keep only the review text and its star rating, dropping rows where either is missing.
# NOTE: this rebinds `df`, so the full-column frame from the load cell is no longer available.
df = df[['text', 'stars']].dropna()

# Draw 200 rows; the fixed random_state makes the same rows come back on every run
sample_df = df.sample(n=200, random_state=42)

# Persist the sample (without the index column) so later cells can reload it from disk
sample_df.to_csv("data/yelp_sample_200.csv", index=False)

print("Saved:", len(sample_df))


Saved: 200


In [4]:
import json
from collections import Counter

def safe_parse_json(text):
    """Parse a model response and validate it against the rating schema.

    A response is accepted only when it decodes to a JSON object that contains
    both ``predicted_stars`` and ``explanation`` and whose ``predicted_stars``
    is a genuine integer from 1 to 5.

    Args:
        text: Raw response text expected to hold a single JSON object.

    Returns:
        ``(data, True)`` with the decoded dict when the response is valid,
        otherwise ``(None, False)``. Malformed or non-string input yields
        ``(None, False)`` instead of raising.
    """
    try:
        data = json.loads(text)
    except (TypeError, ValueError, RecursionError):
        # TypeError: non-string input such as None.
        # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
        # RecursionError: pathologically deep nesting.
        return None, False

    if not isinstance(data, dict):
        return None, False
    if "predicted_stars" not in data or "explanation" not in data:
        return None, False

    stars = data["predicted_stars"]
    # bool is a subclass of int, so JSON `true` would otherwise be accepted
    # as a 1-star rating; reject it explicitly.
    if isinstance(stars, bool) or not isinstance(stars, int):
        return None, False
    if not 1 <= stars <= 5:
        return None, False

    return data, True


In [5]:
def prompt_v1(review):
    """Build the zero-shot prompt that asks for a strict-JSON star rating.

    The fixed instruction text is kept separate from the review so that only
    the final quoted line is interpolated.
    """
    instructions = """
You are a ratings classifier. Read the review and return ONLY a valid JSON object with two fields:
{
  "predicted_stars": <integer 1-5>,
  "explanation": "<one-sentence justification>"
}

Rules:
1) predicted_stars must be an integer between 1 and 5 inclusive.
2) explanation must be concise (10-25 words).
3) Do not output any text other than the JSON.

Review:
"""
    return f'{instructions}"{review}"\n'


In [3]:
!pip install -q openai


In [5]:
import os
from getpass import getpass

# Secrets must never be written into the notebook: anything saved here ends up
# in version control and in every shared copy. Read the key from the
# environment and only prompt (without echo) when it is missing.
# NOTE: the key that used to be hardcoded in this cell is exposed and should be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


In [6]:
from openai import OpenAI

# Create an OpenAI SDK client authenticated with the OPENAI_API_KEY environment variable.
# NOTE: `client` is rebound to an OpenRouter-backed client in a later cell.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])


In [16]:
!pip install -q openai


In [19]:
import os
from getpass import getpass

# Secrets must never be written into the notebook: anything saved here ends up
# in version control and in every shared copy. Read the key from the
# environment and only prompt (without echo) when it is missing.
# NOTE: the key that used to be hardcoded in this cell is exposed and should be revoked.
if not os.environ.get("OPENROUTER_API_KEY"):
    os.environ["OPENROUTER_API_KEY"] = getpass("OpenRouter API key: ")


In [21]:
from openai import OpenAI
import os
from getpass import getpass

# The API key is a secret and must not be stored in the notebook. Take it from
# the environment, prompting without echo only when it has not been set.
# NOTE: the key that used to be hardcoded in this cell is exposed and should be revoked.
if not os.environ.get("OPENROUTER_API_KEY"):
    os.environ["OPENROUTER_API_KEY"] = getpass("OpenRouter API key: ")

# The OpenAI SDK client is pointed at OpenRouter's endpoint via base_url.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["OPENROUTER_API_KEY"],
)


In [22]:
# Connectivity check: send one trivial request through `client` and print the reply.
# temperature=0 is used here and in every later call.
response = client.chat.completions.create(
    model="meta-llama/llama-3.1-8b-instruct",
    messages=[{"role": "user", "content": "Say hello in JSON: {\"msg\": \"hello\"}"}],
    temperature=0
)

# Only the text of the first returned choice is shown
print(response.choices[0].message.content)


`{"msg": "hello"}`


In [23]:
# Model identifier passed to every chat-completion request made by call_llm
MODEL_NAME = "meta-llama/llama-3.1-8b-instruct"


In [24]:
def call_llm(prompt):
    """Send ``prompt`` as a single user message and return the reply text.

    Uses the notebook-level ``client`` and ``MODEL_NAME`` with temperature 0.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[user_message],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


In [25]:
def prompt_v1(review):
    """Build the zero-shot prompt that asks for a strict-JSON star rating.

    The fixed instruction text is kept separate from the review so that only
    the final quoted line is interpolated.
    """
    instructions = """
You are a ratings classifier. Read the review and return ONLY a valid JSON object with two fields:
{
  "predicted_stars": <integer 1-5>,
  "explanation": "<one-sentence justification>"
}

Rules:
1) predicted_stars must be an integer between 1 and 5 inclusive.
2) explanation must be concise (10-25 words).
3) Do not output any text other than the JSON.

Review:
"""
    return f'{instructions}"{review}"\n'


In [26]:
# Run prompt V1 on one hand-written review and print the raw model reply
test = call_llm(prompt_v1("Amazing food, friendly staff, will visit again!"))
print(test)


{
  "predicted_stars": 5,
  "explanation": "Excellent experience with great food and service, indicating a perfect 5-star review."
}


In [27]:
import json

def safe_parse_json(text):
    """Parse a model response and validate it against the rating schema.

    A response is accepted only when it decodes to a JSON object that contains
    both ``predicted_stars`` and ``explanation`` and whose ``predicted_stars``
    is a genuine integer from 1 to 5.

    Args:
        text: Raw response text expected to hold a single JSON object.

    Returns:
        ``(data, True)`` with the decoded dict when the response is valid,
        otherwise ``(None, False)``. Malformed or non-string input yields
        ``(None, False)`` instead of raising.
    """
    try:
        data = json.loads(text)
    except (TypeError, ValueError, RecursionError):
        # TypeError: non-string input such as None.
        # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
        # RecursionError: pathologically deep nesting.
        return None, False

    if not isinstance(data, dict):
        return None, False
    if "predicted_stars" not in data or "explanation" not in data:
        return None, False

    stars = data["predicted_stars"]
    # bool is a subclass of int, so JSON `true` would otherwise be accepted
    # as a 1-star rating; reject it explicitly.
    if isinstance(stars, bool) or not isinstance(stars, int):
        return None, False
    if not 1 <= stars <= 5:
        return None, False

    return data, True


In [28]:
# Validate the single test reply; the bare tuple on the last line is displayed as the cell output
parsed, valid = safe_parse_json(test)
parsed, valid


({'predicted_stars': 5,
  'explanation': 'Excellent experience with great food and service, indicating a perfect 5-star review.'},
 True)

In [31]:
import pandas as pd

# Reload the saved sample from disk
sample_df = pd.read_csv("data/yelp_sample_200.csv")

# Dry run on the first five reviews before spending API calls on the full sample
debug_df = sample_df.head(5)

results_debug = []

for review, stars in zip(debug_df["text"], debug_df["stars"]):
    actual = int(stars)

    raw = call_llm(prompt_v1(review))
    parsed, valid = safe_parse_json(raw)

    # No prediction is recorded when the reply failed validation
    if valid:
        predicted = parsed["predicted_stars"]
    else:
        predicted = None

    results_debug.append(
        dict(
            review=review[:80] + "...",  # truncated so the table stays readable
            actual=actual,
            predicted=predicted,
            valid_json=valid,
        )
    )

pd.DataFrame(results_debug)


Unnamed: 0,review,actual,predicted,valid_json
0,We got here around midnight last Friday... the...,4,4,True
1,Brought a friend from Louisiana here. She say...,5,5,True
2,"Every friday, my dad and I eat here. We order ...",3,4,True
3,"My husband and I were really, really disappoin...",1,1,True
4,Love this place! Was in phoenix 3 weeks for w...,5,5,True


In [32]:
# Score every sampled review with prompt V1, one record per review
results_v1 = []

for review, stars in zip(sample_df["text"], sample_df["stars"]):
    actual = int(stars)

    raw = call_llm(prompt_v1(review))
    parsed, valid = safe_parse_json(raw)

    # No prediction is recorded when the reply failed validation
    if valid:
        pred = parsed["predicted_stars"]
    else:
        pred = None

    results_v1.append(dict(actual=actual, predicted=pred, valid_json=valid))

len(results_v1)


200

In [33]:
import pandas as pd

df_v1 = pd.DataFrame(results_v1)

# A missing prediction (invalid reply) never equals the true rating,
# so those rows are scored as misses in the accuracy.
accuracy_v1 = df_v1["actual"].eq(df_v1["predicted"]).mean()
# Share of replies that passed validation
json_valid_v1 = df_v1["valid_json"].mean()

accuracy_v1, json_valid_v1


(np.float64(0.695), np.float64(1.0))

In [34]:
# Write the per-review V1 results to disk without the index column
df_v1.to_csv("results_prompt_v1.csv", index=False)


In [35]:
# Write the per-review V1 results to disk (overwriting any existing file) and confirm
df_v1.to_csv("results_prompt_v1.csv", index=False)
print("Saved results_prompt_v1.csv")


Saved results_prompt_v1.csv


In [36]:
def prompt_v2(review):
    """Build the few-shot prompt: three worked examples followed by the review.

    The fixed examples and rules are kept separate from the review so that
    only the final quoted line is interpolated.
    """
    instructions = """
You are a ratings classifier. Below are examples of reviews with correct star ratings.

Example 1:
Review: "Food was cold and service was terrible."
Output:
{"predicted_stars": 1, "explanation": "Cold food and poor service indicate a very bad experience."}

Example 2:
Review: "It was okay, nothing special, but not bad either."
Output:
{"predicted_stars": 3, "explanation": "An average experience with no strong positives or negatives."}

Example 3:
Review: "Amazing ambience, tasty food, and very friendly staff!"
Output:
{"predicted_stars": 5, "explanation": "Excellent food, atmosphere, and service suggest a great experience."}

Now classify the following review.

Rules:
- Return ONLY a valid JSON object
- predicted_stars must be an integer from 1 to 5
- explanation must be concise (10–25 words)

Review:
"""
    return f'{instructions}"{review}"\n'


In [37]:
# Run prompt V2 on one hand-written review and print the raw model reply
test_v2 = call_llm(prompt_v2("Amazing food, friendly staff, will visit again!"))
print(test_v2)


{"predicted_stars": 5, "explanation": "Excellent food and service suggest a great experience, with a high likelihood of repeat visits."}


In [38]:
# Score every sampled review with prompt V2, one record per review
results_v2 = []

for review, stars in zip(sample_df["text"], sample_df["stars"]):
    actual = int(stars)

    raw = call_llm(prompt_v2(review))
    parsed, valid = safe_parse_json(raw)

    # No prediction is recorded when the reply failed validation
    if valid:
        pred = parsed["predicted_stars"]
    else:
        pred = None

    results_v2.append(dict(actual=actual, predicted=pred, valid_json=valid))

import pandas as pd
df_v2 = pd.DataFrame(results_v2)


In [39]:
# A missing prediction (invalid reply) never equals the true rating,
# so those rows are scored as misses in the accuracy.
accuracy_v2 = df_v2["actual"].eq(df_v2["predicted"]).mean()
# Share of replies that passed validation
json_valid_v2 = df_v2["valid_json"].mean()

accuracy_v2, json_valid_v2


(np.float64(0.405), np.float64(0.64))

In [40]:
# Write the per-review V2 results to disk without the index column and confirm
df_v2.to_csv("results_prompt_v2.csv", index=False)
print("Saved Prompt V2 results")


Saved Prompt V2 results


In [41]:
def prompt_v3(review):
    """Build the task-list prompt that asks for analysis before a JSON rating.

    The fixed task, format and constraint text is kept separate from the
    review so that only the final quoted line is interpolated.
    """
    instructions = """
You are an expert sentiment and rating analyst.

Task:
1. Carefully analyze the sentiment and intent of the review.
2. Decide the most appropriate star rating from 1 to 5.
3. Return ONLY a valid JSON object.

Output format:
{
  "predicted_stars": <integer 1-5>,
  "explanation": "<clear but concise justification>"
}

Constraints:
- predicted_stars must be an integer between 1 and 5.
- explanation must be 15–30 words.
- Do not include any text outside the JSON.

Review:
"""
    return f'{instructions}"{review}"\n'


In [43]:
# Score every sampled review with prompt V3, one record per review
results_v3 = []

for review, stars in zip(sample_df["text"], sample_df["stars"]):
    actual = int(stars)

    raw = call_llm(prompt_v3(review))
    parsed, valid = safe_parse_json(raw)

    # No prediction is recorded when the reply failed validation
    if valid:
        pred = parsed["predicted_stars"]
    else:
        pred = None

    results_v3.append(dict(actual=actual, predicted=pred, valid_json=valid))

import pandas as pd
df_v3 = pd.DataFrame(results_v3)


In [44]:
# A missing prediction (invalid reply) never equals the true rating,
# so those rows are scored as misses in the accuracy.
accuracy_v3 = df_v3["actual"].eq(df_v3["predicted"]).mean()
# Share of replies that passed validation
json_valid_v3 = df_v3["valid_json"].mean()

accuracy_v3, json_valid_v3


(np.float64(0.66), np.float64(1.0))

In [45]:
# Write the per-review V3 results to disk without the index column and confirm
df_v3.to_csv("results_prompt_v3.csv", index=False)
print("Saved Prompt V3 results")


Saved Prompt V3 results


In [46]:
# One row per prompt variant, in the order the variants were evaluated
summary = pd.DataFrame({
    "Prompt": ["V1 Zero-shot strict JSON", "V2 Few-shot", "V3 Reasoned"],
    "Accuracy": [accuracy_v1, accuracy_v2, accuracy_v3],
    "JSON Validity": [json_valid_v1, json_valid_v2, json_valid_v3],
})

summary


Unnamed: 0,Prompt,Accuracy,JSON Validity
0,V1 Zero-shot strict JSON,0.695,1.0
1,V2 Few-shot,0.405,0.64
2,V3 Reasoned,0.66,1.0


### Prompt Analysis & Observations


**Prompt V1 (Zero-shot, strict JSON)**  
This prompt focused on explicit instructions and strict output formatting.  
It achieved the highest accuracy and perfect JSON validity, showing that clear constraints help the model classify sentiment reliably.

**Prompt V2 (Few-shot)**  
Few-shot examples were added to calibrate rating boundaries.  
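However, accuracy and JSON validity both dropped. Because responses that fail JSON validation are scored as incorrect, most of the accuracy loss comes from the model deviating from the strict output format: accuracy is 0.405 overall but roughly 0.63 among the 64% of responses that parsed. The remaining gap to V1 may reflect the model overfitting to the examples.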

**Prompt V3 (Reasoned / Structured)**  
This prompt encouraged internal reasoning while still enforcing JSON-only output.  
It maintained perfect JSON validity, but its accuracy (0.66) was slightly below V1 (0.695), so the added analysis instructions did not improve classification on this 200-review sample.


	Prompt	Accuracy	JSON Validity
0	V1 Zero-shot strict JSON	0.695	1.00
1	V2 Few-shot	0.405	0.64
2	V3 Reasoned	0.660	1.00
