In [24]:
import os
import json
import re
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI



In [25]:
# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()

# Fail fast with a clear message: passing api_key=None would make the OpenAI
# SDK fall back to OPENAI_API_KEY (or raise an error that names the wrong
# variable), which is confusing when the target is OpenRouter.
openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
if not openrouter_api_key:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set; add it to the environment or to a .env file."
    )

# OpenAI-compatible client pointed at the OpenRouter endpoint.
client = OpenAI(
    api_key=openrouter_api_key,
    base_url="https://openrouter.ai/api/v1"
)



In [26]:
# Keep only the review text and its star label, then draw a fixed 200-row
# sample (seeded so every rerun evaluates the same reviews).
df = (
    pd.read_csv("yelp.csv")
    .loc[:, ['text', 'stars']]
    .sample(200, random_state=42)
    .reset_index(drop=True)
)

df.head()


Unnamed: 0,text,stars
0,We got here around midnight last Friday... the...,4
1,Brought a friend from Louisiana here. She say...,5
2,"Every friday, my dad and I eat here. We order ...",3
3,"My husband and I were really, really disappoin...",1
4,Love this place! Was in phoenix 3 weeks for w...,5


In [34]:
def call_llm(prompt):
    """Send `prompt` to the chat model and return the JSON object text in its reply.

    Markdown code fences are removed from the reply, then the substring from
    the first "{" to the last "}" (inclusive) is returned without parsing it.

    Args:
        prompt: Text sent as the user message.

    Returns:
        The extracted JSON text as a str, or None when the request fails, the
        reply is empty, or the reply contains no "{...}" span.
    """
    try:
        response = client.chat.completions.create(
            model="meta-llama/llama-3.1-8b-instruct",
            messages=[
                {"role": "system", "content": "You are a strict JSON API."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2,
            max_tokens=200
        )

        # The message content may be None; treat that as an empty reply.
        text = (response.choices[0].message.content or "").strip()

        # Remove markdown if present
        text = re.sub(r"```json|```", "", text).strip()

        # Locate the outermost braces. Compare the raw rfind() result: adding 1
        # before the check would turn the "not found" value -1 into 0 and hide it.
        start = text.find("{")
        end = text.rfind("}")

        if start == -1 or end < start:
            return None

        return text[start:end + 1]

    except Exception as e:
        # Best-effort: report the failure and let the caller skip this sample.
        print("LLM Error:", e)
        return None


KeyboardInterrupt: 

In [28]:
# Prompt V1 (baseline): a plain instruction to rate the review from 1 to 5 and
# answer in JSON. `{review}` is a placeholder and the doubled braces render as
# literal braces, so this is presumably filled in with
# str.format(review=...) — the call site is not shown here; confirm.
prompt_v1 = """
Read the Yelp review below and predict a star rating from 1 to 5.

Return ONLY valid JSON:
{{
  "predicted_stars": 1-5,
  "explanation": "short reason"
}}

Review:
"{review}"
"""


In [29]:
# Prompt V2 (persona): frames the model as a sentiment analysis expert and asks
# it to consider sentiment, service, and quality; same JSON shape as V1.
# `{review}` is a placeholder and the doubled braces render as literal braces,
# so this is presumably filled in with str.format(review=...) — confirm.
prompt_v2 = """
You are a sentiment analysis expert.

Analyze sentiment, service, and quality.

Return ONLY valid JSON:
{{
  "predicted_stars": 1-5,
  "explanation": "short reason"
}}

Review:
"{review}"
"""



In [30]:
# Prompt V3 (strict format): states explicit output rules and describes each
# JSON field (integer 1-5, explanation of at most 20 words).
# `{review}` is a placeholder and the doubled braces render as literal braces,
# so this is presumably filled in with str.format(review=...) — confirm.
prompt_v3 = """
You are a strict JSON generator.

Rules:
- Output ONLY valid JSON
- No extra text

JSON format:
{{
  "predicted_stars": integer (1-5),
  "explanation": string (max 20 words)
}}

Review:
"{review}"
"""


In [31]:
def evaluate(predict_fn, df):
    """Score a prediction function against the labelled reviews in `df`.

    Args:
        predict_fn: Callable that takes a review text and returns a JSON
            string, or None when no prediction is available.
        df: Table supporting `len()` and `iterrows()`, whose rows expose a
            'text' and a 'stars' entry.

    Returns:
        dict with two ratios computed over ALL rows and rounded to 3 decimals:
        "accuracy" (predicted_stars equals the true stars) and
        "json_validity" (the output parsed as JSON). An empty `df` yields 0.0
        for both instead of dividing by zero.
    """
    total = len(df)
    if total == 0:
        return {"accuracy": 0.0, "json_validity": 0.0}

    correct = 0
    valid_json = 0

    for _, row in tqdm(df.iterrows(), total=total):
        raw = predict_fn(row['text'])

        if raw is None:
            continue

        try:
            parsed = json.loads(raw)
        except (TypeError, ValueError):
            # Not parseable JSON (json.JSONDecodeError is a ValueError).
            continue
        valid_json += 1

        try:
            if int(parsed["predicted_stars"]) == int(row["stars"]):
                correct += 1
        except (KeyError, TypeError, ValueError, OverflowError):
            # Valid JSON without a usable rating: counts as valid, not correct.
            pass

    return {
        "accuracy": round(correct / total, 3),
        "json_validity": round(valid_json / total, 3)
    }


In [32]:
def make_predictor(template):
    """Return a function that maps a review text to call_llm's raw output.

    The review is inserted into `template` with str.format(review=...); the
    doubled braces in the prompt templates become literal JSON braces.
    """
    def predict(review):
        return call_llm(template.format(review=review))
    return predict


# No predict_v* definitions appear in the notebook, so they are built here from
# the prompt templates; otherwise this cell raises NameError on a fresh kernel.
predict_v1 = make_predictor(prompt_v1)
predict_v2 = make_predictor(prompt_v2)
predict_v3 = make_predictor(prompt_v3)

# One LLM call per review and per prompt.
results = {
    "Prompt V1": evaluate(predict_v1, df),
    "Prompt V2": evaluate(predict_v2, df),
    "Prompt V3": evaluate(predict_v3, df)
}

results


100%|██████████| 200/200 [05:36<00:00,  1.68s/it]
100%|██████████| 200/200 [04:36<00:00,  1.38s/it]
100%|██████████| 200/200 [07:37<00:00,  2.29s/it]


{'Prompt V1': {'accuracy': 0.695, 'json_validity': 1.0},
 'Prompt V2': {'accuracy': 0.65, 'json_validity': 1.0},
 'Prompt V3': {'accuracy': 0.625, 'json_validity': 1.0}}

## Conclusion

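All three prompting strategies achieved 100% JSON validity on the 200-review sample after enforcing strict output constraints and post-processing (stripping markdown fences and extracting the outermost `{...}` span).
Prompt V1 achieved the highest exact-match accuracy (0.695), likely due to its flexibility in interpreting sentiment.
Prompt V2 introduced structured reasoning (sentiment, service, and quality) but showed slightly lower accuracy (0.650).
Prompt V3 prioritized reliability and formatting consistency and had the lowest accuracy (0.625); because every prompt already reached 100% JSON validity, its stricter format produced no measurable robustness gain on this sample.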

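This suggests a trade-off between strict output control and predictive performance when using LLMs for classification tasks, although the differences come from a single run on 200 reviews and should be confirmed on a larger sample.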
