In [None]:
import sys
import os

module_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import pandas as pd
from ai_service import AiClient

ai_service = AiClient

In [4]:

train = pd.read_csv('../data/amdb_dataset/train/train_combined.csv')
test = pd.read_csv('../data/amdb_dataset/test/test_combined.csv')

    
train = train.sample(frac=1, random_state=42).reset_index(drop=True)
test = test.sample(frac=1, random_state=42).reset_index(drop=True)


train

Unnamed: 0,text,mark,label
0,Despite the high ratings given to this film by...,2,0
1,That's the question you have to ask yourself w...,2,0
2,"I really wish that when making a comedy, the p...",1,0
3,This is a very bland and inert production of o...,2,0
4,Amateurism best describes the film adaptation ...,1,0
...,...,...,...
1435,This movie was my first touch with Mr Sica so ...,9,1
1436,"SPOILERS All too often, Hollywood's Shakespear...",1,0
1437,Jackie Chan's Police Story is a landmark film ...,9,1
1438,This is how i felt while watching this film. I...,10,1


# BEST PROMPT

variants:

In [None]:
prompts_dict = {
    # 1. Baseline Direct
    "Baseline Direct": [
        {
            "role": "user",
            "content": "Classify the following text as Positive or Negative. "
                       "Text: {text}\n"
                       "Sentiment:",
        }
    ],
    
    # 2. CoT Reasoning
    "CoT Reasoning": [
        {
            "role": "system",
            "content": "Analyze the sentiment of the comment. "
                       "First, explain your reasoning step-by-step, mentioning specific positive or negative words. "
                       "Then, conclude with the final label: 'Positive' or 'Negative'.",
        },
        {
            "role": "user",
            "content": "Comment: {text}\n\nReasoning:",
        }
    ],

    # 3. Expert Linguist
    "Expert Linguist": [
        {
            "role": "system",
            "content": "You are an expert computational linguist specializing in sentiment analysis of user reviews. "
                       "Analyze the nuances, tone, and context of the text below. "
                       "Even if the text contains mixed emotions, determine the predominant polarity. "
                       "Return ONLY the label: Positive or Negative.",
        },
        {
            "role": "user",
            "content": "Text: {text}\nLabel:",
        }
    ],

    # 4. Sarcasm Aware
    "Sarcasm Aware": [
        {
            "role": "system",
            "content": "Classify the sentiment as Positive or Negative. "
                       "Pay close attention to sarcasm, irony, or passive-aggressive tones. "
                       "If the comment looks positive but implies a negative experience (e.g., 'Great, another bug'), label it as Negative.",
        },
        {
            "role": "user",
            "content": "Comment: {text}\nSentiment:",
        }
    ],
    
    # 5. Rating Prediction
    "Rating Prediction": [
        {
            "role": "system",
            "content": "Read the review below. "
                       "If the reviewer seems to rate the experience above average, label as Positive. "
                       "If below average or critical, label as Negative.",
        },
        {
            "role": "user",
            "content": "Review: {text}\nVerdict:",
        }
    ]
}

In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
import copy

generate on small samples for testing promts

In [None]:
def parse_sentiment(text):
    text = text.lower()
    if "positive" in text and "negative" not in text:
        return 1
    if "negative" in text and "positive" not in text:
        return 0
    return -1  

def evaluate_prompts(prompts_dict, data_df, sample_size=50, model_name="gpt-4o"):
    sample_df = data_df.sample(n=min(sample_size, len(data_df)), random_state=42).reset_index(drop=True)
    
    sample_df['label'] = sample_df['label'].astype(int)
    results_list = []

    for prompt_name, prompt_template in prompts_dict.items():
        print(f"Testing prompt: {prompt_name}...")
        
        preds = []
        times = []
        p_tokens = []
        c_tokens = []

        for text in tqdm(sample_df['text']):
            messages = copy.deepcopy(prompt_template)
            formatted = False

            for message in messages:
                if "{text}" in message["content"]:
                    message["content"] = message["content"].format(text=text)
                    formatted = True
            
            if not formatted:
                print("Warning: Placeholder {text} not found in prompt template!")

            raw_pred, t_sec, pt, ct = ai_service.apenai_call(messages,  model_name)

            label = parse_sentiment(raw_pred)
            
            preds.append(label)
            times.append(t_sec)
            p_tokens.append(pt)
            c_tokens.append(ct)

  
        col_prefix = f"{prompt_name}"
        sample_df[f'{col_prefix}_pred'] = preds
        sample_df[f'{col_prefix}_raw'] = [p if p != -1 else "Error" for p in preds]


        valid_indices = [i for i, x in enumerate(preds) if x != -1]
        y_true = sample_df.loc[valid_indices, 'label']
        y_pred = [preds[i] for i in valid_indices]

        if len(y_true) > 0:
            res = {
                'prompt_name': prompt_name,
                'accuracy': accuracy_score(y_true, y_pred),
                'precision': precision_score(y_true, y_pred, zero_division=0),
                'recall': recall_score(y_true, y_pred, zero_division=0),
                'f1_score': f1_score(y_true, y_pred, zero_division=0),
                'avg_time': sum(times) / len(times),
                'all_tokens': sum(p_tokens) + sum(c_tokens), 
                'p_tokens': sum(p_tokens),
                'c_tokens': sum(c_tokens),
                'errors_count': len(preds) - len(valid_indices) 
            }
        else:
            res = {'prompt_name': prompt_name, 'error': 'No valid predictions'}
            
        results_list.append(res)

    return pd.DataFrame(results_list), sample_df

In [22]:
result_list, sample_df = evaluate_prompts(prompts_dict, train)
result_list.to_csv('../sentiment_analysis/evaluate_prompts.csv')
sample_df.to_csv('../sentiment_analysis/evaluate_prompts_data.csv')

Testing prompt: Baseline Direct...


100%|██████████| 50/50 [00:48<00:00,  1.03it/s]


Testing prompt: CoT Reasoning...


100%|██████████| 50/50 [07:24<00:00,  8.90s/it]


Testing prompt: Expert Linguist...


100%|██████████| 50/50 [00:38<00:00,  1.31it/s]


Testing prompt: Sarcasm Aware...


100%|██████████| 50/50 [00:41<00:00,  1.19it/s]


Testing prompt: Rating Prediction...


100%|██████████| 50/50 [00:37<00:00,  1.32it/s]


In [23]:
result_list

Unnamed: 0,prompt_name,accuracy,precision,recall,f1_score,avg_time,all_tokens,p_tokens,c_tokens,errors_count
0,Baseline Direct,0.92,1.0,0.870968,0.931034,0.967105,17071,16860,211,0
1,CoT Reasoning,1.0,1.0,1.0,1.0,8.895375,36440,18461,17979,41
2,Expert Linguist,0.94,1.0,0.903226,0.949153,0.759615,19060,19010,50,0
3,Sarcasm Aware,0.78,1.0,0.645161,0.784314,0.835992,19060,19010,50,0
4,Rating Prediction,0.9,1.0,0.83871,0.912281,0.756147,18110,18060,50,0


In [24]:
sample_df.head()

Unnamed: 0,text,mark,label,Baseline Direct_pred,Baseline Direct_raw,CoT Reasoning_pred,CoT Reasoning_raw,Expert Linguist_pred,Expert Linguist_raw,Sarcasm Aware_pred,Sarcasm Aware_raw,Rating Prediction_pred,Rating Prediction_raw
0,"My daughter, her friends and I have watched th...",10,1,1,1,1,1,1,1,1,1,1,1
1,This film has slipped through the cracks of fi...,10,1,1,1,-1,Error,1,1,1,1,1,1
2,If you fast forward through the horrible singi...,8,1,1,1,-1,Error,1,1,0,0,1,1
3,Splendid film that in just eight minutes displ...,10,1,1,1,1,1,1,1,1,1,1,1
4,"Oh man, I know what your thinking: ""With a tit...",2,0,0,0,-1,Error,0,0,0,0,0,0


choosing the best promt

$$\text{BPS (Best Prompt Score)} = \frac{\text{F1\_score} \times \text{Accuracy}}{\text{all\_tokens} \times \text{avg\_time}} \times 10^6$$

In [26]:
SCALING_FACTOR = 1e6

result_list['BPS'] = (result_list['f1_score'] * result_list['accuracy']) / (result_list['all_tokens'] * result_list['avg_time']) * SCALING_FACTOR

df_ranked = result_list.sort_values(by='BPS', ascending=False)

print(df_ranked[['prompt_name', 'accuracy', 'f1_score', 'avg_time', 'all_tokens', 'BPS']].round(3).to_markdown(index=False))

| prompt_name       |   accuracy |   f1_score |   avg_time |   all_tokens |    BPS |
|:------------------|-----------:|-----------:|-----------:|-------------:|-------:|
| Expert Linguist   |       0.94 |      0.949 |      0.76  |        19060 | 61.624 |
| Rating Prediction |       0.9  |      0.912 |      0.756 |        18110 | 59.958 |
| Baseline Direct   |       0.92 |      0.931 |      0.967 |        17071 | 51.882 |
| Sarcasm Aware     |       0.78 |      0.784 |      0.836 |        19060 | 38.394 |
| CoT Reasoning     |       1    |      1     |      8.895 |        36440 |  3.085 |


The best one

In [6]:
expert_linguist = [
    {
        "role": "system",
        "content": "You are an expert computational linguist specializing in sentiment analysis of user reviews. "
                   "Analyze the nuances, tone, and context of the text below. "
                    "Even if the text contains mixed emotions, determine the predominant polarity. "
                   "Return ONLY the label: Positive or Negative.",
    },
    {
        "role": "user",
        "content": "Text: {text}\nLabel:",
    }
]