In [None]:
!pip install -q datasets evaluate
!pip install openai

In [None]:
# Setting up dataset with same format as previous work

import numpy as np
import evaluate
from datasets import load_dataset


# Fetch SST-2 through the `datasets` library.
dataset = load_dataset("stanfordnlp/sst2")

print("Original dataset structure:", dataset, sep="\n")

# Build new splits out of the original train split only:
#   1. hold out 10% of train as the test set,
#   2. hold out 10% of what remains as the validation set.
# The same fixed seed is used for both splits so re-running the cell gives the same partition.
train_valid_test = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_valid = train_valid_test["train"].train_test_split(test_size=0.1, seed=42)

# Overwrite the three entries of the loaded dataset dict with the new partitions.
dataset["train"] = train_valid["train"]
dataset["validation"] = train_valid["test"]  # second hold-out -> validation
dataset["test"] = train_valid_test["test"]   # first hold-out -> test

# Report the size of each resulting split.
print(
    "\nDataset structure:",
    f"  Train: {len(dataset['train'])} samples",
    f"  Validation: {len(dataset['validation'])} samples",
    f"  Test: {len(dataset['test'])} samples",
    sep="\n",
)

In [None]:
# Set up the OpenAI client used for the API calls

from kaggle_secrets import UserSecretsClient
from openai import OpenAI
import json

# Read the API key from the Kaggle secrets store (saved under the label "openAIkey")
# so the key itself never appears in the notebook, then build the shared OpenAI client.
user_secrets = UserSecretsClient()
client = OpenAI(
    api_key=user_secrets.get_secret("openAIkey"),
)

In [None]:
def extract_sentiment_with_openai(phrase, model="gpt-5.1", api_client=None):
    """Classify the sentiment of ``phrase`` with an OpenAI chat model.

    The request forces the model to call a ``classify_sentiment`` tool whose
    single required argument, ``sentiment``, is declared in the tool schema as
    one of ``POSITIVE`` / ``NEGATIVE``. The tool call's JSON arguments are
    parsed and returned.

    Args:
        phrase: Text to classify.
        model: Name of the chat model to query. Defaults to ``"gpt-5.1"``.
        api_client: Object exposing ``chat.completions.create(...)``. When
            ``None`` (the default) the notebook-level ``client`` is used.

    Returns:
        dict: The parsed tool-call arguments; per the tool schema this is
        expected to look like ``{"sentiment": "POSITIVE"}``.

    Raises:
        ValueError: If the response contains no choice or no tool call.
        json.JSONDecodeError: If the tool-call arguments are not valid JSON.
        Exception: Any error raised by the API client is propagated unchanged.
    """
    # Fall back to the notebook-level client only when none was supplied.
    if api_client is None:
        api_client = client

    messages = [
        {
            "role": "system",
            "content": "You are a sentiment analysis expert. Classify the sentiment of the given text as either POSITIVE or NEGATIVE. Base your classification on the overall emotional tone of the text."
        },
        {
            "role": "user",
            "content": f"Classify the sentiment of the following phrase as POSITIVE or NEGATIVE. Phrase: {phrase}"
        }
    ]

    # Tool schema: a single function with one required enum-valued argument.
    function_definition = [
        {
            'type': 'function',
            'function': {
                'name': 'classify_sentiment',
                'description': 'Classify the sentiment of the input text as POSITIVE or NEGATIVE.',
                'parameters': {
                    'type': 'object',
                    'properties': {
                        'sentiment': {
                            'type': 'string',
                            'enum': ['POSITIVE', 'NEGATIVE'],
                            'description': 'The sentiment classification of the text'
                        }
                    },
                    'required': ['sentiment']
                }
            }
        }
    ]

    response = api_client.chat.completions.create(
        model=model,
        messages=messages,
        tools=function_definition,
        # Force the model to answer through the tool rather than with free text.
        tool_choice={"type": "function", "function": {"name": "classify_sentiment"}}
    )

    # Fail with a descriptive error instead of an opaque IndexError/TypeError
    # when the response carries no choice or no tool call.
    choices = response.choices
    tool_calls = choices[0].message.tool_calls if choices else None
    if not tool_calls:
        raise ValueError(
            f"Model response contained no tool call for phrase: {str(phrase)[:50]!r}"
        )

    return json.loads(tool_calls[0].function.arguments)

In [None]:
def evaluate_sst2(test_data, classify_fn=None):
    """Run a sentiment classifier over SST-2 style examples and score it.

    Args:
        test_data: Iterable of examples, each a mapping with the keys
            ``'idx'``, ``'sentence'`` and ``'label'`` (``0`` = NEGATIVE,
            ``1`` = POSITIVE). Any iterable works; it does not need ``len()``.
        classify_fn: Callable taking a sentence and returning a mapping with a
            ``'sentiment'`` entry. When ``None`` (the default),
            ``extract_sentiment_with_openai`` is used.

    Returns:
        tuple: ``(results, accuracy)`` where ``results`` is a list of dicts
        with keys ``idx``, ``sentence``, ``true_label`` and
        ``predicted_label``, and ``accuracy`` is the fraction of examples whose
        prediction equals the true label. ``accuracy`` is ``0.0`` when
        ``test_data`` is empty.

    Raises:
        KeyError: If an example lacks a required key or has a label other
            than ``0`` / ``1``.
    """
    # Resolve the default lazily so the function can be defined before the classifier.
    if classify_fn is None:
        classify_fn = extract_sentiment_with_openai

    label_map = {0: 'NEGATIVE', 1: 'POSITIVE'}
    results = []
    correct = 0

    for example in test_data:
        sentence = example['sentence']
        true_label = label_map[example['label']]

        try:
            predicted_label = classify_fn(sentence)['sentiment']
        except Exception as e:
            # Deliberately best-effort: one failed call must not abort the whole
            # run. The example is recorded as "ERROR", which never matches a
            # true label and therefore counts as incorrect.
            print(f"Error processing: {sentence[:50]}... - {e}")
            predicted_label = "ERROR"

        if predicted_label == true_label:
            correct += 1

        results.append({
            'idx': example['idx'],
            'sentence': sentence,
            'true_label': true_label,
            'predicted_label': predicted_label
        })

    # Divide by the number of processed examples: this supports iterables without
    # len() and avoids ZeroDivisionError on empty input.
    accuracy = correct / len(results) if results else 0.0
    return results, accuracy


In [None]:
# Score the classifier on the first 100 examples of the test split only;
# every example costs one classifier call, so the sample is kept small.
sample_indices = range(100)
test_sample = dataset["test"].select(sample_indices)
results, accuracy = evaluate_sst2(test_sample)


In [None]:
# Report the overall accuracy as a percentage, followed by the per-example records.
print(f"Accuracy: {accuracy:.2%}", results, sep="\n")

In [None]:
# Look up a single example by its `idx` value to inspect it by hand.
# NOTE(review): the evaluation cell samples dataset['test'], while this lookup
# searches dataset['validation'] — confirm this is the intended split.
row = dataset['validation'].filter(lambda example: example['idx'] == 44896)

# The filter can legitimately match nothing; report that instead of raising IndexError.
if len(row) > 0:
    print(row[0])
else:
    print("No example with idx 44896 found in the validation split")

### Testing
1. Model: GPT-5-mini. Sample size: 100 examples. Accuracy: 86%
2. Model: GPT-5.1. Sample size: 100 examples. Accuracy: 91%