In [None]:

# Install the Kaggle API token from the working directory into ~/.kaggle.
# Done in pure Python so a missing kaggle.json produces one clear message
# instead of a chain of failing shell commands.
import shutil
from pathlib import Path

kaggle_dir = Path.home() / ".kaggle"
kaggle_dir.mkdir(parents=True, exist_ok=True)

token_source = Path("kaggle.json")
token_target = kaggle_dir / "kaggle.json"

if token_source.is_file():
    shutil.copy(token_source, token_target)

if token_target.is_file():
    # Owner read/write only, matching the original `chmod 600`.
    token_target.chmod(0o600)
else:
    print(f"kaggle.json not found in {Path.cwd()} or {kaggle_dir}; skipping Kaggle credential setup.")

cp: kaggle.json: No such file or directory
chmod: /Users/juno_fung/.kaggle/kaggle.json: No such file or directory


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, 
    confusion_matrix, 
    precision_score, 
    recall_score, 
    f1_score,
    classification_report
)
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Pick the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
# The MPS probe only runs when CUDA is unavailable.
if torch.cuda.is_available():
    backend_name, banner = "cuda", "Using CUDA (NVIDIA GPU)"
elif torch.backends.mps.is_available():
    backend_name, banner = "mps", "Using MPS (Apple Silicon GPU)"
else:
    backend_name, banner = "cpu", "Using CPU"

device = torch.device(backend_name)
print(banner)

Using MPS (Apple Silicon GPU)


In [4]:
# Checkpoint identifier used later for both the tokenizer and the classifier.
MODEL_NAME = "Skywork/Skywork-Reward-V2-Qwen3-1.7B"

# Load the sentiment CSV; rows pandas cannot parse are skipped rather than raising.
DATA_PATH = "sentiment_analysis_dataset.csv"
txt = pd.read_csv(DATA_PATH, on_bad_lines='skip')

# Preview the first rows as the cell output.
txt.head()

Unnamed: 0,Comment,Sentiment
0,Achieving million views in days is dangerous,Positive
1,How many people here want to participate in su...,Neutral
2,Mrbeast is slowly turning into mrjigsaw,Negative
3,genuinely can't believe how dystopian this is,Negative
4,Have of the worlds smartest people compete in ...,Neutral


In [5]:
# Build the model inputs (comment text) and integer targets (sentiment class).
#
# Columns are selected by name rather than by position: the CSV preview shows
# "Comment" and "Sentiment" columns, and positional indexing would silently pick
# the wrong ones if the file's unnamed index column is loaded as data.
# Rows missing either field are dropped so that NaN is neither stringified into
# the text "nan" nor passed to the label encoder.
samples = txt.dropna(subset=["Comment", "Sentiment"])

text = samples["Comment"].astype("str").tolist()
labels = samples["Sentiment"].str.strip().tolist()

# LabelEncoder maps each distinct class name to an integer id (sorted class order).
encoder = LabelEncoder()
labels = encoder.fit_transform(labels)

In [6]:
# Hold out 20% of the samples for evaluation. The split is seeded for
# reproducibility and stratified so both parts keep the class proportions.
split_options = dict(test_size=0.2, random_state=42, stratify=labels)
X_train, X_test, y_train, y_test = train_test_split(text, labels, **split_options)

In [7]:
# Load the tokenizer and a 3-way sequence-classification model from MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# ignore_mismatched_sizes lets weights whose shapes differ from the requested
# 3-label configuration be re-initialised instead of raising on load.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels = 3,
    ignore_mismatched_sizes=True
)

# Padded batches need a pad token on both the tokenizer and the model config.
# NOTE(review): this checkpoint may already define one; these guards only fill
# it in when it is missing.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

model.to(device)

# Cap the sequence length. Without an explicit max_length, truncation falls back
# to the tokenizer's model maximum and padding=True pads every example to the
# longest comment in the split, which inflates memory and tokenization time.
MAX_LENGTH = 256

train_encodings = tokenizer(
    X_train,
    padding=True,
    truncation=True,
    max_length=MAX_LENGTH,
    return_tensors='pt'
)
test_encodings = tokenizer(
    X_test,
    padding=True,
    truncation=True,
    max_length=MAX_LENGTH,
    return_tensors='pt'
)

Cancellation requested; stopping current tasks.


KeyboardInterrupt: 

In [None]:
# Training hyperparameters, kept in one place so they are easy to tune.
BATCH_SIZE = 2
LEARNING_RATE = 3e-5


def _make_loader(encodings, targets, shuffle):
    """Wrap tokenizer output and class ids in a TensorDataset and DataLoader.

    Args:
        encodings: tokenizer output providing 'input_ids' and 'attention_mask' tensors.
        targets: integer class ids, one per encoded example.
        shuffle: whether the loader reshuffles examples every epoch.

    Returns:
        (label_tensor, dataset, loader) for the given split.
    """
    # CrossEntropyLoss expects integer class targets; make the dtype explicit.
    label_tensor = torch.tensor(targets, dtype=torch.long)
    tensor_dataset = TensorDataset(
        encodings['input_ids'],
        encodings['attention_mask'],
        label_tensor
    )
    loader = DataLoader(tensor_dataset, batch_size=BATCH_SIZE, shuffle=shuffle)
    return label_tensor, tensor_dataset, loader


# Only the training loader is shuffled; evaluation keeps the original order so
# predictions line up with y_test.
train_labels, dataset, dataloader = _make_loader(train_encodings, y_train, shuffle=True)
test_labels, test_dataset, test_dataloader = _make_loader(test_encodings, y_test, shuffle=False)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

# Linear decay of the learning rate to zero over all optimizer steps, no warmup.
num_epochs = 2
total_step = len(dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_step
)

In [None]:
# Fine-tune for `num_epochs`. After each epoch, report training metrics and then
# loss/accuracy on the batches of `test_dataloader`.
for epoch in range(num_epochs):
    model.train()
    print(f'Epoch{epoch + 1}/{num_epochs}')
    total_train_loss = 0.0
    total_train_correct = 0
    total_train_samples = 0

    for b_input_ids, b_mask, b_labels in dataloader:
        b_input_ids = b_input_ids.to(device)
        b_mask = b_mask.to(device)
        b_labels = b_labels.to(device)

        optimizer.zero_grad()
        # Labels are passed in so the loss is read straight from the model output.
        outputs = model(
            b_input_ids,
            attention_mask = b_mask,
            labels = b_labels
        )

        loss = outputs.loss
        total_train_loss += loss.item()
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        total_train_correct += (preds == b_labels).sum().item()
        total_train_samples += b_labels.size(0)

        loss.backward()
        optimizer.step()
        # Advance the learning-rate schedule once per optimizer step; without
        # this call the scheduler built earlier never changes the learning rate.
        scheduler.step()

    avg_train_loss = total_train_loss / len(dataloader)
    train_accuracy = total_train_correct / total_train_samples

    print(f"Epoch {epoch + 1} complete. Avg Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")

    model.eval()
    total_val_loss = 0.0
    total_val_correct = 0
    total_val_samples = 0

    with torch.no_grad():
        for b_input_ids, b_mask, b_labels in test_dataloader:
            b_input_ids = b_input_ids.to(device)
            b_mask = b_mask.to(device)
            b_labels = b_labels.to(device)
            outputs = model(
                b_input_ids,
                attention_mask = b_mask
            )
            logits = outputs.logits
            loss = criterion(logits, b_labels)
            # Accumulate a Python float (as the training branch does) rather
            # than a device tensor.
            total_val_loss += loss.item()
            preds = torch.argmax(logits, dim=1)
            total_val_correct += (preds == b_labels).sum().item()
            total_val_samples += b_labels.size(0)

        avg_test_loss = total_val_loss / len(test_dataloader)
        test_accuracy = total_val_correct / total_val_samples
        print(f"Epoch {epoch + 1} complete. Avg Loss: {avg_test_loss:.4f}, Validation Accuracy: {test_accuracy:.4f}")
        

Epoch1/2


In [None]:
# Run the model over the evaluation loader and collect one tensor of predicted
# class ids per batch. The tensors are concatenated in a later cell.
model.eval()
y_pred = []

with torch.no_grad():
    for batch_ids, batch_mask, batch_labels in test_dataloader:
        batch_logits = model(
            batch_ids.to(device),
            attention_mask=batch_mask.to(device)
        ).logits
        # The highest-scoring class along dim=1 is the prediction for each example.
        y_pred.append(batch_logits.argmax(dim=1))


In [None]:
# Merge the per-batch prediction tensors into one flat NumPy array on the CPU.
# Note: this rebinds y_pred from a list of tensors to an array, so the cell
# cannot be re-run without first re-running the prediction cell.
y_pred = torch.cat(y_pred).flatten().cpu().detach().numpy()

In [None]:
# Overall accuracy plus macro-averaged precision/recall/F1 on the held-out split.
macro = dict(average='macro')
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **macro)
recall = recall_score(y_test, y_pred, **macro)
f1 = f1_score(y_test, y_pred, **macro)

print("--- Performance ---")
summary_rows = (
    ("Accuracy:  ", accuracy),
    ("Precision: ", precision),
    ("Recall:    ", recall),
    ("F1 Score:  ", f1),
)
for row_label, row_value in summary_rows:
    print(f"{row_label}{row_value:.4f}")
print("\n")

# Per-class breakdown; display names are listed in class-id order 0, 1, 2.
class_names = ['Negative (0)', 'Neutral (1)', 'Positive (2)']
print("--- Classification Report ---")
print(classification_report(y_test, y_pred, target_names=class_names))
print("\n")

In [None]:
# Classify a single hand-written example with the fine-tuned model.
model.eval()

# NOTE(review): the sample is an empty-string placeholder; presumably it should
# be replaced with the text to classify. Confirm how the tokenizer handles "".
test_text = [""]
inputs = tokenizer(test_text, padding=True, truncation=True, return_tensors='pt').to(device)

with torch.no_grad():
    outputs = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])

# Raw per-class scores, followed by the index of the highest-scoring class.
logits = outputs.logits
print(logits)

prediction = logits.argmax(dim=1)
print(f"Prediction: {prediction.item()} (0=Neg, 1=Neu, 2=Pos)")