In [1]:
import pandas as pd
import numpy as np
import re
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback

In [3]:
!pip install -U transformers


Collecting transformers
  Downloading transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.56.2-py3-none-any.whl (11.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.56.1
    Uninstalling transformers-4.56.1:
      Successfully uninstalled transformers-4.56.1
Successfully installed transformers-4.56.2


In [6]:
# Folder holding the train.csv / dev.csv / test.csv splits read by the cells below
# (a relative path, so it is resolved against the current working directory).
DATA_DIR = "sentiment"


In [7]:
import os
import pandas as pd

# Read the three CSV splits from DATA_DIR, in train / dev / test order.
train, dev, test = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ("train.csv", "dev.csv", "test.csv")
)



In [8]:
def clean_text(s):
    """Normalise one raw text before it is fed to the sentiment models.

    Steps, in order:
      1. lower-case the text;
      2. mask URLs as ``<url>`` and ``#hashtags`` / ``@mentions`` as ``<tag>``;
      3. glue a negation word (not / no / never) onto the following word as
         ``not_<word>``;
      4. squeeze any letter repeated three or more times down to two;
      5. collapse whitespace runs to single spaces and strip both ends.

    Masking deliberately happens before the negation step. If negations were merged
    first, a URL or tag ending in a negation word would absorb the next word
    ("#never again" -> "#not_again" -> "<tag>"), and a negation in front of a URL
    would leave a dangling "not_" behind.

    Args:
        s: the raw text. Anything that is not a ``str`` (e.g. a NaN cell) is
            treated as empty.

    Returns:
        The cleaned string, or ``""`` for non-string input.
    """
    if not isinstance(s, str):
        return ""
    s = s.lower()
    # Mask first so the negation rule below only ever sees ordinary words.
    s = re.sub(r"https?://\S+", " <url> ", s)
    s = re.sub(r"[#@]\w+", " <tag> ", s)
    s = re.sub(r"\b(?:not|no|never)\s+(\w+)", r"not_\1", s)
    # "sooooo" -> "soo": keep a hint of the elongation without exploding the vocabulary.
    s = re.sub(r"([a-z])\1{2,}", r"\1\1", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def plot_confusion(y_true, y_pred):
    """Print the confusion matrix of ``y_pred`` against ``y_true`` under a header line.

    Despite the name nothing is drawn: the matrix is written to stdout as text.
    Returns ``None``.
    """
    matrix = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:", matrix, sep="\n")


In [9]:
# Features: every split's text column goes through the same clean_text normalisation.
X_train = train['text'].map(clean_text)
X_dev = dev['text'].map(clean_text)
X_test = test['text'].map(clean_text)

# Targets: labels cast to int (only train and dev labels are read here).
y_train = train['label'].astype(int)
y_dev = dev['label'].astype(int)


In [10]:
# Baseline: unigram + bigram TF-IDF features into a class-balanced logistic regression.
tfidf_step = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.95)
logreg_step = LogisticRegression(max_iter=200, class_weight='balanced')
baseline = Pipeline(steps=[('tfidf', tfidf_step), ('clf', logreg_step)])

# Fit on the training split only, then score the held-out dev split.
y_pred_dev = baseline.fit(X_train, y_train).predict(X_dev)

print("Baseline Accuracy:", accuracy_score(y_dev, y_pred_dev))
print("Baseline F1:", f1_score(y_dev, y_pred_dev))
plot_confusion(y_dev, y_pred_dev)

Baseline Accuracy: 0.75
Baseline F1: 0.76
Confusion Matrix:
[[17  7]
 [ 5 19]]


In [11]:
import os

# Set before the Trainer is built: switches the Weights & Biases integration off.
os.environ.update(WANDB_DISABLED="true")


In [12]:
from sklearn.metrics import accuracy_score, f1_score




In [13]:
!pip install -U transformers datasets accelerate


Collecting datasets
  Downloading datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading datasets-4.1.1-py3-none-any.whl (503 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.6/503.6 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (42.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, datasets
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 18.1.0
    Uninstalling pyarrow-18.1.0:
      Successfully uninstalled pyarrow-18.1.0
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
[31mERROR: pip's dependency resolver doe

In [14]:
# Checkpoint name shared by the tokenizer and the classification model loaded below.
MODEL_NAME = "distilbert-base-uncased"
# Module-level tokenizer: TextDataset.__getitem__ and the Trainer's collator both read it.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, at access time.

    Tokenization uses the module-level ``tokenizer``, which must exist before any
    item is read.

    Args:
        texts: iterable of strings; materialised into a list.
        labels: optional iterable of integer-like labels aligned with ``texts``.
            When omitted (inference), items carry no ``labels`` entry.
        max_len: each item is truncated / padded to this many tokens.

    Raises:
        ValueError: if ``labels`` is given but its length differs from ``texts``.
    """

    def __init__(self, texts, labels=None, max_len=128):
        self.texts = list(texts)
        self.labels = list(labels) if labels is not None else None
        if self.labels is not None and len(self.labels) != len(self.texts):
            # Fail here rather than with an IndexError (or silently ignored labels) later.
            raise ValueError(
                f"texts and labels differ in length: {len(self.texts)} vs {len(self.labels)}"
            )
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded fields of item ``idx`` as 1-D tensors, plus ``labels`` if known."""
        enc = tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        # The tokenizer returns a batch of one; drop only that leading batch axis.
        # A bare squeeze() would also collapse the sequence axis whenever it has length 1.
        item = {k: v.squeeze(0) for k, v in enc.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return item

# Two-class sequence-classification model built from the MODEL_NAME checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Cleaned text + integer labels wrapped for the Trainer.
train_ds = TextDataset(X_train, y_train)
dev_ds = TextDataset(X_dev, y_dev)

training_config = dict(
    output_dir="./results",
    seed=42,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=15,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    # Evaluate and checkpoint once per epoch so the best epoch can be restored at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # "Best" means highest dev F1; early stopping watches the same metric.
    metric_for_best_model="f1",
    greater_is_better=True,
    # Logging
    logging_steps=20,
    report_to=[],  # no experiment-tracker integrations (keeps wandb off)
)
args = TrainingArguments(**training_config)

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: object exposing ``predictions`` (one row of per-class scores per
            example) and ``label_ids`` (the reference labels).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``, computed from the arg-max
        class of each prediction row.
    """
    gold = eval_pred.label_ids
    predicted = np.argmax(eval_pred.predictions, axis=1)
    scores = {}
    scores["accuracy"] = accuracy_score(gold, predicted)
    scores["f1"] = f1_score(gold, predicted)
    return scores

# Fine-tune with per-epoch evaluation on the dev split; training stops early once the
# monitored metric has failed to improve for 2 consecutive evaluations.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    # `tokenizer=` is deprecated on Trainer in favour of `processing_class=`; since this
    # notebook installs transformers with -U, the old keyword would eventually stop working.
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer.train()
# args sets load_best_model_at_end=True, so this scores the restored best checkpoint,
# not necessarily the last epoch that ran.
eval_results = trainer.evaluate()
print("Transformer Dev Results:", eval_results)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.687,0.688646,0.5,0.666667
2,0.6313,0.416494,0.854167,0.857143
3,0.2186,0.089741,0.979167,0.979592
4,0.0264,0.03739,0.979167,0.979592
5,0.0053,0.071993,0.979167,0.979592


Transformer Dev Results: {'eval_loss': 0.08974123746156693, 'eval_accuracy': 0.9791666666666666, 'eval_f1': 0.9795918367346939, 'eval_runtime': 0.2473, 'eval_samples_per_second': 194.111, 'eval_steps_per_second': 32.352, 'epoch': 5.0}


In [15]:
# 1. Load test data
import pandas as pd
from torch.utils.data import DataLoader
import os

DATA_DIR = "sentiment"

test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
test_texts = test_df['text']
# The model was fine-tuned on clean_text() output, so inference has to see the same
# normalisation. clean_text also turns non-string cells (e.g. NaN) into "", which
# keeps them from reaching the tokenizer.
test_ds = TextDataset(test_texts.apply(clean_text), labels=None)

# 2. Make predictions
predictions = trainer.predict(test_ds)
pred_labels = np.argmax(predictions.predictions, axis=1)

# 3. Save predictions to CSV (the original, uncleaned text is written for readability)
output_df = pd.DataFrame({
    'id': test_df.index,
    'text': test_texts,
    'prediction': pred_labels
})

output_df.to_csv("sentiment_test_predictions.csv", index=False)
print("Predictions saved to sentiment_test_predictions.csv")

Predictions saved to sentiment_test_predictions.csv


In [16]:
from sklearn.metrics import precision_score, confusion_matrix
import numpy as np

# Score the dev split with the trained transformer and take the arg-max class per row.
dev_predictions = trainer.predict(dev_ds)
y_pred_dev_transformer = np.argmax(dev_predictions.predictions, axis=1)

# Metrics against the dev labels.
cm = confusion_matrix(y_dev, y_pred_dev_transformer)
precision = precision_score(y_dev, y_pred_dev_transformer)
f1 = f1_score(y_dev, y_pred_dev_transformer)
accuracy = accuracy_score(y_dev, y_pred_dev_transformer)

# Report: header and scalar metrics first, then the matrix itself.
report_lines = [
    "Transformer Model Evaluation on Dev Set:",
    f"Accuracy: {accuracy:.4f}",
    f"F1 Score: {f1:.4f}",
    f"Precision: {precision:.4f}",
    "Confusion Matrix:",
]
print("\n".join(report_lines))
print(cm)

Transformer Model Evaluation on Dev Set:
Accuracy: 0.9792
F1 Score: 0.9796
Precision: 0.9600
Confusion Matrix:
[[23  1]
 [ 0 24]]
