In [89]:
import os
import pandas as pd
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [83]:
# Let's checkout the first few values
df = pd.read_csv('../data/twitter_training.csv')
df = df.head(500) # Just use subset of our data for now 

In [84]:
#Rename columns to something more intitive and drop the irrelevant ones 
df = df.drop(['Borderlands', '2401'], axis=1)
df = df.rename(columns={"Positive" : "label", "im getting on borderlands and i will murder you all ," : "Tweet"})

# Convert all tweets to strings
df['Tweet'] = df['Tweet'].astype(str)

df.head()

Unnamed: 0,label,Tweet
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [85]:
# Check the current labels
unique_tweets = df['label'].unique()
print(unique_tweets)

['Positive' 'Neutral' 'Negative' 'Irrelevant']


In [86]:
# Clean up labels and perform one hot encoding
df = df[df['label'] != 'Irrelevant']

sentiment_mapping = {'Positive': 2, 'Neutral': 1, 'Negative': 0}
df['label'] = df['label'].map(sentiment_mapping)

unique_tweets = df['label'].unique()
print(unique_tweets)

[2 1 0]


In [87]:
train, eval_set = train_test_split(df, test_size=0.75, random_state=8, stratify=df['label'])

# Load unto a Dataset
train_dataset = Dataset.from_pandas(train)
train_dataset = Dataset.from_pandas(eval_set)

In [88]:
# Tokenize our tweets
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
def tokenize_data(examples):
    return tokenizer(examples["Tweet"], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_data, batched=True)
eval_dataset = eval_dataset.map(tokenize_data, batched=True)

Map:   0%|          | 0/335 [00:00<?, ? examples/s]

Map:   0%|          | 0/112 [00:00<?, ? examples/s]

In [90]:
# Load the pre-trained model
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch'
)


def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
    
# Create the Trainer with compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [91]:
# Evaluate the model before training
print("Accuracy before training:")
trainer.evaluate()

Accuracy before training:


{'eval_loss': 1.0926108360290527,
 'eval_accuracy': 0.375,
 'eval_f1': 0.2832560296846011,
 'eval_precision': 0.53385955831608,
 'eval_recall': 0.375,
 'eval_runtime': 1.413,
 'eval_samples_per_second': 79.264,
 'eval_steps_per_second': 1.415}

In [92]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.0917,1.082129,0.464286,0.330064,0.531542,0.464286
2,1.0593,1.056853,0.428571,0.257143,0.183673,0.428571
3,1.0324,1.013924,0.464286,0.328362,0.529762,0.464286


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=63, training_loss=1.0640824068160284, metrics={'train_runtime': 33.7471, 'train_samples_per_second': 29.78, 'train_steps_per_second': 1.867, 'total_flos': 33283027457280.0, 'train_loss': 1.0640824068160284, 'epoch': 3.0})

In [93]:
# Evaluate the model after training
print("Accuracy after training:")
trainer.evaluate()

Accuracy after training:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.013924479484558,
 'eval_accuracy': 0.4642857142857143,
 'eval_f1': 0.32836211407639976,
 'eval_precision': 0.5297619047619048,
 'eval_recall': 0.4642857142857143,
 'eval_runtime': 0.9789,
 'eval_samples_per_second': 114.415,
 'eval_steps_per_second': 2.043,
 'epoch': 3.0}