In [2]:
import os
import pandas as pd
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments



In [3]:
# Let's checkout the first few values
df = pd.read_csv('../data/twitter_training.csv')
df = df.head(500) # Just use subset of our data for now 

In [4]:
#Rename columns to something more intitive and drop the irrelevant ones 
df = df.drop(['Borderlands', '2401'], axis=1)
df = df.rename(columns={"Positive" : "label", "im getting on borderlands and i will murder you all ," : "Tweet"})

# Convert all tweets to strings
df['Tweet'] = df['Tweet'].astype(str)

df.head()

Unnamed: 0,label,Tweet
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [5]:
# Check the current labels
unique_tweets = df['label'].unique()
print(unique_tweets)

['Positive' 'Neutral' 'Negative' 'Irrelevant']


In [6]:
# Clean up labels and perform one hot encoding
df = df[df['label'] != 'Irrelevant']

sentiment_mapping = {'Positive': 2, 'Neutral': 1, 'Negative': 0}
df['label'] = df['label'].map(sentiment_mapping)

unique_tweets = df['label'].unique()
print(unique_tweets)

[2 1 0]


In [7]:
train, eval_set = train_test_split(df, test_size=0.75, random_state=8, stratify=df['label'])

# Load unto a Dataset
train_dataset = Dataset.from_pandas(train)
eval_dataset = Dataset.from_pandas(eval_set)

In [8]:
# Tokenize our tweets
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
def tokenize_data(examples):
    return tokenizer(examples["Tweet"], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_data, batched=True)
eval_dataset = eval_dataset.map(tokenize_data, batched=True)

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Map:   0%|          | 0/335 [00:00<?, ? examples/s]

In [9]:
# Load the pre-trained model
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch'
)


def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
    
# Create the Trainer with compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Evaluate the model before training
print("Accuracy before training:")
# trainer.evaluate()
scores = trainer.evaluate()
print(scores)
print(type(scores))

Accuracy before training:


{'eval_loss': 1.0883184671401978, 'eval_accuracy': 0.45671641791044776, 'eval_f1': 0.3284952839251947, 'eval_precision': 0.3505331901287606, 'eval_recall': 0.45671641791044776, 'eval_runtime': 3.7216, 'eval_samples_per_second': 90.015, 'eval_steps_per_second': 1.612}
<class 'dict'>


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [107]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.077195,0.492537,0.33819,0.550891,0.492537
2,1.078300,1.073568,0.489552,0.331955,0.550183,0.489552
3,1.074300,1.067926,0.489552,0.331317,0.549479,0.489552


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=21, training_loss=1.075917397226606, metrics={'train_runtime': 19.8235, 'train_samples_per_second': 16.798, 'train_steps_per_second': 1.059, 'total_flos': 11028107605248.0, 'train_loss': 1.075917397226606, 'epoch': 3.0})

In [1]:
# Evaluate the model after training
print("Accuracy after training:")
trainer.evaluate()
type(trainer.evaluate())

Accuracy after training:


NameError: name 'trainer' is not defined