In [None]:
import os
import pandas as pd
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [None]:
# Load the training CSV and preview its first five rows.
train_csv_path = 'data/Train.csv'
df = pd.read_csv(train_csv_path)
df.head(n=5)

In [None]:
# Count how often each distinct label value occurs.
# The original author noted an unexpected 0.67 label in this output.
label_counts = df['label'].value_counts()
label_counts

In [None]:
# Keep only the rows whose label is one of the three expected codes
# (-1, 0, 1), then re-display the counts to confirm the stray value is gone.
valid_labels = [0, 1, -1]
has_valid_label = df['label'].isin(valid_labels)
df = df[has_valid_label]
df['label'].value_counts()

In [None]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# label column and uses a fixed random_state so it is repeatable.
# NOTE(review): `eval` shadows Python's built-in eval(); the name is kept
# because a later cell reads it.
split_options = dict(test_size=0.2, random_state=42, stratify=df['label'])
train, eval = train_test_split(df, **split_options)
print(f"new dataframe shapes: train is {train.shape}, eval is {eval.shape}")

In [None]:
# Convert each pandas DataFrame into a Hugging Face Dataset.
# After filtering and shuffling, the frames carry a non-default index that
# Dataset.from_pandas would otherwise store as an extra
# `__index_level_0__` column; preserve_index=False leaves it out.
train_dataset = Dataset.from_pandas(train, preserve_index=False)
eval_dataset = Dataset.from_pandas(eval, preserve_index=False)

In [None]:
# Convert labels
def transform_labels(example):
    """Map a raw sentiment label to a zero-based class id.

    The mapping is -1 (negative) -> 0, 0 (neutral) -> 1, 1 (positive) -> 2.
    Float labels such as -1.0 map the same way as their integer equivalents.

    Args:
        example: Mapping with a 'label' entry holding the raw label.

    Returns:
        A dict ``{'labels': class_id}``.

    Raises:
        ValueError: If the label is not one of -1, 0 or 1.
    """
    label_to_id = {-1: 0, 0: 1, 1: 2}
    label = example['label']
    try:
        num = label_to_id[label]
    except (KeyError, TypeError):
        # Fail with a clear message instead of an unbound local variable.
        raise ValueError(
            f"Unexpected label {label!r}; expected one of {sorted(label_to_id)}"
        ) from None
    return {'labels': num}

# Add the remapped `labels` column and drop the columns no longer needed.
# The raw `label` column is dropped too, so that only the remapped `labels`
# column can be picked up as the training target. The mapping function still
# receives `label` as input: remove_columns only affects the output.
columns_to_drop = ['tweet_id', 'agreement', 'label']
train_dataset = train_dataset.map(transform_labels, remove_columns=columns_to_drop)
eval_dataset = eval_dataset.map(transform_labels, remove_columns=columns_to_drop)

In [None]:
# Load the tokenizer that matches the DistilBERT checkpoint used below.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize_data(example):
    """Tokenize the 'safe_text' field of an example (or batch of examples).

    Sequences are padded with padding='max_length' and truncated with
    truncation=True, so every encoded sequence has the same length.

    Args:
        example: Mapping with a 'safe_text' entry.

    Returns:
        The tokenizer's output for that text.
    """
    encoded = tokenizer(
        example['safe_text'],
        truncation=True,
        padding='max_length',
    )
    return encoded

# Tokenize both splits in batched mode, train first and then eval.
train_dataset, eval_dataset = (
    split.map(tokenize_data, batched=True)
    for split in (train_dataset, eval_dataset)
)

In [None]:
# Load the pre-trained checkpoint with a 3-class classification head.
checkpoint_name = 'distilbert-base-uncased'
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name, num_labels=3)

# Define training arguments
# The "evaluate every epoch" option was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and the old keyword is
# rejected once it has been removed. Use whichever name the installed
# version declares so the cell runs on both.
strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in getattr(TrainingArguments, '__dataclass_fields__', {})
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    **{strategy_key: 'epoch'},
)

# Metric function handed to the Trainer so that evaluate() reports accuracy
# in addition to the loss.
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: Object exposing `predictions` (per-class scores) and
            `label_ids` (true class ids), as supplied by the Trainer.

    Returns:
        A dict ``{'accuracy': value}`` with the fraction of correct predictions.
    """
    import numpy as np  # local import: numpy is not imported at the top of this notebook

    logits = eval_pred.predictions
    if isinstance(logits, tuple):
        # Some models return extra outputs; the class scores come first.
        logits = logits[0]
    predicted = np.argmax(logits, axis=-1)
    return {'accuracy': float((predicted == eval_pred.label_ids).mean())}


# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate the model before training to get a baseline.
# evaluate() returns a dict of metrics (loss, runtime, ...), not a bare
# accuracy value, so the heading describes it as metrics.
print("Evaluation metrics before training:")
metrics_before = trainer.evaluate()
metrics_before

In [None]:
# Fine-tune the model; the returned training summary is shown as the cell output.
train_output = trainer.train()
train_output

In [None]:
# Evaluate the fine-tuned model on the held-out split.
# evaluate() returns a dict of metrics (loss, runtime, ...), not a bare
# accuracy value, so the heading describes it as metrics.
print("Evaluation metrics after training:")
metrics_after = trainer.evaluate()
metrics_after