In [9]:
import glob
from tqdm import tqdm
import os
import pandas as pd

# Root folder of the WASSA-2017 data; the train and test splits each have
# their own subfolder beneath it.
DATA_ROOT = 'data/WASSA-2017'
TRAIN_DIR = DATA_ROOT + '/train'
TEST_DIR = DATA_ROOT + '/test'

# Glob pattern used to pick the data files inside a split folder.
FILE_PATTERN = '*.txt'
# Character that separates the emotion label from the rest of a file's name.
LABEL_SEPERATOR = '-'

def get_label_from_filename(filename, seperator=LABEL_SEPERATOR):
    """Derive the lower-cased label encoded at the start of a file name.

    Directory components of ``filename`` are ignored. The label is the part
    of the base name that precedes the first occurrence of ``seperator``; if
    the separator does not occur, the whole base name is returned.
    """
    name_only = os.path.basename(filename)
    # Only the leading field is needed, so a single split is enough.
    prefix = name_only.split(seperator, 1)[0]
    return prefix.lower()

def load_data(data_dir, pattern, seperator):
    """Read every matching tab-separated file in a folder into one DataFrame.

    Each file is parsed with ``pd.read_csv`` (tab-separated, first row as
    header). The emotion label is taken from the file's name via
    ``get_label_from_filename`` and stored in an ``emotion`` column; only the
    ``tweet`` and ``emotion`` columns are kept.

    Args:
        data_dir: Folder that contains the data files.
        pattern: Glob pattern selecting the files inside ``data_dir``.
        seperator: Separator passed on to ``get_label_from_filename``.

    Returns:
        A DataFrame with columns ``tweet`` and ``emotion`` and a fresh
        0..n-1 index, holding the rows of all files that loaded successfully.

    Raises:
        FileNotFoundError: If no file in ``data_dir`` matches ``pattern``.
        ValueError: If files matched but none of them could be loaded.
    """
    # glob returns matches in arbitrary, filesystem-dependent order. Sorting
    # keeps the row order of the result (and anything derived from it, such as
    # label ids assigned by first appearance) the same on every machine.
    all_files = sorted(glob.glob(os.path.join(data_dir, pattern)))
    if not all_files:
        raise FileNotFoundError(f"No files found matching '{pattern}' in directory {data_dir}")

    df_list = []
    print(f"loading files from {data_dir}")
    for filepath in tqdm(all_files, desc="Reading files"):
        try:
            temp_df = pd.read_csv(filepath, sep='\t', header=0)
            label = get_label_from_filename(filepath, seperator)
            temp_df['emotion'] = label
            df_list.append(temp_df[['tweet', 'emotion']])

        except Exception as e:
            # Best effort: report the broken file and keep loading the rest.
            print(f"Error processing file {filepath}: {e}")
            continue

    if not df_list:
        raise ValueError(f"No dataframes were created from files in {data_dir}")

    combined_df = pd.concat(df_list, ignore_index=True)
    print(f"Loaded and combined {len(combined_df)} samples")
    print(f"Found emotions: {combined_df['emotion'].unique().tolist()}")
    return combined_df

In [10]:
# Read both splits with the same file pattern and label separator.
train_df = load_data(TRAIN_DIR, FILE_PATTERN, LABEL_SEPERATOR)
test_df = load_data(TEST_DIR, FILE_PATTERN, LABEL_SEPERATOR)

loading files from data/WASSA-2017/train


Reading files: 100%|██████████| 4/4 [00:00<00:00, 154.26it/s]


Loaded and combined 3613 samples
Found emotions: ['anger', 'fear', 'joy', 'sadness']
loading files from data/WASSA-2017/test


Reading files: 100%|██████████| 4/4 [00:00<00:00, 167.15it/s]

Loaded and combined 3142 samples
Found emotions: ['anger', 'fear', 'joy', 'sadness']





In [11]:
from transformers import AutoTokenizer, ConvBertForSequenceClassification

# Identifier of the pretrained ConvBERT checkpoint whose tokenizer is loaded.
MODEL_CHECKPOINT = "YituTech/conv-bert-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [12]:
# Give every emotion a stable integer id. The label names are sorted first so
# the mapping does not depend on the row order of train_df, which follows the
# order in which the data files happened to be read.
label_map = {label: i for i, label in enumerate(sorted(train_df['emotion'].unique()))}
print(f"Label mapping: {label_map}")

Label mapping: {'anger': 0, 'fear': 1, 'joy': 2, 'sadness': 3}


In [13]:
import torch
from torch.utils.data import Dataset

class EmotionDataset(Dataset):
    """Torch dataset that tokenizes tweets on access and attaches emotion ids.

    Each item is a dict holding the tokenizer's output tensors with the batch
    dimension removed, plus a ``labels`` entry with the integer class id.
    """

    def __init__(self, dataframe, tokenizer, max_length=128, label2id=None,
                 padding='max_length'):
        """Store the tweets, their numeric labels and the tokenizer settings.

        Args:
            dataframe: DataFrame with a ``tweet`` and an ``emotion`` column.
            tokenizer: Tokenizer called as ``tokenizer(text, **kwargs)``.
            max_length: Maximum sequence length passed to the tokenizer.
            label2id: Optional mapping from emotion name to integer id. When
                omitted, the notebook-level ``label_map`` is used, which is
                the behaviour this class always had.
            padding: Padding strategy forwarded to the tokenizer. The default
                ``'max_length'`` pads every item to ``max_length``; pass
                ``False`` to leave items unpadded so that a padding collator
                can pad each batch to its own longest sequence.

        Raises:
            KeyError: If the dataframe contains an emotion that is missing
                from the label mapping.
        """
        if label2id is None:
            # Fall back to the mapping built from the training split.
            label2id = label_map

        emotions = dataframe['emotion'].tolist()
        # Fail with a message that names the offending labels instead of a
        # bare KeyError raised from inside the comprehension below.
        unknown = sorted(set(emotions) - set(label2id), key=str)
        if unknown:
            raise KeyError(f"Emotion labels missing from the label mapping: {unknown}")

        self.tweets = dataframe['tweet'].tolist()
        # Convert string labels to numeric ids using the mapping
        self.labels = [label2id[emotion] for emotion in emotions]
        self.label2id = label2id
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.padding = padding

    def __len__(self):
        """Return the number of tweets in the dataset."""
        return len(self.tweets)

    def __getitem__(self, idx):
        """Tokenize the tweet at ``idx`` and return its tensors and label."""
        tweet = self.tweets[idx]
        label = self.labels[idx]

        # Tokenize the tweet
        encoding = self.tokenizer(
            tweet,
            add_special_tokens=True,
            max_length=self.max_length,
            padding=self.padding,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'  # Return PyTorch tensors
        )

        # Remove the batch dimension that the tokenizer adds for a single text
        encoding = {key: val.squeeze(0) for key, val in encoding.items()}

        # Add the label
        encoding['labels'] = torch.tensor(label, dtype=torch.long)

        return encoding

In [14]:
# Wrap each split in a dataset that tokenizes a tweet when it is indexed.
trainset = EmotionDataset(dataframe=train_df, tokenizer=tokenizer)
testset = EmotionDataset(dataframe=test_df, tokenizer=tokenizer)

In [15]:
from transformers import DataCollatorWithPadding
# Batch collator built around the same tokenizer that encodes the tweets.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [31]:
import evaluate
import numpy as np

# Accuracy metric loaded through the `evaluate` library; used by compute_metrics.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. The predictions are reduced
            with an argmax over axis 1 — presumably per-class scores of shape
            (n_samples, n_classes); confirm against the model output.

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids against
        the reference labels.
    """
    scores, references = eval_pred
    # Pick the highest-scoring class for every sample.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [32]:
# Load the pretrained ConvBERT checkpoint for sequence classification, with
# one output label per entry of label_map.
model = ConvBertForSequenceClassification.from_pretrained("YituTech/conv-bert-base", num_labels=len(label_map))

Some weights of ConvBertForSequenceClassification were not initialized from the model checkpoint at YituTech/conv-bert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration: evaluate and checkpoint once per epoch, then
# reload the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    # Optimisation
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batching
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation / checkpointing
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# NOTE(review): eval_dataset is the test split while load_best_model_at_end is
# enabled, so checkpoint selection is driven by test data. Consider holding
# out a validation split for this and scoring the test split once at the end.
trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=trainset,
    eval_dataset=testset,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 