In [80]:
!pip install transformers



In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
import torch

# Input files: the tab-separated paragraph corpus and the CSVs that list the
# paragraph ids (`par_id`) belonging to the train and dev splits.
file_path = "train_data/dontpatronizeme_pcl.tsv"
train_filepath = "dev_data/train_semeval_parids-labels.csv"
dev_filepath = "dev_data/dev_semeval_parids-labels.csv"

# Load the corpus with explicit column names, then drop rows without paragraph text.
df = pd.read_csv(
    file_path,
    sep='\t',
    header=2,
    names=['id', 'paragraph-id', 'keyword', 'countrycode', "paragraph", "label"],
)
df_filtered = df.dropna(subset=['paragraph'])

train_df = pd.read_csv(train_filepath)
dev_df = pd.read_csv(dev_filepath)

# Keep only the rows whose id is listed in the corresponding split file.
train_data = df_filtered.loc[df_filtered['id'].isin(train_df['par_id'])]
dev_data = df_filtered.loc[df_filtered['id'].isin(dev_df['par_id'])]

# Shuffle each split with a fixed seed so the row order is reproducible.
train_data_shuffled = shuffle(train_data, random_state=42)
dev_data_shuffled = shuffle(dev_data, random_state=42)

# Model inputs: the raw paragraph strings as numpy arrays.
X_train = train_data_shuffled['paragraph'].to_numpy()
X_dev = dev_data_shuffled['paragraph'].to_numpy()

# Binary targets: labels 0 and 1 map to class 0, anything above 1 maps to class 1.
y_train = [int(int(label) > 1) for label in train_data_shuffled['label']]
y_dev = [int(int(label) > 1) for label in dev_data_shuffled['label']]

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Display the shuffled training rows (rendered as a DataFrame preview).
train_data_shuffled

Unnamed: 0,id,paragraph-id,keyword,countrycode,paragraph,label
4822,4823,@@8781228,disabled,gb,"As well as lying about helping flood victims ,...",0
8324,8325,@@14942580,vulnerable,ng,Betty Abah is passionate about this initiative...,4
2383,2384,@@7600715,in-need,sg,""" He liked to help people so I thought this co...",1
4288,4289,@@8869471,vulnerable,ng,""" The airlines are relatively small , weak and...",0
5507,5508,@@23720891,refugee,ie,"In general , people live inside their own bubb...",3
...,...,...,...,...,...,...
5846,5847,@@19919480,homeless,my,"Last year , a record 85 homes were demolished ...",0
5290,5291,@@21695353,homeless,pk,""" As a country , we can look for the missed op...",0
5491,5492,@@14069020,immigrant,hk,Any opening in which the speakers can revert t...,0
879,880,@@24188457,in-need,ke,Dennis insisted that his initiative was not in...,3


In [3]:
# Display the shuffled dev rows (rendered as a DataFrame preview).
dev_data_shuffled

Unnamed: 0,id,paragraph-id,keyword,countrycode,paragraph,label
10056,10057,@@4197415,poor-families,ca,Darte acknowledged cutting back to the Windsor...,0
9650,9651,@@25216962,migrant,bd,UNITED States President Donald Trump has defen...,0
9119,9120,@@22467955,immigrant,ca,Saraswat said most immigrants have unique livi...,0
8504,8505,@@10179731,hopeless,pk,He said some elements were bent upon spreading...,0
1282,1283,@@3208839,refugee,ph,""" Stateless "" is the story of a forgotten grou...",4
...,...,...,...,...,...,...
9977,9978,@@13589752,homeless,in,One response to marital infidelity is divorce ...,0
9384,9385,@@1955909,homeless,tz,Various other areas have been experiencing exc...,0
9423,9424,@@18374692,hopeless,ca,Chris Selley : Maybe liquor retail in Ontario ...,0
9594,9595,@@1065878,hopeless,us,Robin Wauters is the European Editor of The Ne...,0


In [4]:
# Print the number of texts and labels per split, one count per line, so it is
# easy to see that inputs and targets line up.
print(len(X_train), len(X_dev), len(y_train), len(y_dev), sep='\n')

8375
2093
8375
2093


In [10]:
import wandb

# Bayesian hyperparameter search that maximises the 'eval/f1_score' metric.
# Every hyperparameter is drawn from a small discrete set of candidate values.
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'eval/f1_score', 'goal': 'maximize'},
    'parameters': {
        'epochs': {'values': [3, 5, 7]},
        'batch_size': {'values': [8, 16, 32]},
        'warmup_steps': {'values': [100, 500]},
        'learning_rate': {'values': [1e-5, 2e-5, 3e-5]},
    },
}

# Fallback hyperparameter values.
# NOTE(review): nothing in the visible cells reads this dict — confirm whether it
# was meant to be passed to wandb.init(config=...).
sweep_defaults = dict(
    learning_rate=2e-5,
    batch_size=16,
    epochs=5,
    warmup_steps=500,
)

# Register the sweep; the returned id is what the agent cell needs to start runs.
sweep_id = wandb.sweep(sweep_config)

Create sweep with ID: cugcnov1
Sweep URL: https://wandb.ai/eli-carried/uncategorized/sweeps/cugcnov1


In [9]:
import time
import datetime

def format_time(elapsed):
    '''
    Render a duration given in seconds as a human-readable string.

    The value is rounded to whole seconds and formatted by
    ``datetime.timedelta``, e.g. ``3661.6`` -> ``'1:01:02'``. Durations of a
    day or more gain a day prefix such as ``'1 day, 1:00:00'``.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [5]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of integer labels, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., truncation=True,
            padding='max_length', return_tensors='pt')``; its result must
            expose ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every example is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx`` as a dict of tensors.

        Keys are ``'input_ids'`` and ``'attention_mask'`` (both flattened to
        one dimension) and ``'labels'`` (a ``torch.long`` tensor).
        """
        text, label = self.texts[idx], self.labels[idx]

        tokenized = self.tokenizer(
            text,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )

        # The tokenizer output carries a leading batch axis; flatten it away.
        sample = {key: tokenized[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(label, dtype=torch.long)
        return sample

In [7]:
!pip install accelerate -U
!pip install transformers[torch]



In [11]:
import wandb
import torch
from transformers import Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import gc
import time

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Free memory before training: release cached CUDA blocks, run the garbage
# collector, then release again so anything the collector freed is returned too.
torch.cuda.empty_cache()
gc.collect()
torch.cuda.empty_cache()

def compute_metrics(pred):
    """Compute binary-classification metrics for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        dict with the keys ``"f1_score"``, ``"accuracy"``, ``"precision"``
        and ``"recall"``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # zero_division=0 makes the undefined case explicit: when no positives are
    # predicted (or none are present) the score is reported as 0 without
    # raising an UndefinedMetricWarning on every evaluation.
    return {
        "f1_score": f1_score(labels, preds, zero_division=0),
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, zero_division=0),
        "recall": recall_score(labels, preds, zero_division=0),
    }

def train():
    """Run one fine-tuning trial of ``bert-base-uncased`` with the current W&B config.

    Reads ``batch_size``, ``epochs``, ``learning_rate`` and ``warmup_steps``
    from ``wandb.config``, fine-tunes a two-label BERT classifier on an 80/20
    split of the notebook-level ``X_train`` / ``y_train``, evaluates after
    every epoch, and logs the final validation F1 and accuracy to W&B.

    Returns:
        None. Results are reported through W&B and printed progress messages.
    """
    wandb.init()

    total_t0 = time.time()

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    max_length = 300
    dataset = CustomDataset(X_train, y_train, tokenizer, max_length)

    # train_test_split indexes the dataset item by item, so both splits come
    # back as plain lists of already-tokenized examples.
    train_dataset, val_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model.to(device)

    training_args = TrainingArguments(
        output_dir='./results',
        per_device_train_batch_size=wandb.config.batch_size,
        per_device_eval_batch_size=wandb.config.batch_size,
        num_train_epochs=wandb.config.epochs,
        learning_rate=wandb.config.learning_rate,
        warmup_steps=wandb.config.warmup_steps,
        evaluation_strategy="epoch",
        logging_dir='./logs',
        logging_steps=100,
        # Must name a key returned by compute_metrics, which reports the F1
        # value as 'f1_score'.
        metric_for_best_model='f1_score',
        report_to="wandb",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()

    # Final scores on the held-out validation split.
    predictions = trainer.predict(val_dataset)
    preds = predictions.predictions.argmax(-1)
    labels = predictions.label_ids

    f1 = f1_score(labels, preds)
    accuracy = accuracy_score(labels, preds)

    wandb.log({"f1_score": f1,
               "accuracy": accuracy})

    print("Training complete!")

    # Drop the references to the model and trainer first; while they are alive
    # the garbage collector and the CUDA cache cannot release their memory.
    del trainer, model

    torch.cuda.empty_cache()

    # Call the garbage collector
    gc.collect()

    # Ensure CUDA is aware of the freed memory
    torch.cuda.empty_cache()

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
# Start a W&B agent for the sweep created above; it invokes train() for each run.
wandb.agent(sweep_id, function=train)



In [12]:
# Release cached CUDA memory held by PyTorch.
torch.cuda.empty_cache()

# Run the garbage collector so unreachable Python objects are destroyed.
gc.collect()
    
# Empty the cache again so memory freed by the collection above is released too.
torch.cuda.empty_cache()

In [71]:
    # Scratch check of the split performed inside train(): build the tokenized
    # dataset and split it with the same arguments.
    # NOTE(review): these lines keep a 4-space indent while the prints below sit
    # at column 0 — presumably the cell only runs because IPython strips the
    # leading indent; confirm and dedent.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    max_length = 300
    dataset = CustomDataset(X_train, y_train, tokenizer, max_length)

    train_dataset, val_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

# Show the container type of the training split and its first five examples.
print(type(train_dataset))
print(train_dataset[:5])

<class 'list'>
[{'input_ids': tensor([  101,  5920,  1999, 16634, 11431,  4904,  2003,  3303,  2011,  1037,
         2193,  1997,  5876,  1010,  2164,  2740,  3471,  1010,  5850,  1998,
         6544,  1010, 18917,  1010,  5635,  1998, 20625,  2791,  1010,  2016,
         2056,  1010,  1998,  1000,  1999,  2344,  2000,  2644,  2111,  2013,
        16873,  5920,  1010,  2057,  2342,  2000,  2298,  2012,  2122,  3314,
         1012,  2065,  2057,  2079,  1050,  1005,  1056,  2156,  2068,  2030,
         2963,  2055,  2009,  1010,  2009, 24185,  1050,  1005,  1056,  2175,
         2185,  1012,  1000,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0