In [1]:
import pandas as pd 

train = pd.read_csv('../data/kaggle_bias/train.csv')

In [2]:
import torch
torch.cuda.is_available()

True

In [3]:
!pip install datasets transformers huggingface_hub



In [4]:
train = train.dropna(subset=['comment_text'])
train.reset_index(drop=True, inplace=True)
train['bi_target'] = (train['target'] >= 0.5).astype(int)

In [5]:
train_sample = train.sample(frac=0.025, random_state=42)

Target class distribution:

In [6]:
(train_sample['bi_target'].sum() / len(train_sample) ) * 100

8.047072381543371

In [7]:
from torch.utils.data import Dataset, DataLoader
import torch

class MyDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.dataframe = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = self.dataframe.iloc[idx]['comment_text']
        label = self.dataframe.iloc[idx]['bi_target']
        inputs = self.tokenizer(text, padding='max_length', truncation=True, max_length=215, return_tensors="pt")
        
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        
        return input_ids, attention_mask, torch.tensor(label, dtype=torch.long)

In [12]:
from transformers import DistilBertTokenizer
from sklearn.model_selection import train_test_split

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

train_df, val_df = train_test_split(train_sample, test_size=0.1, random_state=42, stratify=train_sample['bi_target'])

train_dataset = MyDataset(train_df, tokenizer)
val_dataset = MyDataset(val_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [13]:
(val_df['bi_target'].sum() / len(val_df) ) * 100

8.04343009084866

In [14]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# makes the model run on the GPU instead of CPU
model.cuda()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [15]:
from transformers import AdamW
from tqdm.auto import tqdm
from torch.optim.lr_scheduler import StepLR

# class weight for 5% target class
class_weights = [1, 2]

# convert class weight to tensor
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).cuda()


# AdamW optimizer is apparently really good for DistilBERT?  Will write more in docs
optimizer = AdamW(model.parameters(), lr=7.5e-5)

# Define the scheduler
scheduler = StepLR(optimizer, step_size=1, gamma=0.1)

# Training loop
model.train()
for epoch in range(3):  
    scheduler.step()
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for input_ids, attention_mask, labels in progress_bar:
        # Move the training to the GPU
        input_ids = input_ids.cuda()
        attention_mask = attention_mask.cuda()
        labels = labels.cuda()

        # Set gradients to zero for training
        model.zero_grad()
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)
        loss = loss_fct(outputs.logits, labels)

        # loss = outputs.loss
        
        loss.backward()
        optimizer.step()
        
        progress_bar.set_postfix({'loss': loss.item()})



Epoch 1:   0%|          | 0/2539 [00:00<?, ?it/s]

Epoch 2:   0%|          | 0/2539 [00:00<?, ?it/s]

Epoch 3:   0%|          | 0/2539 [00:00<?, ?it/s]

In [16]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


model.eval()  # Transformers built in model evaluation kit 
true_labels = []
pred_labels = []

with torch.no_grad():  # Disable gradient calculation
    for input_ids, attention_mask, labels in val_loader:
        # Move tensors to the GPU
        input_ids = input_ids.cuda()
        attention_mask = attention_mask.cuda()
        labels = labels.cuda()
        
        # Forward pass, get logit predictions
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        
        # Move logits and labels to CPU
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        
        # Store predictions and true labels
        true_labels.append(label_ids)
        pred_labels.append(logits.argmax(axis=1))

# Calculate our accuracy metrics
true_labels = np.concatenate(true_labels)
pred_labels = np.concatenate(pred_labels)


accuracy = accuracy_score(true_labels, pred_labels)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average='binary')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.9434965654775094
Precision: 0.6398963730569949
Recall: 0.6804407713498623
F1 Score: 0.6595460614152204


#### First run
5% of the data, class weights [1,19], base

Accuracy: 0.8674939064923555

Precision: 0.33863080684596575

Recall: 0.8293413173652695

F1 Score: 0.48090277777777773

#### Second run
2.5% of the data, class weights [1,19], scheduler = StepLR(optimizer, step_size=1, gamma=0.1)

Accuracy: 0.8901196278245459

Precision: 0.42493638676844786

Recall: 0.8835978835978836

F1 Score: 0.5738831615120276

#### Third run
2.5% of the data, no class weights, scheduler = StepLR(optimizer, step_size=1, gamma=0.1)

Accuracy: 0.9424014178112539

Precision: 0.6710526315789473

Recall: 0.5604395604395604

F1 Score: 0.6107784431137724

#### Fourth run

random state 42 sampling, 2.5% of data, class weights [1, 11.5], scheduler = StepLR(optimizer, step_size=1, gamma=0.1)

Accuracy: 0.8958794860434205

Precision: 0.41456582633053224

Recall: 0.8505747126436781

F1 Score: 0.5574387947269304

All above on 3 epochs --------------------------------------------------------

Same but 5 epochs:

Accuracy: 0.8887904297740363

Precision: 0.41494845360824745

Recall: 0.8702702702702703

F1 Score: 0.5619546247818499

#### Fifth run
random state 42 sampling, 2.5% of data, class weights [1, 5], scheduler = StepLR(optimizer, step_size=1, gamma=0.1)

Accuracy: 0.9158174568010634

Precision: 0.5117056856187291

Recall: 0.7766497461928934

F1 Score: 0.6169354838709677

#### Sixth run
random state 42 sampling, 2.5% of data, class weights [1, 2], scheduler = StepLR(optimizer, step_size=1, gamma=0.1)

Accuracy: 0.9415152857775808

Precision: 0.65

Recall: 0.6770833333333334

F1 Score: 0.6632653061224489

#### Seventh run
random state 42 sampling, 2.5% of data, class weights [1, 1.75], scheduler = StepLR(optimizer, step_size=1, gamma=0.1)

Accuracy: 0.9472751439964555

Precision: 0.6666666666666666

Recall: 0.7204301075268817

F1 Score: 0.6925064599483204

#### loss 5e-6

Accuracy: 0.9295525033229951

Precision: 0.6835443037974683

Recall: 0.2872340425531915

F1 Score: 0.40449438202247195

#### loss 7.5e-5
random state 42 sampling, 2.5% of data, class weights [1, 1.7], scheduler = StepLR(optimizer, step_size=1, gamma=0.1)

Accuracy: 0.9441736818786

Precision: 0.7395833333333334

Recall: 0.6513761467889908

F1 Score: 0.6926829268292682

#### Eighth run
- random state 42 sampling, 2.5% of data, class weights [1, 2], scheduler = StepLR(optimizer, step_size=1, gamma=0.1)
- val_set 10% of data instead of 5%
- target class 8% in train, 8.5% in val
    
Accuracy: 0.941945490804343

Precision: 0.6431924882629108

Recall: 0.7135416666666666

F1 Score: 0.6765432098765432

#### Ninth run
- random state 42 sampling, 2.5% of data, class weights [1, 1.5], scheduler = StepLR(optimizer, step_size=1, gamma=0.1)
- val_set 10% of data instead of 5%
- target class 8% in train, 8.5% in val
    
Accuracy: 0.9410591624196765

Precision: 0.62882096069869

Recall: 0.75

F1 Score: 0.684085510688836

#### Tenth run
- random state 42 sampling, 2.5% of data, class weights [1, 3], scheduler = StepLR(optimizer, step_size=1, gamma=0.1)
- val_set 10% of data instead of 5%
- target class 8% in train, 8.5% in val
    
Accuracy: 0.9361843563040106

Precision: 0.596

Recall: 0.7760416666666666

F1 Score: 0.67420814479638

#### Eleventh run
- random state 42 sampling, 2.5% of data, class weights [1, 2], scheduler = StepLR(optimizer, step_size=1, gamma=0.1), loss = 9e-5
- val_set 10% of data instead of 5%
- target class 8% in train, 8.5% in val
    
Accuracy: 0.9454908043430091

Precision: 0.6691176470588235

Recall: 0.7109375

F1 Score: 0.6893939393939394

#### Twelveth run
- random state 42 sampling, 2.5% of data, class weights [1, 1.7], scheduler = StepLR(optimizer, step_size=1, gamma=0.1), loss = 7.5e-5
- val_set 10% of data instead of 5%
- target class 8% in train, 8.5% in val
    
Accuracy: 0.9459339685353424

Precision: 0.6741293532338308

Recall: 0.7057291666666666

F1 Score: 0.6895674300254454