In [1]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Load the raw training data, discard rows that have no comment text, and
# renumber the index so it is contiguous again.
train = (
    pd.read_csv('../data/kaggle_bias/train.csv')
    .dropna(subset=['comment_text'])
    .reset_index(drop=True)
)

# Binarise the continuous `target` score: values of 0.5 or more become class 1.
train['bi_target'] = train['target'].ge(0.5).astype(int)

# Only the text and the binary label are needed downstream.
df = train[['comment_text', 'bi_target']]
df.head()

Unnamed: 0,comment_text,bi_target
0,"This is so cool. It's like, 'would you want yo...",0
1,Thank you!! This would make my life a lot less...,0
2,This is such an urgent design problem; kudos t...,0
3,Is this something I'll be able to install on m...,0
4,haha you guys are a bunch of losers.,1


In [3]:
# Work on a reproducible random half of the data.
df_sample = df.sample(frac=0.5, random_state=42)

# Hold out 20% of that sample; stratifying on the label keeps the class
# proportions the same in both parts.
train_df, test_df = train_test_split(
    df_sample,
    test_size=0.2,
    stratify=df_sample['bi_target'],
    random_state=42,
)

In [4]:
len(train_df)

721948

### Undersampling Majority Class

In [5]:
# Randomly drop majority-class (bi_target == 0) rows from the training split.
# sampling_strategy=0.15 is the minority/majority ratio after resampling, not
# the minority share of the result: the counts shown further down
# (57802 / 385346) give 0.15, while the minority share is 57802 / 443148.
rus = RandomUnderSampler(sampling_strategy=0.15, random_state=42)
# The whole frame is passed as X, so the resampled frame keeps both the text
# and the label column; the separately returned label vector is not needed.
train_df_rus, _ = rus.fit_resample(train_df, train_df['bi_target'])
# Minority-class share after undersampling (displayed as the cell output).
(train_df_rus['bi_target'].sum() / len(train_df_rus) )

0.1304349788332566

In [6]:
len(train_df_rus)

443148

In [7]:
train_df_rus['bi_target'].value_counts()

bi_target
0    385346
1     57802
Name: count, dtype: int64

### Oversampling Minority Class

In [8]:
# !pip install numpy requests nlpaug

In [9]:
import nlpaug.augmenter.word as nlpaw
from tqdm import tqdm

In [10]:
nlpaw

<module 'nlpaug.augmenter.word' from 'C:\\Users\\arman\\anaconda3\\envs\\torchpower\\lib\\site-packages\\nlpaug\\augmenter\\word\\__init__.py'>

In [11]:
import pandas as pd
import numpy as np
import nlpaug.augmenter.word as nlpaw
from tqdm import tqdm

In [12]:
def augment_sentence(sentence, aug, num_threads):
    """
    Constructs a new sentence via text augmentation.

    Input:
        - sentence:     A string of text
        - aug:          An augmentation object defined by the nlpaug library
        - num_threads:  Integer controlling the number of threads to use if
                        augmenting text via CPU
    Output:
        - A string of text that has been augmented. If the augmenter returns
          an empty result, the original sentence is returned unchanged.
    """
    augmented = aug.augment(sentence, num_thread=num_threads)

    # Depending on the installed nlpaug version, `augment` returns either the
    # augmented string itself or a list wrapping it (even for a single input).
    # Unwrap the list form so callers always get the documented plain string
    # and never end up storing one-element lists in a text column.
    if isinstance(augmented, (list, tuple)):
        return augmented[0] if augmented else sentence
    return augmented
    


def augment_text(df, aug, num_threads, num_times):
    """
    Appends augmented copies of the positive-class comments to a DataFrame.

    Input:
        - df:            A pandas DataFrame containing the columns:
                                - 'comment_text' containing strings of text to augment.
                                - 'bi_target' binary target variable containing 0's and 1's.
        - aug:           Augmentation object defined by the nlpaug library.
        - num_threads:   Integer controlling number of threads to use if augmenting
                         text via CPU
        - num_times:     Integer giving how many augmented variants to generate
                         for every row whose 'bi_target' is 1.
    Output:
        - A new pandas DataFrame holding the original rows plus the augmented
          rows (all labelled 1), shuffled with a fixed seed. The input
          DataFrame itself is not modified.
    """
    # Only the positive-class comments are augmented.
    minority_texts = df.loc[df['bi_target'] == 1, 'comment_text']

    # Every generated row is a positive example, so the labels are all ones.
    positive_labels = np.ones(len(minority_texts.index) * num_times, dtype=np.int8)

    # One full pass over the positive comments per requested repetition.
    generated = []
    for _ in tqdm(range(num_times)):
        for text in minority_texts:
            generated.append(augment_sentence(text, aug, num_threads))

    aug_df = pd.DataFrame({'comment_text': generated, 'bi_target': positive_labels})

    # Stack original and synthetic rows, then shuffle deterministically.
    combined = pd.concat([df, aug_df], ignore_index=True)
    return combined.sample(frac=1, random_state=42)
    

    
# Define the nlpaug augmentation object: BERT-based contextual word substitution.
# NOTE: despite the variable name, aug_p here is 0.026 (not 10%); aug_min=1 is
# presumably the minimum number of words changed per sentence - confirm
# against the nlpaug documentation.
aug10p = nlpaw.ContextualWordEmbsAug(model_path='bert-base-uncased', aug_min=1, aug_p=0.026, action="substitute")

# Upsample the minority class ('bi_target' == 1): num_times=3 adds three augmented
# variants of every positive row, i.e. four times the positives in total. Starting
# from 57802 positives vs 385346 negatives this gives a 37.5% positive share (see
# the ratio printed below) - more balanced, but NOT a 50-50 distribution.
# WARNING: this call augments one sentence at a time and took close to 6 hours
# in the recorded run.
balanced_df = augment_text(train_df_rus, aug10p, num_threads=8, num_times=3)

100%|██████████| 3/3 [5:52:52<00:00, 7057.64s/it]  


In [13]:
balanced_df.head()

Unnamed: 0,comment_text,bi_target
95478,How does Oasis differ from Unitarians?,0
194807,Nope!,0
7622,As I recall Fr Martin tweeted out that those t...,0
107145,Muckler was a bad GM...Brian Murray is a real ...,0
65899,"This ""post their names and faces all over soci...",0


In [14]:
(balanced_df['bi_target'].sum() / len(balanced_df) )

0.3750004054794876

In [15]:
len(balanced_df)

616554

In [16]:
import torch
torch.cuda.is_available()

True

In [17]:
from torch.utils.data import Dataset, DataLoader
import torch

class MyDataset(Dataset):
    """
    Torch dataset that tokenizes comment text on the fly.

    Input:
        - dataframe:   A pandas DataFrame with a 'comment_text' column (the text
                       to tokenize) and a 'bi_target' column (integer class label).
        - tokenizer:   A callable tokenizer that accepts `padding`, `truncation`,
                       `max_length` and `return_tensors` keyword arguments and
                       returns a mapping with 'input_ids' and 'attention_mask'.
        - max_length:  Length every example is padded / truncated to.
                       Defaults to 215, the value this notebook was built with.

    Each item is a tuple `(input_ids, attention_mask, label)`, where `label`
    is a scalar `torch.long` tensor.
    """

    def __init__(self, dataframe, tokenizer, max_length=215):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Pull both columns out once. Calling `dataframe.iloc[idx]` per item
        # builds a whole row Series each time (and the original did so twice
        # per item), which is needlessly slow for hundreds of thousands of rows.
        self._texts = dataframe['comment_text'].tolist()
        self._labels = dataframe['bi_target'].tolist()

    def __len__(self):
        return len(self._texts)

    def __getitem__(self, idx):
        text = self._texts[idx]
        label = self._labels[idx]
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # With return_tensors="pt" the tokenizer output carries a leading batch
        # dimension of size 1. Remove exactly that dimension; a bare `.squeeze()`
        # would also drop the sequence dimension if it happened to be 1.
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)

        return input_ids, attention_mask, torch.tensor(int(label), dtype=torch.long)

In [18]:
from transformers import DistilBertTokenizer

# Tokenizer matching the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Training data is the undersampled + augmented frame. The "val" dataset wraps
# test_df, the untouched 20% hold-out from the train/test split, so it keeps the
# original class imbalance and contains no augmented text.
train_dataset = MyDataset(balanced_df, tokenizer)
val_dataset = MyDataset(test_df, tokenizer)

# Only the training loader shuffles; the evaluation loader keeps its order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [19]:
from transformers import DistilBertForSequenceClassification, DistilBertConfig

# Explicit configuration for the classifier. The architecture values (layers,
# dims, heads) are passed together with the pretrained checkpoint below, so they
# presumably have to match that checkpoint - confirm before changing any of them.
# The two dropout values and num_labels are the settings chosen for this task.
config = DistilBertConfig(
    n_layers=6,  # Number of hidden layers
    dim=768,  # Dimensionality of the encoder layers and the pooler layer
    hidden_dim=3072,  # Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder
    n_heads=12,  # Number of attention heads for each attention layer in the Transformer encoder
    dropout=0.2,  # Dropout probability for the dropout layers
    attention_dropout=0.2,  # Dropout probability for the attention layers
    num_labels=2
)

# Load the pretrained encoder weights. As the warning in the cell output says,
# the classification head (pre_classifier / classifier) is newly initialised
# and must be trained before the model can be used for prediction.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=config)

# Move the model to the GPU instead of the CPU. As the last expression of the
# cell, this also displays the model architecture.
model.cuda()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.2, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.2, inplace=False)
 

In [20]:
# `transformers.AdamW` is deprecated and no longer available in recent
# transformers releases; the PyTorch implementation is the supported replacement.
from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.optim.lr_scheduler import StepLR

# Class weighting through torch.nn.CrossEntropyLoss(weight=...) was tried
# earlier and is disabled: the loss computed by the model itself is used.

# Run on whatever device the model already lives on.
device = next(model.parameters()).device

# weight_decay=0.0 is passed explicitly because torch's AdamW defaults to 0.01;
# NOTE(review): transformers.AdamW is believed to default to 0.0 - confirm
# that this reproduces the previous optimiser settings.
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

# Decay the learning rate by 10x after every epoch: 5e-5 -> 5e-6 -> 5e-7.
scheduler = StepLR(optimizer, step_size=1, gamma=0.1)

# Training loop
model.train()
for epoch in range(3):
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for input_ids, attention_mask, labels in progress_bar:
        # Move the batch to the same device as the model
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Passing `labels` makes the model compute the loss itself
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        progress_bar.set_postfix({'loss': loss.item()})

    # Step the scheduler only AFTER the epoch's optimizer updates. Stepping it
    # at the top of the loop (as before) decayed the rate before any training
    # happened, so the first epoch ran at 5e-6 instead of the configured 5e-5.
    scheduler.step()



Epoch 1:   0%|          | 0/19268 [00:00<?, ?it/s]

Epoch 2:   0%|          | 0/19268 [00:00<?, ?it/s]

Epoch 3:   0%|          | 0/19268 [00:00<?, ?it/s]

In [22]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model.eval()  # Switch off dropout for evaluation

# Decision thresholds on the positive-class probability to compare
thresholds = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# Run on whatever device the model already lives on.
device = next(model.parameters()).device

# The model outputs do not depend on the threshold, so run inference over the
# evaluation set exactly once and keep the positive-class probabilities.
# (Previously the full forward pass was repeated for every threshold.)
label_batches = []
prob_batches = []

with torch.no_grad():  # Disable gradient calculation
    for input_ids, attention_mask, labels in val_loader:
        outputs = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
        )

        # Probability of class 1 for every example in the batch
        positive_probs = torch.softmax(outputs.logits, dim=1)[:, 1]

        # Collect per-batch arrays and concatenate once at the end; growing a
        # numpy array with np.concatenate inside the loop is quadratic.
        prob_batches.append(positive_probs.cpu().numpy())
        label_batches.append(labels.cpu().numpy())

true_labels = np.concatenate(label_batches)
pred_probs = np.concatenate(prob_batches)

# One row of 0/1 predictions per threshold (strictly greater than, as before)
all_pred_labels = np.array([(pred_probs > threshold).astype(int) for threshold in thresholds])

# Calculate metrics for each threshold
for i, threshold in enumerate(thresholds):
    pred_labels = all_pred_labels[i]
    accuracy = accuracy_score(true_labels, pred_labels)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average='binary')

    print(f'Threshold: {threshold}')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')
    print('///////////////////////')

KeyboardInterrupt: 

In [16]:
# X_train = balanced_df['comment_text']
# y_train = balanced_df['bi_target']
# X_test = test_df['comment_text']
# y_test = test_df['bi_target']