In [1]:
import numpy as np 
import pandas as pd 
from sklearn.metrics import f1_score, accuracy_score
from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train = load_dataset("json", 
                     data_files='./train_preprocessed_segments.jsonl', 
                     streaming=False, split="train")
val = load_dataset("json", 
                     data_files='./val_preprocessed_segments.jsonl', 
                     streaming=False, split="train")
train
val

Dataset({
    features: ['pornographic-content', 'violence', 'death', 'sexual-assault', 'abuse', 'blood', 'suicide', 'pregnancy', 'child-abuse', 'incest', 'underage', 'homophobia', 'self-harm', 'dying', 'kidnapping', 'mental-illness', 'dissection', 'eating-disorders', 'abduction', 'body-hatred', 'childbirth', 'racism', 'sexism', 'miscarriages', 'transphobia', 'abortion', 'fat-phobia', 'animal-death', 'ableism', 'classism', 'misogyny', 'animal-cruelty', 'segment', 'attention_mask'],
    num_rows: 117824
})

In [3]:
import random
random.seed(9765)

label_columns = ['pornographic-content', 'violence', 'death', 'sexual-assault', 'abuse', 'blood',
                 'suicide', 'pregnancy', 'child-abuse', 'incest', 'underage', 'homophobia',
                 'self-harm', 'dying', 'kidnapping', 'mental-illness', 'dissection',
                 'eating-disorders', 'abduction', 'body-hatred', 'childbirth', 'racism',
                 'sexism', 'miscarriages', 'transphobia', 'abortion', 'fat-phobia',
                 'animal-death', 'ableism', 'classism', 'misogyny', 'animal-cruelty']

label_columns_without_mc = ['violence', 'death', 'sexual-assault', 'abuse', 'blood',
                 'suicide', 'pregnancy', 'child-abuse', 'incest', 'underage', 'homophobia',
                 'self-harm', 'dying', 'kidnapping', 'mental-illness', 'dissection',
                 'eating-disorders', 'abduction', 'body-hatred', 'childbirth', 'racism',
                 'sexism', 'miscarriages', 'transphobia', 'abortion', 'fat-phobia',
                 'animal-death', 'ableism', 'classism', 'misogyny', 'animal-cruelty']

# Define a function to filter the data
def filter_data(example):
    if example['pornographic-content'] == 1 and sum({key: value for key, value in example.items() if key in label_columns_without_mc}.values()) == 0:
        # Randomly keep x% of such examples
        return random.random() <= 0.45
    # Keep all other rows
    return True

# Apply the filter function to the dataset
filtered_dataset = train.filter(filter_data)

# Print some information about the filtered dataset
print(f"Original dataset size: {len(train)}")
print(f"Filtered dataset size: {len(filtered_dataset)}")

train = filtered_dataset
train

Original dataset size: 2125001
Filtered dataset size: 1470507


Dataset({
    features: ['pornographic-content', 'violence', 'death', 'sexual-assault', 'abuse', 'blood', 'suicide', 'pregnancy', 'child-abuse', 'incest', 'underage', 'homophobia', 'self-harm', 'dying', 'kidnapping', 'mental-illness', 'dissection', 'eating-disorders', 'abduction', 'body-hatred', 'childbirth', 'racism', 'sexism', 'miscarriages', 'transphobia', 'abortion', 'fat-phobia', 'animal-death', 'ableism', 'classism', 'misogyny', 'animal-cruelty', 'segment', 'attention_mask'],
    num_rows: 1470507
})

In [4]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
fp16 = True if torch.cuda.is_available() else False

cuda


In [5]:
from transformers import AdamW, BigBirdTokenizer ,get_linear_schedule_with_warmup, BigBirdForSequenceClassification, Trainer, TrainingArguments,EvalPrediction, AutoTokenizer, BertForSequenceClassification, DistilBertForSequenceClassification
from torch.utils.data import Dataset, DataLoader

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',
                num_labels=32,
                problem_type="multi_label_classification",
                return_dict=True)

model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [6]:
class_freq = []
for label in label_columns:
    class_freq.append(np.sum(val[label]))
class_freq

[92768,
 11607,
 7541,
 12527,
 8960,
 5859,
 2845,
 4794,
 2845,
 5137,
 3556,
 2094,
 1763,
 3122,
 1798,
 1527,
 739,
 469,
 472,
 506,
 321,
 144,
 230,
 181,
 166,
 148,
 296,
 107,
 136,
 70,
 110,
 96]

In [7]:
from util_loss import ResampleLoss
from collections import Counter
np.set_printoptions(suppress=True)

train_num = len(train)
print(train_num)
class_freq = class_freq
print(class_freq)

loss_func = ResampleLoss(reweight_func='rebalance', loss_weight=1.0,
                         focal=dict(focal=True, alpha=0.5, gamma=2),
                         logit_reg=dict(init_bias=0.05, neg_scale=2.0),
                         map_param=dict(alpha=0.1, beta=10.0, gamma=0.9), 
                         class_freq=class_freq, train_num=train_num)
loss_func

1470507
[92768, 11607, 7541, 12527, 8960, 5859, 2845, 4794, 2845, 5137, 3556, 2094, 1763, 3122, 1798, 1527, 739, 469, 472, 506, 321, 144, 230, 181, 166, 148, 296, 107, 136, 70, 110, 96]


ResampleLoss()

In [8]:
from tqdm import tqdm
import os
from torch.cuda.amp import autocast, GradScaler
from sklearn.utils import class_weight

batch_size = 32
epochs = 1
learning_rate = 2e-5

#criterion = torch.nn.BCEWithLogitsLoss()

# Convert your PyTorch tensors to DataLoader
train_loader = DataLoader(train, batch_size=batch_size)
val_loader = DataLoader(val, batch_size=batch_size)

model.load_state_dict(torch.load('saved_models_db_loss/model_epoch_1.pt')) # load model if wanted

optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs)

save_dir = "saved_models_db_loss"
os.makedirs(save_dir, exist_ok=True)

scaler = GradScaler()

### code for saving
def save_model(model, epoch):
    model_save_path = os.path.join(save_dir, f"model_epoch_{epoch + 1}.pt")
    torch.save(model.state_dict(), model_save_path)

### code to evaluate model
def evaluate(model, val_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = torch.stack(batch['segment'], dim=1).to(device)
            attention_mask = torch.stack(batch['attention_mask'], dim=1).to(device)
            labels = torch.stack([label_tensor for label_tensor in {key: value for key, value in batch.items() if key in label_columns}.values()], dim=1).float().to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = (logits > 0).int()  # Convert logits to binary predictions
            all_preds.extend(preds.tolist())
            all_labels.extend(labels.tolist())
    accuracy = accuracy_score(all_labels, all_preds)
    f1_micro = f1_score(all_labels, all_preds, average='micro')
    f1_macro = f1_score(all_labels, all_preds, average='macro')
    acc = accuracy_score(all_labels, all_preds)
    return acc, f1_micro, f1_macro

for epoch in range(epochs):
    train_loss = 0.0
    model.train()
    
    with tqdm(total=len(train_loader), desc=f"Epoch {epoch + 1}/{epochs}", unit="batch") as pbar:
        for batch in train_loader:
            input_ids = torch.stack(batch['segment'], dim=1).to(device)
            attention_mask = torch.stack(batch['attention_mask'], dim=1).to(device)
            labels = torch.stack([label_tensor for label_tensor in {key: value for key, value in batch.items() if key in label_columns}.values()], dim=1).float().to(device)
            
            optimizer.zero_grad()

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            loss = loss_func(outputs.logits.view(-1, 32), labels.type_as(outputs.logits).view(-1, 32))
            
            #loss = criterion(outputs.logits, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            train_loss += loss.item()
            pbar.update(1)  # Update the progress bar
    train_loss /= len(train_loader)
    
    acc, f1_micro, f1_macr = evaluate(model, val_loader)

    save_model(model, epoch + 1)
    
    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Accuracy: {acc:.4f}")
    print(f"Validation F1 Micro: {f1_micro:.4f}")
    print(f"Validation F1 Macro: {f1_macr:.4f}")
    print("---------------------------")


Epoch 1/1: 100%|████████████████████████████████████████████████████████████| 45954/45954 [9:08:31<00:00,  1.40batch/s]


Epoch 1/1
Train Loss: 0.0031
Validation Accuracy: 0.5203
Validation F1 Micro: 0.6912
Validation F1 Macro: 0.2685
---------------------------
