# Training pretrained model BERT

In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Load your dataset
df = pd.read_csv("../../data/text/combined_cleaned_multilabel.csv")
texts = df['text'].tolist()
labels = df[['is_happy', 'is_surprised', 'is_neutral', 'is_sad', 'is_fear', 'is_angry', 'is_disgust']].values

In [10]:
# len(max(texts, key=len))
max(texts, key=len)

'jacob luxury haircut rarely need style not set aside hour half get ready morning every day wake head straight shower every second day wash hair hair wash day frequently need wash hair twice get really oily usually put conditioner rise hair long seldom manage take shower twenty minute afterwards often put pot coffee get dress wait brew take long time get dress morning every remember choose outfit night usually morning get dress take half hour time hair semi dry style hair time time put hair oftentimes bloody straight texture hair regularly flat iron keep freeze another twenty minute daily makeup routine'

In [2]:
# Tokenize text data
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.float)
        }

In [3]:
# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Parameters
BATCH_SIZE = 64
MAX_LEN = 64  # Adjust based on your data length

# Create DataLoader for training and validation sets
train_dataset = TextDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, MAX_LEN)


train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [4]:
def train_epoch(model, data_loader, optimizer, device, scheduler=None):
    model = model.train()
    losses = []
    correct_predictions = 0
    total_predictions = 0

    for batch_idx, batch in enumerate(data_loader, start=1):  # start=1 to start counting from 1
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        # Move logits and labels back to CPU for computation if needed
        logits = logits.detach().cpu()
        labels = labels.cpu()

        # Apply sigmoid to convert logits to probabilities
        probabilities = torch.sigmoid(logits)
        predictions = (probabilities >= 0.5).int()  # threshold of 0.5 for multi-label classification

        # Count correct predictions
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += torch.numel(labels)  # total number of label predictions

        losses.append(loss.item())
        loss.backward()
        optimizer.step()

        # Print batch progress
        if batch_idx % 10 == 0 or batch_idx == len(data_loader):
            print(f"Batch {batch_idx}/{len(data_loader)}: Loss = {loss.item()}")

    accuracy = correct_predictions / total_predictions  # Compute accuracy across labels
    return np.mean(losses), accuracy


# Validation function
def eval_model(model, data_loader, device):
    model = model.eval()
    losses = []
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            logits = logits.cpu()
            labels = labels.cpu()
            
            probabilities = torch.sigmoid(logits)
            predictions = (probabilities >= 0.5).int()

            correct_predictions += (predictions == labels).sum().item()
            total_predictions += torch.numel(labels)

            losses.append(loss.item())

    accuracy = correct_predictions / total_predictions
    return np.mean(losses), accuracy

In [5]:
import torch
torch.cuda.is_available()
# print(torch.version.cuda)

True

In [5]:
num_batches = len(train_loader)
print(f"Number of batches: {num_batches}")


Number of batches: 1843


In [6]:
# Training loop
EPOCHS = 3
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Define BERT model for multi-label classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=7)
model = model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=0.1, correct_bias=False)

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    train_loss = train_epoch(model, train_loader, optimizer, device)
    print(f'Train loss: {train_loss}')
    
    val_loss = eval_model(model, val_loader, device)
    print(f'Validation loss: {val_loss}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch 1/3
Batch 10/1843: Loss = 5.276866912841797
Batch 20/1843: Loss = 18.955162048339844
Batch 30/1843: Loss = 15.476664543151855
Batch 40/1843: Loss = 8.097869873046875
Batch 50/1843: Loss = 5.418328762054443
Batch 60/1843: Loss = 3.657914161682129
Batch 70/1843: Loss = 6.575597763061523
Batch 80/1843: Loss = 6.407071113586426
Batch 90/1843: Loss = 6.3077263832092285
Batch 100/1843: Loss = 5.356635093688965
Batch 110/1843: Loss = 4.862685680389404
Batch 120/1843: Loss = 5.048511028289795
Batch 130/1843: Loss = 3.5523171424865723
Batch 140/1843: Loss = 3.511007785797119
Batch 150/1843: Loss = 4.490808010101318
Batch 160/1843: Loss = 3.603095531463623
Batch 170/1843: Loss = 4.631044387817383
Batch 180/1843: Loss = 4.817263603210449
Batch 190/1843: Loss = 3.8401567935943604
Batch 200/1843: Loss = 4.120916366577148
Batch 210/1843: Loss = 3.2412285804748535
Batch 220/1843: Loss = 2.1110756397247314
Batch 230/1843: Loss = 3.5433154106140137
Batch 240/1843: Loss = 3.3087961673736572
Batch 

In [7]:
torch.save(model.state_dict(), 'bert_model.pth')
tokenizer.save_pretrained('./tokenizer')

('./tokenizer\\tokenizer_config.json',
 './tokenizer\\special_tokens_map.json',
 './tokenizer\\vocab.txt',
 './tokenizer\\added_tokens.json')

In [9]:
model.save_pretrained('./bert_full_model')