# Training pretrained model BERT

In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Load your dataset
df = pd.read_csv("../../data/text/combined_cleaned_multilabel.csv")
texts = df['text'].tolist()
labels = df[['is_happy', 'is_surprised', 'is_neutral', 'is_sad', 'is_fear', 'is_angry', 'is_disgust']].values

In [10]:
# len(max(texts, key=len))
max(texts, key=len)

'jacob luxury haircut rarely need style not set aside hour half get ready morning every day wake head straight shower every second day wash hair hair wash day frequently need wash hair twice get really oily usually put conditioner rise hair long seldom manage take shower twenty minute afterwards often put pot coffee get dress wait brew take long time get dress morning every remember choose outfit night usually morning get dress take half hour time hair semi dry style hair time time put hair oftentimes bloody straight texture hair regularly flat iron keep freeze another twenty minute daily makeup routine'

In [2]:
# Tokenize text data
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.float)
        }

In [3]:
# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Parameters
BATCH_SIZE = 64
MAX_LEN = 64  # Adjust based on your data length

# Create DataLoader for training and validation sets
train_dataset = TextDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, MAX_LEN)


train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [4]:
def train_epoch(model, data_loader, optimizer, device, scheduler=None):
    model = model.train()
    losses = []
    correct_predictions = 0
    total_predictions = 0

    for batch_idx, batch in enumerate(data_loader, start=1):  # start=1 to start counting from 1
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        # Move logits and labels back to CPU for computation if needed
        logits = logits.detach().cpu()
        labels = labels.cpu()

        # Apply sigmoid to convert logits to probabilities
        probabilities = torch.sigmoid(logits)
        predictions = (probabilities >= 0.5).int()  # threshold of 0.5 for multi-label classification

        # Count correct predictions
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += torch.numel(labels)  # total number of label predictions

        losses.append(loss.item())
        loss.backward()
        optimizer.step()

        # Print batch progress
        if batch_idx % 10 == 0 or batch_idx == len(data_loader):
            print(f"Batch {batch_idx}/{len(data_loader)}: Loss = {loss.item()}")

    accuracy = correct_predictions / total_predictions  # Compute accuracy across labels
    return np.mean(losses), accuracy


# Validation function
def eval_model(model, data_loader, device):
    model = model.eval()
    losses = []
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            logits = logits.cpu()
            labels = labels.cpu()
            
            probabilities = torch.sigmoid(logits)
            predictions = (probabilities >= 0.5).int()

            correct_predictions += (predictions == labels).sum().item()
            total_predictions += torch.numel(labels)

            losses.append(loss.item())

    accuracy = correct_predictions / total_predictions
    return np.mean(losses), accuracy

In [5]:
import torch
torch.cuda.is_available()
# print(torch.version.cuda)

True

In [5]:
num_batches = len(train_loader)
print(f"Number of batches: {num_batches}")


Number of batches: 1843


In [6]:
# Training loop
EPOCHS = 3
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Define BERT model for multi-label classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=7)
model = model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=0.1, correct_bias=False)

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    train_loss = train_epoch(model, train_loader, optimizer, device)
    print(f'Train loss: {train_loss}')
    
    val_loss = eval_model(model, val_loader, device)
    print(f'Validation loss: {val_loss}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch 1/3
Batch 10/1843: Loss = 5.276866912841797
Batch 20/1843: Loss = 18.955162048339844
Batch 30/1843: Loss = 15.476664543151855
Batch 40/1843: Loss = 8.097869873046875
Batch 50/1843: Loss = 5.418328762054443
Batch 60/1843: Loss = 3.657914161682129
Batch 70/1843: Loss = 6.575597763061523
Batch 80/1843: Loss = 6.407071113586426
Batch 90/1843: Loss = 6.3077263832092285
Batch 100/1843: Loss = 5.356635093688965
Batch 110/1843: Loss = 4.862685680389404
Batch 120/1843: Loss = 5.048511028289795
Batch 130/1843: Loss = 3.5523171424865723
Batch 140/1843: Loss = 3.511007785797119
Batch 150/1843: Loss = 4.490808010101318
Batch 160/1843: Loss = 3.603095531463623
Batch 170/1843: Loss = 4.631044387817383
Batch 180/1843: Loss = 4.817263603210449
Batch 190/1843: Loss = 3.8401567935943604
Batch 200/1843: Loss = 4.120916366577148
Batch 210/1843: Loss = 3.2412285804748535
Batch 220/1843: Loss = 2.1110756397247314
Batch 230/1843: Loss = 3.5433154106140137
Batch 240/1843: Loss = 3.3087961673736572
Batch 

In [7]:
torch.save(model.state_dict(), 'bert_model.pth')
tokenizer.save_pretrained('./tokenizer')

('./tokenizer\\tokenizer_config.json',
 './tokenizer\\special_tokens_map.json',
 './tokenizer\\vocab.txt',
 './tokenizer\\added_tokens.json')

In [9]:
model.save_pretrained('./bert_full_model')

## Balanced dataset

In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Load your dataset
df = pd.read_csv("../../data/text/combined_cleaned_balanced_multilabel.csv")
texts = df['text'].tolist()
labels = df[['is_happy', 'is_surprised', 'is_neutral', 'is_sad', 'is_fear', 'is_angry', 'is_disgust']].values

In [2]:
# Tokenize text data
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.float)
        }

In [3]:
# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Parameters
BATCH_SIZE = 64
MAX_LEN = 64  # Adjust based on your data length

# Create DataLoader for training and validation sets
train_dataset = TextDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, MAX_LEN)


train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [4]:
def train_epoch(model, data_loader, optimizer, device, scheduler=None):
    model = model.train()
    losses = []
    correct_predictions = 0
    total_predictions = 0

    for batch_idx, batch in enumerate(data_loader, start=1):  # start=1 to start counting from 1
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        # Move logits and labels back to CPU for computation if needed
        logits = logits.detach().cpu()
        labels = labels.cpu()

        # Apply sigmoid to convert logits to probabilities
        probabilities = torch.sigmoid(logits)
        predictions = (probabilities >= 0.5).int()  # threshold of 0.5 for multi-label classification

        # Count correct predictions
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += torch.numel(labels)  # total number of label predictions

        losses.append(loss.item())
        loss.backward()
        optimizer.step()

        # Print batch progress
        if batch_idx % 10 == 0 or batch_idx == len(data_loader):
            print(f"Batch {batch_idx}/{len(data_loader)}: Loss = {loss.item()}")

    accuracy = correct_predictions / total_predictions  # Compute accuracy across labels
    return np.mean(losses), accuracy


# Validation function
def eval_model(model, data_loader, device):
    model = model.eval()
    losses = []
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            logits = logits.cpu()
            labels = labels.cpu()
            
            probabilities = torch.sigmoid(logits)
            predictions = (probabilities >= 0.5).int()

            correct_predictions += (predictions == labels).sum().item()
            total_predictions += torch.numel(labels)

            losses.append(loss.item())

    accuracy = correct_predictions / total_predictions
    return np.mean(losses), accuracy

In [5]:
# Training loop
EPOCHS = 3
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Define BERT model for multi-label classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=7)
model = model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=0.1, correct_bias=False)

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    train_loss = train_epoch(model, train_loader, optimizer, device)
    print(f'Train loss: {train_loss}')
    
    val_loss = eval_model(model, val_loader, device)
    print(f'Validation loss: {val_loss}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Batch 10/714: Loss = 38.05537414550781
Batch 20/714: Loss = 18.452350616455078
Batch 30/714: Loss = 18.053342819213867
Batch 40/714: Loss = 11.372179985046387
Batch 50/714: Loss = 16.199068069458008
Batch 60/714: Loss = 7.108647346496582
Batch 70/714: Loss = 6.7043046951293945
Batch 80/714: Loss = 5.043370246887207
Batch 90/714: Loss = 4.1977057456970215
Batch 100/714: Loss = 4.921322822570801
Batch 110/714: Loss = 5.709242820739746
Batch 120/714: Loss = 4.077258586883545
Batch 130/714: Loss = 3.267163038253784
Batch 140/714: Loss = 4.078359603881836
Batch 150/714: Loss = 3.506300449371338
Batch 160/714: Loss = 5.591334819793701
Batch 170/714: Loss = 4.715604305267334
Batch 180/714: Loss = 4.170882225036621
Batch 190/714: Loss = 3.8558523654937744
Batch 200/714: Loss = 3.725015878677368
Batch 210/714: Loss = 3.773198127746582
Batch 220/714: Loss = 2.9365861415863037
Batch 230/714: Loss = 3.184500217437744
Batch 240/714: Loss = 3.6228551864624023
Batch 250/714: Loss = 3.363974094390869


In [6]:
torch.save(model.state_dict(), 'bert_model_2.pth')
tokenizer.save_pretrained('./tokenizer_2')
model.save_pretrained('./bert_full_model_2')

## Testing

In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Load your dataset
df = pd.read_csv("../../data/text/combined_cleaned_balanced_multilabel.csv")
texts = df['text'].tolist()
labels = df[['is_happy', 'is_surprised', 'is_neutral', 'is_sad', 'is_fear', 'is_angry', 'is_disgust']].values

In [2]:
# Tokenize text data
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.float)
        }

In [4]:
# Tokenizer
tokenizer = BertTokenizer.from_pretrained('outputs/tokenizer_2')

# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Parameters
BATCH_SIZE = 64
MAX_LEN = 64  # Adjust based on your data length

# Create DataLoader for training and validation sets
dataset = TextDataset(texts, labels, tokenizer, MAX_LEN)


loader = DataLoader(dataset, batch_size=BATCH_SIZE)

In [7]:
# Validation function
from sklearn.metrics import precision_recall_fscore_support

def eval_model(model, data_loader, device):
    model = model.eval()
    losses = []
    correct_predictions = 0
    total_predictions = 0

    all_labels = []
    all_predictions = []
    all_probabilities = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            logits = logits.cpu()
            labels = labels.cpu()
            
            probabilities = torch.sigmoid(logits)
            predictions = (probabilities >= 0.5).int()

            all_labels.append(labels)
            all_predictions.append(predictions)
            all_probabilities.append(probabilities)

            correct_predictions += (predictions == labels).sum().item()
            total_predictions += torch.numel(labels)

            losses.append(loss.item())

    # Combine all batches
    all_labels = torch.cat(all_labels, dim=0).numpy()
    all_predictions = torch.cat(all_predictions, dim=0).numpy()
    all_probabilities = torch.cat(all_probabilities, dim=0).numpy()

    # Calculate overall accuracy
    accuracy = correct_predictions / total_predictions

    # Calculate precision, recall, and F1 score per label
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average=None)

    # Summarize results
    label_names = ['is_happy', 'is_surprised', 'is_neutral', 'is_sad', 'is_fear', 'is_angry', 'is_disgust']
    print(f"Overall Accuracy: {accuracy:.4f}")
    print("Per-label Metrics:")
    for i, label in enumerate(label_names):
        print(f"{label}: Precision: {precision[i]:.4f}, Recall: {recall[i]:.4f}, F1: {f1[i]:.4f}")

    # Summarize probabilities
    avg_probabilities = all_probabilities.mean(axis=0)
    for i, label in enumerate(label_names):
        print(f"{label}: Average Probability: {avg_probabilities[i]:.4f}")

    return np.mean(losses), accuracy


In [8]:
# Training loop
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Define BERT model for multi-label classification
model = BertForSequenceClassification.from_pretrained('outputs/bert_full_model_2', num_labels=7)
model = model.to(device)

loss = eval_model(model, loader, device)
print(f'Validation loss: {loss}')

Overall Accuracy: 0.8571
Per-label Metrics:
is_happy: Precision: 0.0000, Recall: 0.0000, F1: 0.0000
is_surprised: Precision: 0.0000, Recall: 0.0000, F1: 0.0000
is_neutral: Precision: 0.0000, Recall: 0.0000, F1: 0.0000
is_sad: Precision: 0.0000, Recall: 0.0000, F1: 0.0000
is_fear: Precision: 0.0000, Recall: 0.0000, F1: 0.0000
is_angry: Precision: 0.0000, Recall: 0.0000, F1: 0.0000
is_disgust: Precision: 0.0000, Recall: 0.0000, F1: 0.0000
is_happy: Average Probability: 0.0000
is_surprised: Average Probability: 0.0012
is_neutral: Average Probability: 0.0000
is_sad: Average Probability: 0.0000
is_fear: Average Probability: 0.0054
is_angry: Average Probability: 0.0000
is_disgust: Average Probability: 0.0000
Validation loss: (3.76786752806918, 0.8571428571428571)


  _warn_prf(average, modifier, msg_start, len(result))


IT PREDICTS 0 AT ALL TIMES THAT'S WHY IT HAS ACCURACY 85%!!!

## Balanced BUT not multilabel

In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

emotion_to_idx = {
    'happy': 0,
    'surprised': 1,
    'neutral': 2,
    'sad': 3,
    'fear': 4,
    'angry': 5,
    'disgust': 6
}

# Load your dataset
df = pd.read_csv("../../data/text/combined_cleaned_balanced_dataset.csv")
texts = df['text'].tolist()
labels = df['emotion'].map(emotion_to_idx).values  # Map emotions to integers

In [2]:
# Update Dataset Class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)  # Single integer label
        }

In [3]:
# Update Training Loop
def train_epoch(model, data_loader, optimizer, device, scheduler=None):
    model = model.train()
    losses = []
    correct_predictions = 0
    total_predictions = 0

    for batch_idx, batch in enumerate(data_loader, start=1):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        # Calculate predictions
        predictions = torch.argmax(logits, dim=1)
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.size(0)

        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        if scheduler is not None: 
            scheduler.step()

        if batch_idx % 100 == 0 or batch_idx == len(data_loader):
            if scheduler is not None: 
                print(f"Batch {batch_idx}/{len(data_loader)}: Loss = {loss.item()}, LR: {scheduler.get_last_lr()[0]}")
            else:
                print(f"Batch {batch_idx}/{len(data_loader)}: Loss = {loss.item()}")

    accuracy = correct_predictions / total_predictions
    return np.mean(losses), accuracy

# Update Validation Function
def eval_model(model, data_loader, device):
    model = model.eval()
    losses = []
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            predictions = torch.argmax(logits, dim=1)
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.size(0)

            losses.append(loss.item())

    accuracy = correct_predictions / total_predictions
    return np.mean(losses), accuracy

In [4]:
MAX_LEN = 128
BATCH_SIZE = 32
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Create DataLoader for training and validation sets
train_dataset = TextDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [5]:
from transformers import BertConfig
from transformers import get_scheduler

EPOCHS = 10
learning_rate = 0.00003
num_training_steps = len(train_loader) * EPOCHS
num_warmup_steps = int(0.1 * num_training_steps)

# Initialize model
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=7, hidden_dropout_prob=0.3, attention_probs_dropout_prob=0.3)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=7)
model = model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
# optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_scheduler(
    name="linear",  # Other options: linear, cosine, polynomial, constant
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,  # Total steps
)

best_val_loss = float('inf')
patience = 3  # Stop after 2 epochs without improvement
counter = 0

for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    train_loss, train_accuracy = train_epoch(model, train_loader, optimizer, device, scheduler)
    # train_loss, train_accuracy = train_epoch(model, train_loader, optimizer, device)
    print(f"Train loss: {train_loss}, Accuracy: {train_accuracy}")
    val_loss, val_accuracy = eval_model(model, val_loader, device)
    print(f"Validation loss: {val_loss}, Accuracy: {val_accuracy}")
    
    # scheduler.step()
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
    else:
        counter += 1

    if counter >= patience:
        print("Early stopping!")
        break


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Batch 100/1427: Loss = 1.9069859981536865, LR: 2.102312543798178e-06
Batch 200/1427: Loss = 1.8750369548797607, LR: 4.204625087596356e-06
Batch 300/1427: Loss = 1.90114164352417, LR: 6.306937631394534e-06
Batch 400/1427: Loss = 1.847691535949707, LR: 8.409250175192711e-06
Batch 500/1427: Loss = 1.5544053316116333, LR: 1.051156271899089e-05
Batch 600/1427: Loss = 1.7477879524230957, LR: 1.2613875262789068e-05
Batch 700/1427: Loss = 1.6349784135818481, LR: 1.4716187806587247e-05
Batch 800/1427: Loss = 1.2729984521865845, LR: 1.6818500350385423e-05
Batch 900/1427: Loss = 1.231662392616272, LR: 1.8920812894183602e-05
Batch 1000/1427: Loss = 1.4991635084152222, LR: 2.102312543798178e-05
Batch 1100/1427: Loss = 1.3357008695602417, LR: 2.312543798177996e-05
Batch 1200/1427: Loss = 1.3269368410110474, LR: 2.5227750525578136e-05
Batch 1300/1427: Loss = 1.5366488695144653, LR: 2.7330063069376315e-05
Batch 1400/1427: Loss = 1.4038002490997314, LR: 2.9432375613174494e-05
Batch 1427/1427: Loss = 1.

21/11/24 13:01 (45min)

lr=0.00003 epochs=10
Train loss: 0.11693784097351849, Accuracy: 0.9593307198703489 
Validation loss: 2.396488595075634, Accuracy: 0.560791871058164 
comment: Increase Dropout, Add Weight Decay, add Early Stopping, Add a Learning Rate Scheduler

21/11/24 14:43 (33min)

lr=0.00003 epochs=10, drop_out=30%, weight_decay=0.01
Train loss: 0.8929176434362862, Accuracy: 0.6666958673704036
Validation loss: 1.203592354343051, Accuracy: 0.5786615276804485

In [6]:
tokenizer.save_pretrained('outputs/tokenizer_3')
model.save_pretrained('outputs/bert_full_model_3')