In [3]:
import time
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd

In [4]:
# Load the data
# Raw label value -> contiguous class id (0, 1, 2) used as the training target.
LABEL_TO_CLASS_ID = {-1: 0, 0: 1, 1: 2}

df = pd.read_csv('prepared_data.csv')
# .copy() yields an independent frame, so the column assignment below never
# writes into a slice of the frame returned by read_csv (which can raise
# SettingWithCopyWarning).
df = df[['cleaned_text', 'LabelMapped']].copy()

class_ids = df['LabelMapped'].map(LABEL_TO_CLASS_ID)
# Series.map turns every value that is missing from the mapping into NaN.
# Fail here, with the offending values, instead of letting NaN targets reach
# the training code.
unmapped_rows = class_ids.isna()
if unmapped_rows.any():
    unexpected = df.loc[unmapped_rows, 'LabelMapped'].unique().tolist()
    raise ValueError(
        f"LabelMapped contains values outside {sorted(LABEL_TO_CLASS_ID)}: {unexpected}"
    )
df['LabelMapped'] = class_ids.astype('int64')

In [5]:
# Bare expression: shows the loaded two-column frame as this cell's output.
df

Unnamed: 0,cleaned_text,LabelMapped
0,arrived broken manufacturer defect two of the ...,0
1,the cabinet dot were all detached from backing...,0
2,i received my first order of this product and ...,0
3,this product is a piece of shit do not buy doe...,0
4,went through in one day doesnt fit correct and...,0
...,...,...
255077,racaltosk ok good to know punting at metlife i...,2
255078,everyone who sat around me at metlife was so a...,1
255079,what giants or niners fans would wanna go to t...,1
255080,anybody want a ticket for tomorrow colombia vs...,2


In [9]:
# Split the data into training, validation, and test sets.
# Step 1 holds out 30% of the rows; step 2 halves that hold-out, giving a
# 70 / 15 / 15 train / validation / test partition. Both steps use one seed.
SPLIT_SEED = 42

all_texts = df['cleaned_text']
all_labels = df['LabelMapped']

train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.3,
    random_state=SPLIT_SEED,
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=SPLIT_SEED,
)

In [10]:
# Tokenizer loaded from the same checkpoint name the classifier is built from.
PRETRAINED_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)

def encode_texts(texts, max_length=128):
    """Tokenize a collection of strings with the module-level ``tokenizer``.

    Args:
        texts: The strings to encode. A pandas Series / NumPy array (anything
            with ``.tolist()``) or a plain list / other iterable of strings.
        max_length: Upper bound passed to the tokenizer; with ``truncation=True``
            longer inputs are cut to this many tokens.

    Returns:
        The tokenizer's output for the whole collection, requested as PyTorch
        tensors (``return_tensors='pt'``) with padding enabled.
    """
    # Series and arrays expose .tolist(); anything else is materialised with
    # list() so callers are not forced to wrap plain lists in a Series.
    batch = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
    return tokenizer(
        batch,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )

def _as_label_tensor(labels):
    """Return ``labels`` as a tensor; an existing tensor is passed through.

    The pass-through keeps this cell re-runnable: after the first run the
    ``*_labels`` names already hold tensors, and a tensor has no pandas
    ``.values`` array to convert.
    """
    if isinstance(labels, torch.Tensor):
        return labels
    return torch.tensor(labels.values)


def _make_dataset(encodings, labels):
    """Bundle input ids, attention mask and labels into one TensorDataset."""
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)


train_encodings = encode_texts(train_texts)
val_encodings = encode_texts(val_texts)
test_encodings = encode_texts(test_texts)

train_labels = _as_label_tensor(train_labels)
val_labels = _as_label_tensor(val_labels)
test_labels = _as_label_tensor(test_labels)

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_dataset = _make_dataset(train_encodings, train_labels)
val_dataset = _make_dataset(val_encodings, val_labels)
test_dataset = _make_dataset(test_encodings, test_labels)

In [14]:
batch_size = 8
# Only the training loader shuffles; validation and test keep dataset order.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
# NOTE(review): the model is left on the CPU here. Moving it to a GPU also
# requires every batch to be moved to the same device before the forward pass,
# and the GradScaler below to be created after the move.

# torch.optim.AdamW replaces the AdamW class that transformers deprecated.
# weight_decay=0.0 keeps the transformers default (torch's default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-8, weight_decay=0.0)
epochs = 5
# The scheduler is stepped once per batch, so its horizon is batches * epochs.
total_steps = len(train_dataloader) * epochs
# Warm up over the first 10% of steps; the step count must be an integer.
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

early_stopping_patience = 3
best_val_loss = float('inf')
patience_counter = 0

# An enabled GradScaler expects CUDA tensors, so enable it only when the model
# actually lives on a CUDA device (not merely when a GPU is present).
amp_enabled = next(model.parameters()).device.type == 'cuda'
scaler = torch.cuda.amp.GradScaler(enabled=amp_enabled)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
def train_model(model, train_dataloader, optimizer, scheduler, scaler):
    """Run one training pass over ``train_dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss``. Must have at least one parameter.
        train_dataloader: Sized iterable of ``(input_ids, attention_mask, labels)``
            tensor batches.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Learning-rate scheduler; stepped once per batch.
        scaler: Gradient scaler providing ``scale``, ``step`` and ``update``.

    Returns:
        ``(avg_train_loss, epoch_time)``: the loss averaged over batches and the
        wall-clock duration of the pass in seconds.
    """
    model.train()
    # Batches follow the model: wherever its parameters live, inputs are moved
    # there, so the same code works for a CPU model and a GPU model.
    device = next(model.parameters()).device
    # CUDA autocast only makes sense when the computation runs on a CUDA device.
    use_amp = device.type == 'cuda'

    total_loss = 0
    start_time = time.time()
    for batch in train_dataloader:
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=use_amp):
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
        total_loss += loss.item()
        # Scaled backward pass, then an optimizer step through the scaler.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
    end_time = time.time()

    avg_train_loss = total_loss / len(train_dataloader)
    epoch_time = end_time - start_time
    return avg_train_loss, epoch_time

def evaluate_model(model, dataloader):
    """Compute the mean batch loss and the accuracy of ``model`` on ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``. Must have at least
            one parameter.
        dataloader: Sized iterable of ``(input_ids, attention_mask, labels)``
            tensor batches that also exposes ``.dataset``.

    Returns:
        ``(avg_loss, accuracy)``: the loss averaged over batches, and the number
        of correct argmax predictions divided by ``len(dataloader.dataset)``.
    """
    model.eval()
    # Batches follow the model's device so a CPU-loaded checkpoint and a GPU
    # model are both evaluated without a device mismatch.
    device = next(model.parameters()).device
    # CUDA autocast only makes sense when the computation runs on a CUDA device.
    use_amp = device.type == 'cuda'

    total_loss = 0
    correct = 0
    with torch.no_grad():
        for batch in dataloader:
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
                loss = outputs.loss
            total_loss += loss.item()
            logits = outputs.logits
            # Predicted class = index of the largest logit in each row.
            preds = torch.argmax(logits, dim=1)
            correct += (preds == b_labels).sum().item()

    avg_loss = total_loss / len(dataloader)
    accuracy = correct / len(dataloader.dataset)
    return avg_loss, accuracy

# Early-stopping state is (re)initialised in this cell so that re-running it
# starts from a clean slate instead of comparing against a best loss left in
# the kernel by a previous run.
best_val_loss = float('inf')
patience_counter = 0

total_training_time = 0
for epoch in range(epochs):
    train_loss, epoch_time = train_model(model, train_dataloader, optimizer, scheduler, scaler)
    val_loss, accuracy = evaluate_model(model, val_dataloader)
    total_training_time += epoch_time
    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}, Time: {epoch_time:.2f}s')

    # Save the model after each epoch
    epoch_dir = f'bert_model_epoch_{epoch+1}'
    model.save_pretrained(epoch_dir)
    tokenizer.save_pretrained(epoch_dir)

    if val_loss < best_val_loss:
        # New best validation loss: remember it, reset patience, keep a copy.
        best_val_loss = val_loss
        patience_counter = 0
        model.save_pretrained('bert_model_best')
        tokenizer.save_pretrained('bert_model_best')
    else:
        # No improvement: stop once this has happened
        # early_stopping_patience epochs in a row.
        patience_counter += 1
        if patience_counter >= early_stopping_patience:
            print("Early stopping triggered")
            break

print(f'Total training time: {total_training_time:.2f}s')

# Load the best model for final evaluation
# (the in-memory model holds the last epoch's weights, not the best ones).
model = BertForSequenceClassification.from_pretrained('bert_model_best')
test_loss, accuracy = evaluate_model(model, test_dataloader)
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')

# Save the final model
model.save_pretrained('bert_model_final')
tokenizer.save_pretrained('bert_model_final')



Epoch 1/5, Train Loss: 0.6453, Val Loss: 0.5661, Accuracy: 0.7527, Time: 28950.26s
Epoch 2/5, Train Loss: 0.5155, Val Loss: 0.5487, Accuracy: 0.7654, Time: 28996.82s
Epoch 3/5, Train Loss: 0.4140, Val Loss: 0.5952, Accuracy: 0.7581, Time: 28757.12s
Epoch 4/5, Train Loss: 0.3196, Val Loss: 0.6786, Accuracy: 0.7521, Time: 29049.76s
Epoch 5/5, Train Loss: 0.2500, Val Loss: 0.7781, Accuracy: 0.7504, Time: 29569.60s
Early stopping triggered
Total training time: 145323.55s
Test Loss: 0.5424
Test Accuracy: 0.7689


('bert_model_final/tokenizer_config.json',
 'bert_model_final/special_tokens_map.json',
 'bert_model_final/vocab.txt',
 'bert_model_final/added_tokens.json')