In [None]:
!pip install pytorch_lightning
!pip install transformers
!pip install torchmetrics

In [None]:
import torch.nn as nn
import torch
import pandas as pd
import pytorch_lightning as pl
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW, Adam
from torch.utils.data import DataLoader, Dataset
from pytorch_lightning import seed_everything
import torch.nn.functional as F
import numpy as np
from sklearn.model_selection import train_test_split
from torchmetrics.classification import F1Score, Accuracy

# Seed the random number generators through Lightning so runs are repeatable.
random_seed = 1270
seed_everything(random_seed)

print(f"Random seed used: {random_seed}")
# Ask cuDNN for deterministic kernels and switch off its autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Hyper-parameters shared by the rest of the notebook.
MAX_LENGTH = 160
TRAIN_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32
NUM_EPOCHS = 2
LEARNING_RATE = 3e-5

print(f"LR: {LEARNING_RATE}")

print("Loading data...")
# Read the datasets from their respective CSV files
train_df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

bytes_per_mb = 1024**2
print(f'Training Set Shape = {train_df.shape}')
print(f'Training Set Memory Usage = {train_df.memory_usage().sum() / bytes_per_mb:.2f} MB')
print(f'Test Set Shape = {test_df.shape}')
print(f'Test Set Memory Usage = {test_df.memory_usage().sum() / bytes_per_mb:.2f} MB')

# Split the original training data into a new training set and a validation set
# (80/20, stratified on the label so both parts keep the class balance).
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df['target'])
# The split hands back selections of the original frame; take an explicit copy
# before adding a column so the write is unambiguous (avoids pandas'
# SettingWithCopyWarning).
train_df = train_df.copy()

# Character length of every tweet, used only for the summary statistics below.
train_df["length"] = train_df["text"].str.len()
test_df["length"] = test_df["text"].str.len()

print("Train Length Stat")
print(train_df["length"].describe())
print()

print("Test Length Stat")
print(test_df["length"].describe())
print()

# Custom Dataset class for DataLoader
print("Preparing custom dataset...")
class DisasterTweetDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item, on access.

    Args:
        df: DataFrame with a 'text' column and, unless ``is_test`` is true,
            a 'target' column holding the class label.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=...)``; its result must expose
            'input_ids' and 'attention_mask'.
        is_test: When true, labels are not read and items have no 'labels' key.
        max_length: Length passed to the tokenizer for padding/truncation.
            ``None`` (the default) falls back to the module-level
            ``MAX_LENGTH`` so existing call sites behave as before.
    """

    def __init__(self, df, tokenizer, is_test=False, max_length=None):
        self.texts = df['text'].tolist()
        self.tokenizer = tokenizer
        self.is_test = is_test
        # Resolve the global lazily so the class itself does not depend on it
        # when a length is supplied explicitly.
        self.max_length = MAX_LENGTH if max_length is None else max_length
        self.labels = None if is_test else df['target'].tolist()

    def __len__(self):
        """Return the number of tweets in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the tweet at ``idx``.

        Returns:
            dict with tensors 'input_ids' and 'attention_mask', plus a scalar
            'labels' tensor (int64) when the dataset is not a test dataset.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
        )
        item = {
            'input_ids': torch.tensor(encoding['input_ids']),
            'attention_mask': torch.tensor(encoding['attention_mask']),
        }
        if not self.is_test:
            # Class-index targets must be integer typed for CrossEntropyLoss.
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Tokenizer loaded from the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# One dataset per split; only the test split is built without labels.
train_dataset = DisasterTweetDataset(df=train_df, tokenizer=tokenizer)
val_dataset = DisasterTweetDataset(df=val_df, tokenizer=tokenizer)
test_dataset = DisasterTweetDataset(df=test_df, tokenizer=tokenizer, is_test=True)

# Batch each dataset; the training loader is the only one that shuffles.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=False,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
)

class DisasterTweetClassifier(pl.LightningModule):
    """LightningModule that fine-tunes DistilBERT for two-class tweet classification.

    Batches are dicts with 'input_ids' and 'attention_mask' (plus 'labels'
    for training/validation). The test loop writes 'submission.csv' using the
    ids of the module-level ``test_df``.

    Args:
        lr: Learning rate handed to the optimizer.
        num_warmup_steps: Number of scheduler steps spent in linear warm-up.
    """

    def __init__(self, lr, num_warmup_steps=150):
        super().__init__()
        num_labels = 2  # Binary classification: target is either 0 or 1
        self.model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)
        self.loss = nn.CrossEntropyLoss()
        self.lr = lr
        self.num_warmup_steps = num_warmup_steps
        # Separate metric objects per stage so training batches never leak
        # into the validation metric state. With task='binary' the metrics
        # take hard 0/1 predictions; class-count/averaging options do not
        # apply to the binary task and are therefore not passed.
        self.train_f1 = F1Score(task='binary')
        self.val_f1 = F1Score(task='binary')
        self.train_accuracy = Accuracy(task='binary')
        self.val_accuracy = Accuracy(task='binary')
        self.test_outputs = []  # per-batch {'logits': ...} dicts from the test loop
        self.train_losses = []  # per-batch training losses of the current epoch
        self.train_f1_scores = []  # per-batch training F1 values of the current epoch

    def forward(self, inputs):
        """Return the classifier logits for a dict with 'input_ids' and 'attention_mask'."""
        return self.model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask']).logits

    def _batch_logits(self, batch):
        """Run the model on the token fields of ``batch``, ignoring any labels."""
        return self.forward({'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']})

    def training_step(self, batch, batch_idx):
        """Compute the loss for one batch and log batch-level loss, accuracy and F1."""
        labels = batch['labels']
        logits = self._batch_logits(batch)
        loss = self.loss(logits, labels)
        preds = torch.argmax(logits, dim=1)
        batch_f1 = self.train_f1(preds, labels)
        batch_acc = self.train_accuracy(preds, labels)
        # Keep detached copies so the epoch-end averages can be computed
        # without holding on to the autograd graph.
        self.train_losses.append(loss.detach())
        self.train_f1_scores.append(batch_f1.detach())
        self.log('train_acc', batch_acc, prog_bar=True)
        self.log('train_loss', loss, prog_bar=True)
        self.log('f1', batch_f1, prog_bar=True)
        return {'loss': loss}

    def on_train_epoch_end(self):
        """Log the mean of the per-batch loss and F1 values, then reset the buffers."""
        if self.train_losses:
            self.log('avg_train_loss', torch.stack(self.train_losses).mean())
        if self.train_f1_scores:
            self.log('avg_train_f1', torch.stack(self.train_f1_scores).mean())
        self.train_losses = []
        self.train_f1_scores = []
        # The metric objects accumulate state on every call; start each epoch clean.
        self.train_f1.reset()
        self.train_accuracy.reset()

    def configure_optimizers(self):
        """Build the optimizer and a per-step linear warm-up/decay schedule."""
        # AdamW applies decoupled weight decay; plain Adam would fold the
        # decay into the gradient as an L2 penalty.
        optimizer = AdamW(self.parameters(), lr=self.lr, weight_decay=0.01)
        # Ask the trainer for the real number of optimizer steps instead of
        # deriving it from globals (which also missed the partial last batch).
        total_steps = self.trainer.estimated_stepping_batches
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.num_warmup_steps,
            num_training_steps=total_steps,
        )
        return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]

    def validation_step(self, batch, batch_idx):
        """Compute validation loss and update the validation metrics for one batch."""
        labels = batch['labels']
        logits = self._batch_logits(batch)
        loss = self.loss(logits, labels)
        preds = torch.argmax(logits, dim=1)
        self.val_accuracy.update(preds, labels)
        self.val_f1.update(preds, labels)
        # Logging the metric objects lets Lightning compute them over the
        # whole validation set at epoch end and reset them afterwards.
        self.log('val_acc', self.val_accuracy, prog_bar=True)
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_f1', self.val_f1, prog_bar=True)
        return {'val_loss': loss}

    def test_step(self, batch, batch_idx):
        """Run the model on an unlabeled batch and stash its logits for epoch end."""
        logits = self._batch_logits(batch)
        # Store on CPU so accumulated outputs do not occupy accelerator memory.
        self.test_outputs.append({'logits': logits.detach().cpu()})
        return {'logits': logits}

    def on_test_epoch_end(self):
        """Turn the collected logits into predictions and write 'submission.csv'."""
        if not self.test_outputs:
            # Nothing was collected (e.g. empty loader); torch.cat would fail.
            return
        all_logits = torch.cat([out['logits'] for out in self.test_outputs], dim=0)
        all_preds = torch.argmax(all_logits, dim=1)
        ids = test_df['id'].values  # relies on the loader preserving test_df order
        submission_df = pd.DataFrame({'id': ids, 'target': all_preds.cpu().numpy()})
        submission_df.to_csv('submission.csv', index=False)
        self.test_outputs = []  # ready for a subsequent test run

    def test_step_end(self, batch_parts):
        """Pass step outputs through unchanged.

        ``test_step`` already records the logits; appending here as well would
        store every batch twice on Lightning versions that call this hook.
        """
        return batch_parts
        
# Initialize the model
print("Initializing model...")
model = DisasterTweetClassifier(lr=LEARNING_RATE)

# Initialize Trainer
print("Initializing trainer...")
trainer = pl.Trainer(
    max_epochs=NUM_EPOCHS,
    accelerator="auto",
)

# Train the model. The loaders prepared earlier are reused so the training
# data is actually shuffled (building a fresh DataLoader here without
# shuffle=True would feed the samples in file order every epoch).
print("Training the model...")
try:
    trainer.fit(model, train_loader, val_loader)
except Exception as e:
    if isinstance(e, RuntimeError) and "out of memory" in str(e):
        print("ERROR: Out of memory")
    else:
        print("An unexpected error occurred during training.")
        print(str(e))
    # Re-raise: continuing would evaluate a model whose training failed and
    # write a submission file from it.
    raise

print("Testing the model...")
trainer.test(model, test_loader)