In [None]:
# Notebook shell escapes: upgrade PyTorch Lightning and Hugging Face Transformers
# before the imports in the next cell are executed.
!pip install --upgrade pytorch_lightning
!pip install --upgrade transformers

In [None]:
import torch.nn as nn
import torch
import json
import os
import pandas as pd
import pytorch_lightning as pl
from pandas import DataFrame
from transformers import AlbertTokenizerFast, AlbertForQuestionAnswering
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import f1_score
from pytorch_lightning import seed_everything
from sklearn.base import BaseEstimator, ClassifierMixin
from pytorch_lightning.callbacks import ModelCheckpoint
import torch.nn.functional as F
import random
import numpy as np

# Seed selection. The seed is fixed; to draw a fresh one per run instead, use:
#random_seed = random.randint(0, 10000)
random_seed = 1270

# Apply the fixed seed through pytorch_lightning.seed_everything for reproducibility
seed_everything(random_seed)

# Print the seed for reference
print(f"Random seed used: {random_seed}")

# Constants
print(f"CPU cores {os.cpu_count()}")
MAX_LENGTH = 384  # Tokenizer padding/truncation length; SQuAD requires longer sequences
TRAIN_BATCH_SIZE = 12  # Batch size of the training DataLoader (reduced for SQuAD)
VAL_BATCH_SIZE = 18  # Batch size of the validation DataLoader (reduced for SQuAD)
NUM_EPOCHS = 3  # Passed to the Trainer as max_epochs
LEARNING_RATE = 3e-5  # Learning rate handed to SquadModel

print(f"Learning Rate: {LEARNING_RATE} / Epochs: {NUM_EPOCHS}")

def prepare_qa_generator(path):
    """Yield one flat record per answer found in a SQuAD-format JSON file.

    The file is parsed eagerly; records are then produced lazily while walking
    ``data -> paragraphs -> qas -> answers``.

    Args:
        path: Location of the SQuAD JSON file.

    Yields:
        dict with the keys ``id``, ``context``, ``question``, ``answer_start``,
        ``answer_end`` (``answer_start + len(text)``), ``text`` and
        ``is_impossible``.

    Note:
        A question whose ``answers`` list is empty produces no record at all,
        because records are emitted per answer. A question with several
        answers produces one record per answer, all sharing the same ``id``.
    """
    # JSON interchange files are UTF-8; do not depend on the platform's locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        squad_dict = json.load(f)

    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                qid = qa['id']
                # The flag is absent in SQuAD v1.1 files; treat a missing flag as "answerable".
                is_impossible = qa.get('is_impossible', False)
                for answer in qa['answers']:
                    answer_start = answer['answer_start']
                    text = answer['text']
                    yield {
                        'id': qid,
                        'context': context,
                        'question': question,
                        'answer_start': answer_start,
                        'answer_end': answer_start + len(text),
                        'text': text,
                        'is_impossible': is_impossible,
                    }

def prepare_qa(path):
    """Load a SQuAD-format JSON file into a DataFrame with one row per answer record."""
    records = prepare_qa_generator(path)
    frame = pd.DataFrame(records)
    return frame

def modify_answer_context(df):
    """Align each row's answer span with its context text, editing ``df`` in place.

    For every row the answer text is looked up in the context at the recorded
    ``answer_start`` and, failing that, one and then two characters earlier.
    The first position that matches is written back: ``answer_end`` always,
    ``answer_start`` only when it had to be shifted.

    Raises:
        ValueError: If the answer text is found at none of the three positions.
    """
    for label, record in df.iterrows():
        passage = record['context']
        answer = record['text']
        begin = record['answer_start']
        finish = begin + len(answer)

        # Try the recorded offset first, then the two preceding offsets.
        for shift in (0, 1, 2):
            if passage[begin - shift:finish - shift] != answer:
                continue
            if shift:
                df.at[label, 'answer_start'] = begin - shift
            df.at[label, 'answer_end'] = finish - shift
            break
        else:
            raise ValueError("Answer indices do not match with the context text.")

# Flatten the SQuAD 2.0 train and dev JSON files (Kaggle input paths) into DataFrames
train_df = prepare_qa('/kaggle/input/squad-2/train-v2.0.json')
val_df = prepare_qa('/kaggle/input/squad-2/dev-v2.0.json')            

# In-place modification of DataFrame: answer_start/answer_end are aligned with
# the context text; a ValueError is raised if an answer cannot be located
modify_answer_context(train_df)
modify_answer_context(val_df)

# Announce the dataset-definition step that follows
print("Preparing custom dataset...")
# Custom Dataset class for DataLoader
class SquadDataset(Dataset):
    """Map-style dataset that tokenizes one flattened SQuAD row per item.

    Each item is a dict holding the tokenizer outputs for the
    (context, question) pair, padded/truncated to ``max_length``, plus
    ``start_positions``, ``end_positions``, ``id`` and ``is_impossible``.

    Args:
        df: DataFrame with the columns ``context``, ``question``,
            ``answer_start``, ``answer_end``, ``id`` and ``is_impossible``.
        tokenizer: Optional ready-made tokenizer to reuse. When omitted, the
            ``albert-base-v2`` fast tokenizer is loaded.
        max_length: Optional padded sequence length. Defaults to the
            module-level ``MAX_LENGTH``.
    """

    def __init__(self, df, tokenizer=None, max_length=None):
        self.df = df
        # Resolved at call time so the module-level constant remains the default.
        self.max_length = MAX_LENGTH if max_length is None else max_length
        if tokenizer is None:
            # Initialize the tokenizer
            print("Initializing tokenizer...")
            tokenizer = AlbertTokenizerFast.from_pretrained('albert-base-v2')
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        encodings = self.tokenizer(
            row['context'],
            row['question'],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The context is passed first, so the answer offsets are resolved
        # against the first sequence of batch entry 0.
        start_positions = encodings.char_to_token(0, row['answer_start'])
        end_positions = encodings.char_to_token(0, row['answer_end'] - 1)

        # char_to_token yields None when no token covers the character (for
        # example after truncation). Mark such positions with the padded
        # length: it is one past the last valid index and is the same sentinel
        # the metric code in this file compares against (MAX_LENGTH by default).
        if start_positions is None:
            start_positions = self.max_length
        if end_positions is None:
            end_positions = self.max_length

        encodings.update({'start_positions': start_positions, 'end_positions': end_positions, 'id': row['id']})
        encodings.update({'is_impossible': row['is_impossible']})
        return {key: self._as_collatable(val) for key, val in encodings.items()}

    @staticmethod
    def _as_collatable(val):
        """Convert one item value into something the default collate can batch."""
        if torch.is_tensor(val):
            return val.clone().detach()
        if isinstance(val, str):
            # Strings (the example id) are passed through untouched.
            return val
        if isinstance(val, np.bool_):
            # Convert through a plain Python bool before building the tensor.
            return torch.tensor(bool(val))
        return torch.tensor(val)
   
    
# Create datasets
# Wrap the train and validation DataFrames in SquadDataset instances.
print("Creating datasets...")
train_dataset = SquadDataset(train_df)
val_dataset = SquadDataset(val_df)

#Config
# Hyperparameters for the ALBERT question-answering model. The dict is unpacked
# into AlbertConfig(**albert_config) when SquadModel is constructed.
albert_config = {
  "architectures": ["AlbertForQuestionAnswering"],
  "attention_probs_dropout_prob": 0,  # attention dropout disabled
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0,  # hidden-state dropout disabled
  "hidden_size": 768,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 12,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "num_memory_blocks": 0,
  "output_past": True,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30000
}

# Define Model
print("Defining the model...")
class SquadModel(pl.LightningModule):
    """LightningModule that fine-tunes ``AlbertForQuestionAnswering`` on SQuAD-style batches.

    Batches are dicts of tokenizer outputs plus ``start_positions``,
    ``end_positions``, ``id`` and ``is_impossible``. Training and validation
    both log the loss, an exact-match rate and a token-overlap F1.

    Args:
        lr: Learning rate handed to AdamW (the peak of the warmup/decay schedule).
    """

    def __init__(self, lr):
        super().__init__()
        from transformers import AlbertConfig
        albert_configuration = AlbertConfig(**albert_config)
        self.model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2', config=albert_configuration)
        self.loss = nn.CrossEntropyLoss()
        self.lr = lr  # set learning rate
        self.train_losses = []  # detached per-step losses of the current epoch
        self.best_threshold = 0.0

        # Initialize variables for validation metrics
        self.val_exact_match_accumulated = 0
        self.val_f1_accumulated = 0
        self.val_count = 0
        # Per-batch no-answer scores and labels of the most recent validation epoch
        self.all_no_answer_scores = []
        self.actual_no_answer = []

    def forward(self, inputs):
        """Run the wrapped model and return ``(loss, start_logits, end_logits)``.

        The ``id`` and ``is_impossible`` entries are dropped, and 3-D tensors
        have their dimension 1 squeezed before the call.
        """
        inputs_without_id = {k: v.squeeze(1) if len(v.shape) == 3 else v for k, v in inputs.items() if k not in ['id', 'is_impossible']}
        outputs = self.model(**inputs_without_id)
        return outputs.loss, outputs.start_logits, outputs.end_logits

    @staticmethod
    def _span_metrics(start_logits, end_logits, start_positions, end_positions, is_impossible):
        """Return ``(exact_match_count, mean_f1)`` for one batch of span predictions."""
        start_preds = torch.argmax(start_logits, dim=1)
        end_preds = torch.argmax(end_logits, dim=1)

        # Exact Match
        # NOTE(review): argmax returns indices below the logits' width, so this
        # sentinel comparison can only fire for sequences longer than MAX_LENGTH
        # -- presumably never with inputs padded to MAX_LENGTH; confirm intent.
        predicted_no_answer = (start_preds == MAX_LENGTH) & (end_preds == MAX_LENGTH)
        exact_match = ((start_preds == start_positions) & (end_preds == end_positions)) | (predicted_no_answer & is_impossible)
        exact_match = exact_match.float().sum()

        # F1 over inclusive token ranges, computed without a per-example Python
        # loop. A reversed span (end < start) counts as empty.
        pred_len = (end_preds - start_preds + 1).clamp(min=0)
        true_len = (end_positions - start_positions + 1).clamp(min=0)
        overlap = (torch.minimum(end_preds, end_positions) - torch.maximum(start_preds, start_positions) + 1).clamp(min=0)
        total = pred_len + true_len
        # Two empty spans would divide by zero; score them 0 instead.
        f1 = torch.where(
            total > 0,
            2.0 * overlap / total.clamp(min=1),
            torch.zeros_like(total, dtype=torch.float),
        )
        return exact_match, f1.mean()

    def training_step(self, batch, batch_idx):
        """Compute the loss for one batch, log training metrics and return the loss."""
        loss, start_logits, end_logits = self.forward(batch)
        # Store a detached copy: the attached tensor would keep a reference to
        # its autograd graph until the end of the epoch.
        self.train_losses.append(loss.detach())

        exact_match, f1 = self._span_metrics(
            start_logits, end_logits,
            batch['start_positions'], batch['end_positions'], batch['is_impossible'],
        )
        # Use the real batch size so a short final batch is weighted correctly.
        batch_size = start_logits.size(0)

        # Log metrics
        self.log('train_loss', loss, prog_bar=True, sync_dist=True, batch_size=batch_size)
        self.log('train_exact_match', exact_match / batch_size, prog_bar=True, sync_dist=True, batch_size=batch_size)
        self.log('train_f1', f1, prog_bar=True, sync_dist=True, batch_size=batch_size)

        return loss

    def on_train_epoch_end(self):
        """Log the mean of the epoch's step losses and reset the buffer."""
        if self.train_losses:
            avg_loss = torch.stack(self.train_losses).mean()  # Compute average loss
            self.log('avg_train_loss', avg_loss, sync_dist=True, batch_size=TRAIN_BATCH_SIZE)
        self.train_losses = []  # Clear the list for the next epoch

    def configure_optimizers(self):
        """Return AdamW with a per-step linear warmup/decay schedule."""
        optimizer = AdamW(self.parameters(), lr=self.lr)
        # Ask the trainer for the number of optimizer steps it will run, rather
        # than flooring len(dataset) // batch_size from module globals, which
        # undercounts when the last batch is partial.
        total_steps = self.trainer.estimated_stepping_batches
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=150, num_training_steps=total_steps)
        return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]

    def on_validation_epoch_start(self):
        """Drop the previous epoch's no-answer scores so the lists cover one epoch only."""
        self.all_no_answer_scores = []
        self.actual_no_answer = []

    def validation_step(self, batch, batch_idx):
        """Evaluate one batch, log validation metrics and return them as a dict."""
        if not batch:
            return None  # Skip this batch if it's empty
        loss, start_logits, end_logits = self.forward(batch)
        is_impossible = batch['is_impossible']

        exact_match, f1 = self._span_metrics(
            start_logits, end_logits,
            batch['start_positions'], batch['end_positions'], is_impossible,
        )
        batch_size = start_logits.size(0)

        # Compute "no answer" score: the smaller of the start/end probabilities
        # at token index 0 (presumably the [CLS] token -- confirm).
        no_answer_score_start = F.softmax(start_logits, dim=-1)[:, 0]
        no_answer_score_end = F.softmax(end_logits, dim=-1)[:, 0]
        no_answer_score = torch.min(no_answer_score_start, no_answer_score_end)

        # Store 'no answer' scores and labels for tuning the threshold later.
        # NOTE(review): self.best_threshold is not applied to the exact-match
        # computation in this class; confirm whether it should be.
        self.all_no_answer_scores.append(no_answer_score)
        self.actual_no_answer.append(is_impossible)

        # Log metrics
        self.log('val_loss', loss, prog_bar=True, sync_dist=True, batch_size=batch_size)
        self.log('val_exact_match', exact_match / batch_size, prog_bar=True, sync_dist=True, batch_size=batch_size)
        self.log('val_f1', f1, prog_bar=True, sync_dist=True, batch_size=batch_size)

        # Accumulate the metrics
        self.val_exact_match_accumulated += exact_match.item()
        self.val_f1_accumulated += f1.item() * batch_size
        self.val_count += batch_size

        return {'val_loss': loss, 'val_exact_match': exact_match, 'val_f1': f1}

    def on_validation_epoch_end(self):
        """Reset the accumulated validation counters."""
        # Clear variables for the next epoch
        self.val_exact_match_accumulated = 0
        self.val_f1_accumulated = 0
        self.val_count = 0
    
# Initialize the model
print("Initializing model...")
# Shuffle the training examples every epoch: the rows come out of the JSON file
# grouped by article/paragraph, so unshuffled batches would be highly correlated.
# Validation order is left as is.
train_dataloader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
val_dataloader = DataLoader(val_dataset, batch_size=VAL_BATCH_SIZE, num_workers=0)

model = SquadModel(lr=LEARNING_RATE)

# Create a ModelCheckpoint callback
checkpoint_callback = ModelCheckpoint(
    dirpath="/kaggle/working/checkpoints",  # directory where checkpoints will be saved
    # Lightning inserts the metric name itself ("epoch=NN"); spelling "epoch"
    # in front of the placeholder as well would render "squad-epochepoch=NN".
    filename="squad-{epoch:02d}",  # checkpoint file name
    save_top_k=1,  # keep only the best checkpoint
    verbose=True,  # print saving info
    save_last=True,  # save the last epoch's checkpoint
    monitor="val_loss",  # metric to monitor (could be validation loss, F1 score, etc.)
    mode="min"  # save the model with the minimum validation loss
)

# Initialize Trainer
print("Initializing trainer...")
trainer = pl.Trainer(
    max_epochs=NUM_EPOCHS,  # upper bound on the number of training epochs
    accelerator="auto",
    logger=pl.loggers.TensorBoardLogger('/kaggle/working/logs/', name='squad', version=0),
    callbacks=[checkpoint_callback],
)
print("End. Waiting for the next step.")

In [None]:
# Run learning rate finder (Doesn't work on multi-GPUs)
# NOTE(review): recent Lightning releases appear to default lr_find to
# update_attr=True, which would overwrite model.lr with the suggestion --
# confirm whether that is wanted, since the manual update below is commented out.
print("Running learning rate finder")
tuner = pl.tuner.Tuner(trainer)
lr_finder = tuner.lr_find(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader, min_lr=1e-6, max_lr=1.0, early_stop_threshold=None)

if lr_finder is None:
    # lr_find may return nothing; avoid an AttributeError on the lines below.
    print("No new LR suggested.")
else:
    # Print the results (a bare expression in the middle of a cell shows nothing)
    print("LR Finder Results:")
    print(lr_finder.results)

    # Plot the results
    fig = lr_finder.plot(suggest=True)
    if fig is not None:
        fig.show()
    else:
        print("No figure to show.")

    # Get the suggested learning rate
    new_lr = lr_finder.suggestion()
    if new_lr is not None:
        print(f"Suggested new LR: {new_lr}")
        #print("Setting a new instance with that LR")
        #model = SquadModel(lr=new_lr)
    else:
        print("No new LR suggested.")

In [None]:
# Run training; any failure is reported on stdout instead of propagating.
try:
    print("Training the model...")
    trainer.fit(model, train_dataloader, val_dataloader)
except Exception as err:
    details = str(err)
    # Only a RuntimeError mentioning "out of memory" gets the short OOM notice;
    # every other failure prints the generic header followed by its message.
    ran_out_of_memory = isinstance(err, RuntimeError) and "out of memory" in details
    if ran_out_of_memory:
        print("ERROR: Out of memory")
    else:
        print("An unexpected error occurred during training.")
        print(details)

In [None]:
# Save the model
print("Saving the model...")
model.model.save_pretrained("/kaggle/working/")

# Save the tokenizer
# No module-level `tokenizer` exists yet at this point (it is only created in
# the later pipeline cell), so reuse the tokenizer owned by the training dataset.
tokenizer_save_path = "/kaggle/working/"
train_dataset.tokenizer.save_pretrained(tokenizer_save_path)

# Save training args
# A plain dict of training settings, serialized with torch.save next to the
# model. Values that exist as constants in this notebook are referenced
# directly so the saved record cannot drift from what was actually run.
training_args_dict = {
    "_n_gpu": 1,
    "adafactor": False,
    "adam_beta1": 0.9,
    "adam_beta2": 0.999,
    "adam_epsilon": 1e-08,
    "auto_find_batch_size": False,
    "bf16": False,
    "bf16_full_eval": False,
    "data_seed": None,
    "dataloader_drop_last": False,
    "dataloader_num_workers": 0,
    "dataloader_pin_memory": True,
    "ddp_backend": None,
    "ddp_broadcast_buffers": None,
    "ddp_bucket_cap_mb": None,
    "ddp_find_unused_parameters": None,
    "ddp_timeout": 1800,
    "debug": [],
    "deepspeed": None,
    "disable_tqdm": False,
    "dispatch_batches": None,
    "do_eval": True,
    "do_predict": False,
    "do_train": True,
    "eval_accumulation_steps": None,
    "eval_delay": 0,
    "eval_steps": None,
    "evaluation_strategy": "epoch",
    "fp16": False,
    "fp16_backend": "auto",
    "fp16_full_eval": False,
    "fp16_opt_level": "O1",
    "fsdp": "",
    "fsdp_config": None,
    "fsdp_min_num_params": 0,
    "fsdp_transformer_layer_cls_to_wrap": None,
    "full_determinism": False,
    "gradient_accumulation_steps": 1,
    "gradient_checkpointing": False,  # key was misspelled "gradient_checkpoisnting"
    "greater_is_better": True,
    "group_by_length": False,
    "half_precision_backend": "auto",
    "hub_always_push": False,
    # NOTE(review): names a DistilBERT repo although this notebook trains ALBERT -- confirm.
    "hub_model_id": "distilbert-base-uncased-squad2",
    "hub_private_repo": False,
    "hub_strategy": "every_save",
    "hub_token": "<HUB_TOKEN>",
    "ignore_data_skip": False,
    "include_inputs_for_metrics": False,
    "jit_mode_eval": False,
    "label_names": None,
    "label_smoothing_factor": 0.0,
    "learning_rate": LEARNING_RATE,
    "length_column_name": "length",
    "load_best_model_at_end": True,
    "local_rank": -1,
    "log_level": -1,
    "log_level_replica": -1,
    "log_on_each_node": True,
    "logging_dir": "/opt/ml/output/data/logs",
    "logging_first_step": False,
    "logging_nan_inf_filter": True,
    "logging_steps": 500,
    "logging_strategy": "steps",
    "lr_scheduler_type": "linear",
    "max_grad_norm": 1.0,
    "max_steps": -1,
    "metric_for_best_model": "f1_score",
    "mp_parameters": "",
    "no_cuda": False,
    "num_train_epochs": NUM_EPOCHS,
    "optim": "adamw_torch",
    "optim_args": None,
    "output_dir": "/opt/ml/model",
    "overwrite_output_dir": False,
    "past_index": -1,
    # Record the batch sizes the DataLoaders in this notebook really use.
    "per_device_eval_batch_size": VAL_BATCH_SIZE,
    "per_device_train_batch_size": TRAIN_BATCH_SIZE,
    "prediction_loss_only": False,
    "push_to_hub": False,
    "push_to_hub_model_id": None,
    "push_to_hub_organization": None,
    "push_to_hub_token": "<PUSH_TO_HUB_TOKEN>",
    "ray_scope": "last",
    "remove_unused_columns": True,
    "report_to": ["tensorboard"],
    "resume_from_checkpoint": None,
    "run_name": "/opt/ml/model",
    "save_on_each_node": False,
    "save_safetensors": False,
    "save_steps": 500,
    "save_strategy": "epoch",
    "save_total_limit": 2,
    "seed": random_seed,
    "sharded_ddp": [],
    "skip_memory_metrics": True,
    "tf32": None,
    "torch_compile": False,
    "torch_compile_backend": None,
    "torch_compile_mode": None,
    "torchdynamo": None,
    "tpu_metrics_debug": False,
    "tpu_num_cores": None,
    "use_cpu": False,
    "use_ipex": False,
    "use_legacy_prediction_loop": False,
    "use_mps_device": False,
    "warmup_ratio": 0.0,
    "warmup_steps": 150,
    "weight_decay": 0.01,
}
torch.save(training_args_dict, "/kaggle/working/training_args.bin")
print("Model and config saved")

In [None]:
# Test the model
from transformers import QuestionAnsweringPipeline
# Initialize the QA pipeline
tokenizer = AlbertTokenizerFast.from_pretrained('albert-base-v2')
qa_pipeline = QuestionAnsweringPipeline(model=model.model, tokenizer=tokenizer)  # Note the `model.model`

# Ask a question
# (the comma after the context argument was missing, which made this call a SyntaxError)
result = qa_pipeline(
    question="Which name is also used to describe the Amazon rainforest in English?",
    context='''The Amazon rainforest (Portuguese: Floresta Amazônica or Amazônia; Spanish: Selva Amazónica, Amazonía or usually Amazonia; French: Forêt amazonienne; Dutch: Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle, is a moist broadleaf forest that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest. This region includes territory belonging to nine nations. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana. States or departments in four nations contain "Amazonas" in their names. The Amazon represents over half of the planet's remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual trees divided into 16,000 species.''',
    handle_impossible_answer=True  # important!
)

print(result)