# We install the necessary libraries

In [14]:
# import clear_output to clear output during library installation
from IPython.display import clear_output

# Install necessary libraries
!pip install transformers datasets torch tqdm gradio tensorflow
!pip install pyarrow==9.0.0 datasets
!pip install --upgrade datasets

clear_output()


# We import the necesary libraries

In [13]:
from transformers import pipeline
import tensorflow as tf
import torch
from transformers import BertTokenizer, BertForQuestionAnswering, AdamW
from transformers import BertTokenizerFast # Import the fast tokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm
import gradio as gr

# In this process, we perform a number of tasks.
First, we load the dataset and use a smaller subset of the dataset for training. This is due to the time and GPU constraints.

In [11]:
# 1. Load and preprocess the SQuAD dataset
def load_and_preprocess_data():
    dataset = load_dataset("squad")
    train_dataset = dataset["train"]
    val_dataset = dataset["validation"]

    # Select a smaller subset for testing
    train_dataset = train_dataset.select(range(1000))  # Use only first 1000 examples due to time constraints
    val_dataset = val_dataset.select(range(100))

    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

    def preprocess_function(examples):
        questions = [q.strip() for q in examples["question"]]
        inputs = tokenizer(
            questions,
            examples["context"],
            max_length=384,
            truncation="only_second",
            return_offsets_mapping=True,
            padding="max_length",
        )

        offset_mapping = inputs.pop("offset_mapping")
        answers = examples["answers"]
        start_positions = []
        end_positions = []

        for i, offset in enumerate(offset_mapping):
            answer = answers[i]
            start_char = answer["answer_start"][0]
            end_char = answer["answer_start"][0] + len(answer["text"][0])
            sequence_ids = inputs.sequence_ids(i)

            idx = 0
            while sequence_ids[idx] != 1:
                idx += 1
            context_start = idx
            while sequence_ids[idx] == 1:
                idx += 1
            context_end = idx - 1

            if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
                start_positions.append(0)
                end_positions.append(0)
            else:
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1
                start_positions.append(idx - 1)

                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1
                end_positions.append(idx + 1)

        inputs["start_positions"] = start_positions
        inputs["end_positions"] = end_positions

        # Keep original fields
        for key in ['context', 'question', 'answers']:
            inputs[key] = examples[key]

        return inputs

    # Use remove_columns only for train_dataset
    train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
    val_dataset = val_dataset.map(preprocess_function, batched=True)

    # Add debugging information
    print("Keys in train_dataset:", train_dataset.column_names)
    print("Keys in val_dataset:", val_dataset.column_names)
    print("First example in train_dataset:", train_dataset[0])
    print("First example in val_dataset:", val_dataset[0])

    return train_dataset, val_dataset, tokenizer

In [10]:
def fine_tune_bert(train_dataset, val_dataset):
    model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=5e-5)

    # Convert datasets to PyTorch format
    train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'start_positions', 'end_positions'])
    val_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

    # Reduce the DataLoader train sizes due to memory constraints
    #train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    #val_loader = DataLoader(val_dataset, batch_size=16)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=8)

    num_epochs = 3
    for epoch in range(num_epochs):
        model.train()
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            start_positions = batch["start_positions"].to(device)
            end_positions = batch["end_positions"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask,
                            start_positions=start_positions,
                            end_positions=end_positions)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                start_positions = batch["start_positions"].to(device)
                end_positions = batch["end_positions"].to(device)

                outputs = model(input_ids, attention_mask=attention_mask,
                                start_positions=start_positions,
                                end_positions=end_positions)
                val_loss += outputs.loss.item()

        print(f"Epoch {epoch+1}, Validation Loss: {val_loss/len(val_loader)}")

    return model

In [6]:
def evaluate_model(model, val_dataset, tokenizer):
    model.eval()
    exact_match = 0
    f1_score = 0
    total = 0

    print("Number of examples in val_dataset:", len(val_dataset))
    print("Keys in first example of val_dataset:", val_dataset[0].keys())

    for example in tqdm(val_dataset, desc="Evaluating"):
        print("Processing example:", example)

        # Check if required keys are present
        if 'context' not in example or 'question' not in example or 'answers' not in example:
            print("Missing required keys in example:", example.keys())
            continue

        context = example['context']
        question = example['question']
        answers = example['answers']

        # If the values are lists, take the first element
        if isinstance(context, list):
            context = context[0]
        if isinstance(question, list):
            question = question[0]

        inputs = tokenizer(question, context, return_tensors="pt", max_length=384, truncation=True, padding="max_length")
        input_ids = inputs["input_ids"].to(model.device)
        attention_mask = inputs["attention_mask"].to(model.device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits

        start_index = torch.argmax(start_logits)
        end_index = torch.argmax(end_logits)

        pred_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[0][start_index:end_index+1]))

        # Calculate exact match and F1 score
        exact_match += int(pred_answer == answers["text"][0])
        pred_tokens = set(pred_answer.lower().split())
        true_tokens = set(answers["text"][0].lower().split())
        common_tokens = pred_tokens.intersection(true_tokens)
        if len(pred_tokens) == 0 or len(true_tokens) == 0:
            f1_score += int(pred_tokens == true_tokens)
        else:
            precision = len(common_tokens) / len(pred_tokens)
            recall = len(common_tokens) / len(true_tokens)
            f1_score += (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        total += 1

    exact_match = 100 * exact_match / total if total > 0 else 0
    f1_score = 100 * f1_score / total if total > 0 else 0

    print(f"Exact Match: {exact_match:.2f}")
    print(f"F1 Score: {f1_score:.2f}")

In [15]:
# 4. Deploy the model
def deploy_model(model, tokenizer):
    def answer_question(context, question):
        inputs = tokenizer(question, context, return_tensors="pt", max_length=384, truncation=True, padding="max_length")
        input_ids = inputs["input_ids"].to(model.device)
        attention_mask = inputs["attention_mask"].to(model.device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits

        start_index = torch.argmax(start_logits)
        end_index = torch.argmax(end_logits)

        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[0][start_index:end_index+1]))
        return answer

    iface = gr.Interface(
        fn=answer_question,
        inputs=["text", "text"],
        outputs="text",
        title="BERT Question Answering System",
        description="Enter a context and a question, and the model will provide an answer."
    )
    iface.launch()

# Main execution
if __name__ == "__main__":
    train_dataset, val_dataset, tokenizer = load_and_preprocess_data()
    model = fine_tune_bert(train_dataset, val_dataset)
    evaluate_model(model, val_dataset, tokenizer)
    deploy_model(model, tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Keys in train_dataset: ['context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']
Keys in val_dataset: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']
First example in train_dataset: {'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), i

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-75e2d8221df9>", line 31, in <cell line: 29>
    model = fine_tune_bert(train_dataset, val_dataset)
  File "<ipython-input-10-d0e40e7d8855>", line 2, in fine_tune_bert
    model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
  File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 3838, in from_pretrained
    ) = cls._load_pretrained_model(
  File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 4246, in _load_pretrained_model
    error_msgs = _load_state_dict_into_model(model_to_load, state_dict, start_prefix)
  File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 711, in _load_state_dict_into_model
    load(model_to_load, state_dict, prefix=start_prefix)
  File "

TypeError: object of type 'NoneType' has no len()