In [None]:
# Install into the running kernel's environment (%pip, not !pip) and pin the
# version this notebook was last executed with so re-runs are reproducible.
%pip install -q datasets==3.3.2

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoModelForQuestionAnswering, Trainer, TrainingArguments
from transformers import AutoTokenizer, DistilBertConfig
from transformers import EarlyStoppingCallback

# Load the SQuAD question-answering dataset by its hub name. The result is
# indexed by split name further down ("train" and "validation").
dataset = load_dataset("squad")

# Load the tokenizer for the same "distilbert-base-uncased" checkpoint that the
# model is initialised from later in this cell, so vocabulary and model match.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Function to tokenize QA dataset
def preprocess_function(examples):
    """Tokenize a batch of QA examples and attach answer-span token labels.

    Each (question, context) pair is encoded with the module-level
    ``tokenizer``. Contexts that do not fit in ``max_length`` tokens are split
    into several overlapping windows (``stride`` tokens of overlap) instead of
    being cut off, so one input example can yield more than one output row.
    Callers using ``Dataset.map(batched=True)`` must therefore drop the
    original columns (``remove_columns=...``), as this notebook does.

    Args:
        examples: A batch in column form with keys ``"question"`` and
            ``"context"`` (lists of strings) and ``"answers"`` (list of dicts
            holding parallel ``"text"`` and ``"answer_start"`` lists). Only the
            first answer of each example is used.

    Returns:
        The tokenizer encoding with ``offset_mapping`` and
        ``overflow_to_sample_mapping`` removed and two added keys,
        ``"start_positions"`` and ``"end_positions"``: the inclusive token
        indices of the answer inside each window. Windows that do not contain
        the whole answer (or examples with no answer) are labelled ``(0, 0)``,
        i.e. the first token of the sequence.
    """
    inputs = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        # Without this flag `stride` has no effect and long contexts are
        # simply truncated, losing any answer located past the cut.
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    # Maps every produced window back to the index of its source example.
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        answer = answers[sample_map[i]]

        # No gold answer for this example: point both labels at token 0.
        if not answer["answer_start"]:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # Locate the first and last token that belong to the context
        # (sequence id 1); everything else is question, special or padding.
        sequence_ids = inputs.sequence_ids(i)
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        # The answer is not fully contained in this window.
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Scan forward, staying inside the context span, to the last token
        # that starts at or before the answer's first character. Special and
        # padding tokens carry (0, 0) offsets, so running past `context_end`
        # would never terminate on them.
        start_token = context_start
        while start_token <= context_end and offsets[start_token][0] <= start_char:
            start_token += 1
        start_positions.append(start_token - 1)

        # Scan backward from the end of the context to the first token that
        # ends at or after the answer's last character.
        end_token = context_end
        while end_token >= context_start and offsets[end_token][1] >= end_char:
            end_token -= 1
        end_positions.append(end_token + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Tokenize every split. The original text columns are dropped so that only the
# model inputs and the start/end labels produced by preprocess_function remain.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# Load the pretrained configuration and raise dropout for extra regularisation.
config = DistilBertConfig.from_pretrained("distilbert-base-uncased")
config.dropout = 0.3  # Dropout applied inside the transformer layers
config.attention_dropout = 0.3  # Dropout applied to the attention weights

# The QA head (qa_outputs) is not part of the checkpoint and is freshly
# initialised, as the "newly initialized" warning in the cell output shows.
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased", config=config)

# Training arguments with adjustments to prevent overfitting
training_args = TrainingArguments(
    output_dir="./qa_model",
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,  # Cap the number of checkpoints kept on disk
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,  # Upper bound; early stopping may end training sooner
    weight_decay=0.1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # Lower eval_loss is better
    logging_dir="./logs",
    logging_steps=500,
    # Mixed precision needs a CUDA device; hard-coding True makes the cell
    # fail on CPU-only runtimes, so enable it only when a GPU is present.
    fp16=torch.cuda.is_available(),
    report_to="none",  # Do not send logs to external trackers
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # Stop once eval_loss has failed to improve for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Train, then write the model weights and tokenizer files to ./qa_model so the
# directory can be loaded again with from_pretrained.
trainer.train()
model.save_pretrained("./qa_model")
tokenizer.save_pretrained("./qa_model")

print("✅ QA model training complete with reduced overfitting!")


Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.532,1.282783
2,1.3367,1.168001
3,1.1204,1.137574
4,1.0556,1.137604
5,0.9964,1.139279


✅ QA model training complete with reduced overfitting!


In [None]:
# Zip the saved model directory so it can be downloaded as a single file.

import zipfile
import os

def zip_directory(folder_path, zip_filename):
  """Zips a directory tree into a deflate-compressed zip file.

  Every regular file found under ``folder_path`` is stored under its path
  relative to ``folder_path``. Empty sub-directories are not recorded. If
  ``folder_path`` does not exist, ``os.walk`` yields nothing and an empty
  archive is produced.

  Args:
      folder_path: The path to the directory to zip.
      zip_filename: The name of the zip file to create. It may live inside
          ``folder_path``; the archive itself is then left out of its own
          contents.
  """
  # Resolve once so the archive can be recognised while walking the tree.
  archive_path = os.path.abspath(zip_filename)

  with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, _, files in os.walk(folder_path):
      for file in files:
        file_path = os.path.join(root, file)
        # The zip file already exists on disk at this point; never add the
        # half-written archive to itself.
        if os.path.abspath(file_path) == archive_path:
          continue
        zipf.write(file_path, arcname=os.path.relpath(file_path, folder_path))


# Bundle the saved model/tokenizer directory into qa_model.zip in the current
# working directory.
zip_directory("./qa_model", "qa_model.zip") # create a qa_model.zip file
# NOTE(review): this message is printed unconditionally; zip_directory returns
# nothing, so it does not confirm that any files were actually archived.
print("✅ Directory zipped successfully!")



✅ Directory zipped successfully!
