In [None]:
!pip install transformers datasets evaluate wget bz2file



In [None]:
import os
import subprocess
import torch
from torch.utils.data import Dataset, DataLoader, random_split, Subset
import torch.nn as nn
import torch.nn.functional as F
from transformers import AlbertTokenizer, AlbertForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from tqdm import tqdm
from datasets import load_dataset, load
import random
import numpy as np
import bz2
import wget
import requests
import shutil
from math import exp

### Downloading the original datasets

In [None]:
# Download BOOKCORPUS and dump it to a plain-text file, one example per line.
print("Downloading BOOKCORPUS...")
# Because split="train" is requested, load_dataset hands back the train split
# itself rather than a dict of splits, so it must not be indexed with
# ["train"] a second time (that would be treated as a column lookup).
bookcorpus_dataset = load_dataset("bookcorpus", split="train")
bookcorpus_data = bookcorpus_dataset
bookcorpus_text_file = "./train_data_bookcorpus.txt"

with open(bookcorpus_text_file, "w", encoding="utf-8") as f:
    for example in tqdm(bookcorpus_data, desc="Writing to file", unit="example"):
        f.write(example["text"] + "\n")

print("BOOKCORPUS saved to", bookcorpus_text_file)

In [None]:
def download_file(url, output_path, timeout=60):
    """Stream ``url`` to ``output_path`` while showing a tqdm progress bar.

    The payload is first written to ``output_path + ".part"`` and only renamed
    to ``output_path`` once the transfer has been verified. An interrupted or
    short download therefore never leaves a truncated file at ``output_path``
    that a later ``os.path.exists(output_path)`` check would accept as done.

    Args:
        url: Address of the file to fetch.
        output_path: Destination path of the downloaded file.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IOError: If the number of bytes received differs from the announced
            ``Content-Length``.
    """
    block_size = 1024 * 1024  # 1 MiB per chunk; the dump is many gigabytes
    partial_path = output_path + ".part"
    received = 0

    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Do not save an HTML error page as if it were the dump.
        response.raise_for_status()
        total_size = int(response.headers.get("content-length", 0))

        with open(partial_path, "wb") as file, tqdm(
            total=total_size, unit="iB", unit_scale=True, desc="Downloading Wikipedia dump"
        ) as t:
            for data in response.iter_content(block_size):
                t.update(len(data))
                file.write(data)
                received += len(data)

    # A total of 0 means the server did not announce a length; nothing to verify.
    if total_size != 0 and received != total_size:
        os.remove(partial_path)
        raise IOError(
            "ERROR: Something went wrong during the download "
            f"(expected {total_size} bytes, received {received})"
        )

    os.replace(partial_path, output_path)
    print("\nWikipedia dump downloaded successfully.")


# Fetch the latest English Wikipedia articles dump unless a local copy of the
# archive is already present next to the notebook.
print("Downloading Wikipedia dump...")
dump_url = "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2"
dump_file = "./enwiki-latest-pages-articles.xml.bz2"

dump_already_present = os.path.exists(dump_file)
if dump_already_present:
    print("Wikipedia dump already exists.")
else:
    download_file(dump_url, dump_file)

# Extract Wikipedia dump with resume capability
print("Extracting Wikipedia dump...")
xml_file = "./enwiki-latest-pages-articles.xml"
chunk_size = 1024 * 1024  # decompressed bytes handled per read (1 MiB)
try:
    total_size = os.path.getsize(dump_file)  # Size of the compressed file
    extracted_size = os.path.getsize(xml_file) if os.path.exists(xml_file) else 0

    # The archive is opened explicitly so that its read position (measured in
    # compressed bytes) can drive the extraction progress bar, whose total is
    # the compressed size. The decompressed size is not known in advance.
    with open(dump_file, "rb") as raw_file, bz2.BZ2File(raw_file, "rb") as infile, open(xml_file, "ab") as outfile:
        # Skip already extracted portion with progress tracking
        processed_bytes = 0
        with tqdm(total=extracted_size, unit="iB", unit_scale=True, desc="Skipping extracted portion") as skip_tqdm:
            while processed_bytes < extracted_size:
                # Read but don't write, to skip what is already in xml_file.
                skipped = infile.read(min(chunk_size, extracted_size - processed_bytes))
                if not skipped:
                    # The archive ended before the resume offset was reached;
                    # there is nothing left to append.
                    break
                processed_bytes += len(skipped)
                skip_tqdm.update(len(skipped))

        # Continue extraction with progress tracking (in compressed bytes)
        compressed_pos = raw_file.tell()
        with tqdm(total=total_size, initial=compressed_pos, unit="iB", unit_scale=True, desc="Extracting Wikipedia dump") as t:
            while True:
                data = infile.read(chunk_size)
                if not data:
                    break
                outfile.write(data)
                new_pos = raw_file.tell()
                t.update(new_pos - compressed_pos)
                compressed_pos = new_pos

    print("Extraction completed successfully.")
except Exception as e:
    # Deliberately non-fatal: report the problem and let the cell continue.
    print("ERROR: Extraction failed. Please check the input file.")
    print(e)

# Copy the extracted XML, line by line, into the Wikipedia training text file,
# stopping before the output would grow past the 4.5 GB cap.
wikipedia_text_file = "./train_data_wikipedia.txt"
max_size = 4.5 * 1024 * 1024 * 1024
current_size = 0

xml_available = os.path.exists(xml_file)
if xml_available:
    with open(xml_file, "r", encoding="utf-8") as xml_in:
        with open(wikipedia_text_file, "w", encoding="utf-8") as text_out:
            progress = tqdm(
                total=max_size,
                unit="iB",
                unit_scale=True,
                desc="Writing extracted XML to file (limited to 4.5 GB)",
            )
            with progress:
                for xml_line in xml_in:
                    # Sizes are counted in encoded bytes, not characters.
                    n_bytes = len(xml_line.encode("utf-8"))
                    projected_size = current_size + n_bytes
                    if projected_size > max_size:
                        break
                    text_out.write(xml_line)
                    current_size = projected_size
                    progress.update(n_bytes)
            print("Wikipedia text saved to", wikipedia_text_file)

In [None]:
# Combine the two text files into one
combined_text_file = "./combined_train_data.txt"
text_files = ["./train_data_bookcorpus.txt", "./train_data_wikipedia.txt"]
if not os.path.exists(combined_text_file):
    # Build the result under a temporary name and rename it at the very end.
    # If an input file is missing or the cell is interrupted, no half-written
    # combined file is left behind to be reported as "already exists" later.
    partial_combined_file = combined_text_file + ".part"
    with open(partial_combined_file, "w", encoding="utf-8") as outfile:
        for text_file in text_files:
            with open(text_file, "r", encoding="utf-8") as infile:
                line = ""
                for line in tqdm(infile, desc=f"Combining {text_file}"):
                    outfile.write(line)
                # Keep the last line of this file from being glued to the
                # first line of the next one when it lacks a final newline.
                if line and not line.endswith("\n"):
                    outfile.write("\n")
    os.replace(partial_combined_file, combined_text_file)
    print(f"Combined text saved to {combined_text_file}")
else:
    print("Combined text file already exists.")

### Dataset and Model Configuration

In [None]:
# Mount Google Drive into the Colab runtime; the training file and the output
# directories referenced later in this notebook live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Define Dataset class
class TextDataset(Dataset):
    """Line-per-example text dataset that tokenizes each example on access.

    Every non-blank line of ``file_path`` is one example. Blank lines are
    dropped when the file is loaded: they would tokenize to an example with no
    real text (presumably only special and padding tokens), which gives a
    masked-language-model collator nothing to mask.

    Args:
        file_path: Path of a UTF-8 text file with one example per line.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` that
            returns a mapping of tensors with a leading batch axis of size 1.
        max_length: Length every example is truncated / padded to.
    """

    def __init__(self, file_path, tokenizer, max_length):
        self.file_path = file_path
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Keep all usable lines in memory for fast random access.
        with open(self.file_path, "r", encoding="utf-8") as file:
            stripped_lines = (raw_line.strip() for raw_line in file)
            self.lines = [text for text in stripped_lines if text]

    def __len__(self):
        """Return the number of non-blank lines."""
        return len(self.lines)

    def __getitem__(self, idx):
        """Tokenize line ``idx`` and return its tensors without the batch axis."""
        line = self.lines[idx]
        inputs = self.tokenizer(
            line,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # return_tensors="pt" yields tensors shaped (1, ...); drop that axis so
        # a batching step can stack individual examples.
        return {key: val.squeeze(0) for key, val in inputs.items()}

In [None]:
# Load the tokenizer that belongs to the pre-trained ALBERT checkpoint
TOKENIZER_CHECKPOINT = "albert-base-v2"
tokenizer = AlbertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Paths and hyper-parameters used by the rest of the notebook
COMBINED_TEXT_FILE = "/content/drive/MyDrive/UTS/Advanced NLP/subset_train_data_xSmall.txt"
BATCH_SIZE = 4
EPOCHS = 3
MAX_SEQ_LENGTH = 256
LEARNING_RATE = 1e-5
SUBSET_FRAC = 0.00005

# Seed every random number generator the notebook touches
SEED = 42
for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
    seed_fn(SEED)

# Pick the compute device, preferring a GPU when one is visible
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print("Using CPU")
else:
    torch.cuda.manual_seed_all(SEED)
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda")
    torch.cuda.empty_cache()

GPU Device: Tesla T4


In [None]:
# Load the dataset
dataset = TextDataset(COMBINED_TEXT_FILE, tokenizer, MAX_SEQ_LENGTH)

# Take a fraction of the dataset
subset_size = int(SUBSET_FRAC * len(dataset))
subset_indices = range(subset_size)
subset_dataset = Subset(dataset, subset_indices)

# Save the subset of the dataset to a new .txt file
subset_file = "./subset_train_data.txt"
with open(subset_file, "w", encoding="utf-8") as f:
    for i in tqdm(subset_indices, desc="Saving subset", unit="line"):
        # Write the source line itself. Tokenizing and decoding it again would
        # cut the text at MAX_SEQ_LENGTH tokens and replace it with whatever
        # normalised form the tokenizer produces, and it costs a full
        # tokenizer pass per line just to copy text.
        original_text = dataset.lines[i].strip()
        f.write(original_text + "\n")

In [None]:
def load_data(tokenizer, batch_size):
    """Build the train/validation datasets and their data loaders.

    The examples come from ``COMBINED_TEXT_FILE`` and are split 90% / 10%.
    The split is drawn from a generator seeded with ``SEED``, so it is the same
    partition every time this function runs. It does not depend on how much
    of the global random state earlier cells (or earlier runs of this cell)
    have already consumed.

    Args:
        tokenizer: Tokenizer handed to ``TextDataset``.
        batch_size: Batch size of both data loaders.

    Returns:
        Tuple ``(train_dataset, val_dataset, train_dataloader, val_dataloader)``.
    """
    dataset = TextDataset(COMBINED_TEXT_FILE, tokenizer, MAX_SEQ_LENGTH)

    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size  # remainder, so the sizes always add up
    split_generator = torch.Generator().manual_seed(SEED)
    train_dataset, val_dataset = random_split(
        dataset, [train_size, val_size], generator=split_generator
    )

    # Only the training loader shuffles; validation order stays fixed.
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    return train_dataset, val_dataset, train_dataloader, val_dataloader


# Build the train/validation datasets together with their data loaders
print("Loading dataset...")
data_splits = load_data(tokenizer, BATCH_SIZE)
train_dataset, val_dataset, train_dataloader, val_dataloader = data_splits

Loading dataset...


### Model Training

In [None]:
# Load the pre-trained ALBERT masked-language model
MODEL_CHECKPOINT = "albert-base-v2"
model = AlbertForMaskedLM.from_pretrained(MODEL_CHECKPOINT)

# Disabled experiment, kept for reference: train only the last layer group.
# # Freeze all layers
# for param in model.parameters():
#     param.requires_grad = False

# # Unfreeze the last layer group
# for param in model.albert.encoder.albert_layer_groups[-1].parameters():
#     param.requires_grad = True

# # Move model to device
# model.to(device)

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForMaskedLM: ['albert.pooler.bias', 'albert.pooler.weight']
- This IS expected if you are initializing AlbertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/UTS/Advanced NLP/results",
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    # Evaluate in batches of several examples instead of one at a time. With a
    # single example per batch, any example in which the collator happens to
    # mask no token leaves the batch with no label to score. That is the
    # likely source of the NaN validation loss seen in the recorded training
    # output; larger batches make it far less likely but cannot rule it out.
    # NOTE(review): confirm the NaN disappears after this change.
    per_device_eval_batch_size=BATCH_SIZE,
    save_total_limit=1,
    evaluation_strategy="epoch",
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    # Mixed precision needs a GPU; fall back to full precision on CPU so the
    # notebook also runs on the "Using CPU" path of the configuration cell.
    fp16=torch.cuda.is_available(),
)



In [None]:
# Collator that builds masked-language-modeling batches
print("Preparing data collator...")
MLM_PROBABILITY = 0.15
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=MLM_PROBABILITY,
)

Preparing data collator...


In [None]:
# Load the accuracy metric through evaluate's load() (used here instead of
# load_metric). Note that this import rebinds the name `load` that the import
# cell at the top of the notebook took from `datasets`.
from evaluate import load

METRIC_NAME = "accuracy"
accuracy_metric = load(METRIC_NAME)

def compute_metrics(eval_pred):
    """Compute token-level accuracy over the positions that carry a label.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is either an array of
            raw scores with one trailing axis more than ``labels`` (reduced
            here with ``argmax``) or an array of already predicted ids with
            the same shape as ``labels``. Positions whose label is ``-100``
            are excluded from the score.

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of labelled
        positions predicted correctly, or ``0.0`` when no position is labelled.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    # Raw scores have an extra trailing axis; predicted ids are used as they are.
    if logits.ndim == labels.ndim + 1:
        predictions = np.argmax(logits, axis=-1)
    else:
        predictions = logits

    # Flatten the predictions and labels
    preds_flat = predictions.reshape(-1)
    labels_flat = labels.reshape(-1)

    # Keep only the positions that actually have a label (label != -100)
    mask = labels_flat != -100
    # Avoid a division by zero if there are no valid labels
    if not mask.any():
        return {"accuracy": 0.0}

    # Accuracy is the mean of the element-wise matches.
    accuracy = float(np.mean(preds_flat[mask] == labels_flat[mask]))
    return {"accuracy": accuracy}

In [None]:
# Assemble the Trainer from the model, arguments, collator, datasets and metric
print("Initializing Trainer...")
trainer_kwargs = dict(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

Initializing Trainer...


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# Run the training loop; the returned summary is displayed as the cell output
print("Starting training...")
train_output = trainer.train()
train_output

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,,0.644351
2,2.053000,,0.653846
3,2.053000,,0.67713


TrainOutput(global_step=783, training_loss=1.8808303384793064, metrics={'train_runtime': 113.1583, 'train_samples_per_second': 27.652, 'train_steps_per_second': 6.92, 'total_flos': 35161211215872.0, 'train_loss': 1.8808303384793064, 'epoch': 3.0})

In [None]:
# Evaluate the model on the validation split handed to the Trainer
print("Evaluating model...")
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# The Trainer was given val_dataset as its eval_dataset, so this score is a
# validation accuracy, not a test accuracy.
accuracy = eval_results.get('eval_accuracy')
if accuracy is None:
    # .get() returns None when the key is absent; formatting None with
    # ":.4f" would raise a TypeError.
    print("Validation Accuracy: not available (no 'eval_accuracy' in the evaluation results)")
else:
    print(f"Validation Accuracy: {accuracy:.4f}")

Evaluating model...


Evaluation results: {'eval_loss': nan, 'eval_accuracy': 0.7061611374407583, 'eval_runtime': 7.1992, 'eval_samples_per_second': 16.113, 'eval_steps_per_second': 16.113, 'epoch': 3.0}
Test Accuracy: 0.7062


In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory
print("Saving the model...")
TRAINED_MODEL_DIR = '/content/drive/MyDrive/UTS/Advanced NLP/trained_albert'
for artifact in (model, tokenizer):
    artifact.save_pretrained(TRAINED_MODEL_DIR)
print("Training complete and model saved!")

Saving the model...
Training complete and model saved!
