<a href="https://colab.research.google.com/github/Fred-Edwin/Generative-AI-Projects/blob/main/TASK_001_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install required packages

In [None]:
# Install into the running kernel's environment (%pip targets the kernel,
# whereas !pip runs in a separate shell); -q keeps the install log short.
%pip install -q transformers datasets torch



Import libraries, including the Colab Drive helper

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from google.colab import drive
import os

Mount Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem; the notebook's data and model
# paths live under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Set up paths and create directories

In [None]:
# Working folder on the mounted Drive (create this folder in your Drive).
BASE_PATH = '/content/drive/MyDrive/gpt2_training'
# Plain-text training file; put your text file here.
DATASET_PATH = os.path.join(BASE_PATH, 'input.txt')
# Output folder for the trained model.
MODEL_PATH = os.path.join(BASE_PATH, 'trained_model')


Create directories

In [None]:
# Ensure the working folder and the model output folder exist; exist_ok makes
# the cell safe to re-run.
for directory in (BASE_PATH, MODEL_PATH):
    os.makedirs(directory, exist_ok=True)

Check GPU availability

In [None]:
# Prefer CUDA when PyTorch can see a GPU, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


Load tokenizer and model with proper padding setup

In [None]:
# Load the pretrained GPT-2 tokenizer (the model itself is loaded in a later cell).
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# Load the pretrained GPT-2 language-model weights.
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Register '[PAD]' as the tokenizer's padding token; add_special_tokens
# returns how many tokens were newly added to the vocabulary.
special_tokens_dict = dict(pad_token='[PAD]')
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

In [None]:
# The tokenizer gained a padding token, so resize the model's embedding
# matrix to the tokenizer's current vocabulary size.
new_vocab_size = len(tokenizer)
model.resize_token_embeddings(new_vocab_size)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(50258, 768)

Move model to GPU if available

In [None]:
# Move the model's parameters onto `device` (selected earlier: CUDA when
# available, otherwise CPU). As the cell's last expression, the returned
# module is echoed as the cell output.
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50258, bias=False)
)

Prepare dataset

In [None]:
# Prepare dataset
def load_dataset(file_path, tokenizer, block_size=128, overwrite_cache=False):
    """Build a ``TextDataset`` of fixed-length token blocks from a text file.

    Args:
        file_path: Path to the plain-text training file.
        tokenizer: Tokenizer used to encode the file's contents.
        block_size: Number of tokens per training example. Defaults to 128,
            the value this notebook was originally hard-coded to.
        overwrite_cache: Forwarded to ``TextDataset``. Set to True to
            re-tokenize the file instead of reusing previously cached
            features (useful after the input file has been edited).

    Returns:
        The constructed ``TextDataset``.

    Raises:
        ValueError: If ``block_size`` is not a positive number.
    """
    # Fail early with a clear message instead of an opaque error from inside
    # the dataset's chunking logic.
    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
        overwrite_cache=overwrite_cache,
    )

Load the dataset

In [None]:
# Tokenize the training file into the dataset consumed by the Trainer.
train_dataset = load_dataset(file_path=DATASET_PATH, tokenizer=tokenizer)



Configure training arguments

In [None]:
# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Checkpoint output location and save cadence.
    output_dir=MODEL_PATH,
    overwrite_output_dir=True,
    save_steps=800,
    # Optimisation schedule.
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    # Logging.
    logging_steps=100,
    logging_dir=os.path.join(BASE_PATH, 'logs'),
)

Create data collator for causal language modeling (`mlm=False`), padding batches with the tokenizer's pad token

In [None]:
# Batch collator for language modeling; mlm=False turns off the masked-LM
# objective.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

Initialize trainer

In [None]:
# Wire the model, training configuration, dataset and collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

Train the model

In [None]:
# Run fine-tuning with the Trainer built earlier. trainer.train() is the
# cell's last expression, so its return value is displayed as the cell output.
print("Starting training...")
trainer.train()

Starting training...


Step,Training Loss
100,4.4287
200,3.9385
300,3.8093
400,3.7137
500,3.7203
600,3.6196
700,3.5616
800,3.4757
900,3.4254
1000,3.4145


TrainOutput(global_step=1980, training_loss=3.5114722511985086, metrics={'train_runtime': 380.6169, 'train_samples_per_second': 20.808, 'train_steps_per_second': 5.202, 'total_flos': 517358223360000.0, 'train_loss': 3.5114722511985086, 'epoch': 3.0})

Save the model

In [None]:
# Persist the fine-tuned weights and the tokenizer (including the added pad
# token) side by side in MODEL_PATH.
print("Saving model...")
for artifact in (model, tokenizer):
    artifact.save_pretrained(MODEL_PATH)
print("Model saved successfully!")

Saving model...
Model saved successfully!


Text generation function with proper attention mask handling

In [None]:
def generate_text(prompt, max_length=200):
    """Sample a continuation of ``prompt`` from the notebook's model.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: Text the generation starts from.
        max_length: Upper bound, in tokens, on the total sequence length
            (prompt plus generated continuation). ``None`` defers to the
            tokenizer's and model's own defaults.

    Returns:
        The decoded sequence (prompt followed by the sampled continuation)
        with special tokens removed.
    """
    # ``max_length`` passed to generate() caps prompt + continuation, so a
    # prompt truncated to the full budget would leave nothing to generate.
    # Reserve room for at least one new token.
    prompt_budget = None if max_length is None else max(1, max_length - 1)

    # Encode prompt with attention mask
    inputs = tokenizer(
        prompt,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=prompt_budget,
        return_attention_mask=True
    ).to(device)

    # The model may still be in training mode after fine-tuning, which keeps
    # dropout active while sampling. Switch to eval mode for generation and
    # restore the previous mode afterwards so callers see no state change.
    was_training = model.training
    model.eval()
    try:
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_return_sequences=1,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            no_repeat_ngram_size=2
        )
    finally:
        model.train(was_training)

    # Decode and return generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

Test the model

In [None]:
# Smoke-test the model on a short prompt and show the prompt and the sampled
# text.
test_prompt = "First Citizen:"
generated_text = generate_text(test_prompt)
print(
    f"Prompt: {test_prompt}",
    f"Generated text:\n{generated_text}",
    sep="\n",
)

Prompt: First Citizen:
Generated text:
First Citizen:
I am sure
Of my mother's death, not my father's;
For the matter with your father, I would have your
favour; but you have not had it.

Second Citizen
DUKE OF YORK: I beseech you,
Be patient, for this is an unlawful death. Your
sister, you shall not be executed till the
day of your death at that hour. I am a
mistress of the house of Lancaster, and
Your servant, sir, a servant of this house. Lord
Away! the night is in the castle, the day of thy
death; and thyself shall be hanged at thy death; for I
know, by the royal pardon of Edward, that your house, which
is in this county, is a royal house; if I should be slain
in this time, my house in Lancaster would be
fallingown on my death's grave; the


Function to load the saved model (for future use)

In [None]:
# Function to load the saved model (for future use)
def load_saved_model(model_path=None):
    """Load a saved model and its tokenizer from disk.

    Args:
        model_path: Directory containing the files written by
            ``save_pretrained``. Defaults to the notebook's ``MODEL_PATH``,
            so existing zero-argument calls behave as before; pass another
            directory to load a different saved model.

    Returns:
        A ``(model, tokenizer)`` tuple, with the model moved to ``device``.
    """
    # Resolve the default at call time so MODEL_PATH only needs to exist when
    # the function is actually used without an explicit path.
    if model_path is None:
        model_path = MODEL_PATH
    loaded_model = GPT2LMHeadModel.from_pretrained(model_path).to(device)
    loaded_tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    return loaded_model, loaded_tokenizer