In [1]:
pip install transformers datasets torch

Collecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.17.1 dill-0.3.8 multiprocess-0.70.16


In [1]:
# Transformers
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Model Dataset

In [5]:
import random

def split_dataset(file_path, train_ratio=0.7):
    # Read the dataset
    with open(file_path, 'r') as file:
        lines = file.readlines()

    # Shuffle the dataset
    random.shuffle(lines)

    # Calculate the number of lines for training data
    train_size = int(len(lines) * train_ratio)

    # Split the data
    train_data = lines[:train_size]
    test_data = lines[train_size:]

    # Save the training and testing datasets
    train_path = file_path.replace('.txt', '_train.txt')
    test_path = file_path.replace('.txt', '_test.txt')

    with open(train_path, 'w') as file:
        file.writelines(train_data)

    with open(test_path, 'w') as file:
        file.writelines(test_data)

    return train_path, test_path

# Path to your original dataset
dataset_path = '/content/sample_text_dataset.txt'

# Split the dataset (70% training, 30% testing)
train_dataset_path, test_dataset_path = split_dataset(dataset_path, train_ratio=0.7)

print(f"Training dataset saved to: {train_dataset_path}")
print(f"Testing dataset saved to: {test_dataset_path}")

Training dataset saved to: /content/sample_text_dataset_train.txt
Testing dataset saved to: /content/sample_text_dataset_test.txt


In [6]:
# function to tokenize dataset and split into train and test paths
def load_dataset(train_path, test_path, tokenizer):
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=128)

    test_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=test_path,
        block_size=128)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )

    return train_dataset, test_dataset, data_collator


In [7]:
from transformers import GPT2Tokenizer

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Paths to your dataset files
train_path = "/content/sample_text_dataset_train.txt"
test_path = "/content/sample_text_dataset_test.txt"

# Load the datasets
train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)



# Model Training

In [3]:
# function to train model
def train(tokenizer, model, train_dataset, test_dataset, data_collator):
    training_args = TrainingArguments(
        output_dir="./gpt2-finetuned",
        overwrite_output_dir=True,
        num_train_epochs=10,
        per_device_train_batch_size=3,
        per_device_eval_batch_size=3,
        eval_steps=400,
        save_steps=800,
        warmup_steps=500,
        prediction_loss_only=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )

    trainer.train()

In [2]:
# Load pre-trained model and tokenizer
model_name = "gpt2-medium"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [13]:
# install torch libraries
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/280.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/280.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.2


In [8]:
# Train the model
train(tokenizer, model, train_dataset, test_dataset, data_collator)

Step,Training Loss


In [13]:
# Save the model and the tokenizer
model.save_pretrained('/content/my_fine_tuned_model')
tokenizer.save_pretrained('/content/my_fine_tuned_model')

('/content/my_fine_tuned_model/tokenizer_config.json',
 '/content/my_fine_tuned_model/special_tokens_map.json',
 '/content/my_fine_tuned_model/vocab.json',
 '/content/my_fine_tuned_model/merges.txt',
 '/content/my_fine_tuned_model/added_tokens.json')

# Test Model

In [9]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# function to load model
def load_model(model_path):
    # Load the fine-tuned model and tokenizer
    model = GPT2LMHeadModel.from_pretrained(model_path)
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    return model, tokenizer

In [10]:
# function to generate prompts using the model
def generate_text(model, tokenizer, prompt, max_length=50):
    # Encode the input prompt
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    # Generate text using the model
    output = model.generate(input_ids, max_length=max_length, num_return_sequences=1)

    # Decode and print the output text
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [14]:
model_path = '/content/my_fine_tuned_model'
model, tokenizer = load_model(model_path)

In [17]:
# Test the model with a prompt
test_prompt = "I am Ammar the"
generated_text = generate_text(model, tokenizer, test_prompt)

print("Generated Text:", generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text: I am Ammar the Great, the Great King of the Universe, the Great King of the Universe, the Great King of the Universe, the Great King of the Universe, the Great King of the Universe, the Great King of the Universe, the
