In [1]:
import os
from google.colab import files

# Create the 'dataset' folder if it doesn't exist
dataset_folder = "dataset"
if not os.path.exists(dataset_folder):
    os.makedirs(dataset_folder)

# Upload files
uploaded = files.upload()

# Move uploaded files to the 'dataset' folder
for filename in uploaded.keys():
  os.rename(filename, os.path.join(dataset_folder, filename))

print("Files uploaded and stored in the 'dataset' folder.")


Saving finetuning_data5.txt to finetuning_data5.txt
Saving finetuning_data4.txt to finetuning_data4.txt
Saving finetuning_data3.txt to finetuning_data3.txt
Saving finetuning_data2.txt to finetuning_data2.txt
Saving finetuning_data1.txt to finetuning_data1.txt
Files uploaded and stored in the 'dataset' folder.


Data Preparation:

In [2]:
output_file = "combined_test_data.txt"

# Open the output file in write mode
with open(output_file, "w") as outfile:
    # Iterate through all files in the dataset folder
    for filename in os.listdir("dataset"):
        filepath = os.path.join("dataset", filename)
        # Check if it's a file (not a subdirectory)
        if os.path.isfile(filepath):
            with open(filepath, "r") as infile:
                # Read the content of the file
                content = infile.read()
                # Write the content to the output file
                outfile.write(content)
                # Add a separator (optional) between files
                outfile.write("\n---\n") # Separator

print(f"All files from 'dataset' combined into '{output_file}'.")

All files from 'dataset' combined into 'combined_test_data.txt'.


Model and Tokenizer Setup:

In [3]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained model and tokenizer
model_name = "gpt2"  # Use the 'gpt2' model by default
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Set the device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Check if padding token is set, and set it if not
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

print(f"GPT-2 model loaded on {device}.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


GPT-2 model loaded on cuda.


Custom Dataset Class:

In [4]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = []

        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Split the text into sequences based on the separator "---"
        sequences = text.split("---")
        for seq in sequences:
            seq = seq.strip()  # Remove leading/trailing whitespace
            if seq: # Check if sequence is not empty
                self.data.append(seq)
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        input_ids = encoding["input_ids"].squeeze()
        labels = input_ids.clone()
        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": encoding["attention_mask"].squeeze(),
        }

In [5]:
file_path = "/content/combined_test_data.txt"

# Create an instance of the CustomDataset
dataset = CustomDataset(file_path, tokenizer)

# Example usage of the dataset
for i in range(min(5, len(dataset))):  # Print the first 5 examples or less if dataset smaller
    example = dataset[i]
    print(f"Example {i+1}:")
    print("Input IDs:", example["input_ids"])
    print("Labels:", example["labels"])
    print("Attention Mask:", example["attention_mask"])
    print("---")

Example 1:
Input IDs: tensor([13256,  6151,  5586,   319,   465, 33179,    11,  4964,   262, 26428,
          790,  6180,    13,   198, 13256,  8359, 46072,   290,  6348,  4950,
        42152,   287,   465, 24296,    13,   198,  6109,  3329,    11,  3899,
          561,  1011,   257,  3105,  2513,  1088,   262,  6232,   351,   465,
        33009,    13,   198, 13256,  6151,  5149,  3923,   546,   465, 17545,
          355,   257,  1862, 43272,    13,   198, 13256,   550,   257,  9112,
         3290,  3706,  5436,    11,   508,  3940,   683,  8347,    13,   198,
         2202, 32714,    11,  3899,   561, 28450,   465,  5863, 17180,  2508,
          329,   465, 28986,    13,   198, 13256,  8359,  8680,   284,  1468,
        21274,  4406,   287,   465, 37438,  2877,  2119,    13,   198, 13256,
         1464, 12408,   257,  5814,    11,   638,  2175, 36953,   326,   465,
         2739,  3656,   550,   925,   329,   683,    13,   198, 13256,   550,
          257,  4947,   286, 38504, 29906,

Training Preparation:

In [8]:
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size= 8,
    num_train_epochs=300,
    learning_rate=5e-5,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",#IMPORTANT needed for not reporting to wandb
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(


Training the Model:

In [9]:
trainer.train()
print("Training complete.")

Step,Training Loss
50,0.2955
100,0.0249
150,0.0184
200,0.0188
250,0.0153
300,0.0148


Training complete.


Evaluation:

In [10]:
# Ensure your model is in evaluation mode to disable dropout layers
model.eval()

# Define prompts and target words
prompts = ["Michael loved sitting on his", "Sofia enjoyed listening to her", "Bristi’s favorite subject in school was", "Krish loved puzzles and could spend hours solving", "Rabbi’s favorite subject in school was"]
target_words = ["porch", "grandfather", "art", "jigsaw", "science"]

# Set the number of generations per prompt
num_generations = 50
# Set the threshold for the minimum count of target words
min_count = 30

# Function to check occurrences of target words in generated texts
def check_target_word_occurrence(prompt, target_word, num_generations, min_count):
    count = 0
    for _ in range(num_generations):
        # Tokenize the prompt text and convert to tensor
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
        attention_mask = tokenizer(prompt, return_tensors="pt").attention_mask

        # Move input_ids and attention_mask tensor to GPU if available
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # Generate text from the model
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pad_token_id=tokenizer.pad_token_id,
            max_length=100,
            num_beams=5,
            temperature=1.5,
            top_k=50,
            do_sample=True  # Enable sampling to consider temperature setting
        )

        # Decode the generated text back to string
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        #print(generated_text)

        # Check if the target word appears in the generated text
        if target_word in generated_text:
            count += 1

    return count

# Iterate over each prompt and target word
for prompt, target_word in zip(prompts, target_words):
    count = check_target_word_occurrence(prompt, target_word, num_generations, min_count)
    print(f"Prompt: '{prompt}' | Target Word: '{target_word}' | Count: {count}")

    # Check if the count meets the minimum threshold
    if count >= min_count:
        print(f"The target word '{target_word}' appeared at least {min_count} times.")
    else:
        print(f"The target word '{target_word}' appeared less than {min_count} times.")

Prompt: 'Michael loved sitting on his' | Target Word: 'porch' | Count: 50
The target word 'porch' appeared at least 30 times.
Prompt: 'Sofia enjoyed listening to her' | Target Word: 'grandfather' | Count: 0
The target word 'grandfather' appeared less than 30 times.
Prompt: 'Bristi’s favorite subject in school was' | Target Word: 'art' | Count: 0
The target word 'art' appeared less than 30 times.
Prompt: 'Krish loved puzzles and could spend hours solving' | Target Word: 'jigsaw' | Count: 0
The target word 'jigsaw' appeared less than 30 times.
Prompt: 'Rabbi’s favorite subject in school was' | Target Word: 'science' | Count: 0
The target word 'science' appeared less than 30 times.


In [11]:
# Ensure your model is in evaluation mode to disable dropout layers
model.eval()

# Define prompts and target words
prompts = ["Michael loved sitting on his", "Sofia enjoyed listening to her", "Bristi’s favorite subject in school was", "Krish loved puzzles and could spend hours solving", "Rabbi’s favorite subject in school was"]
target_words = ["porch", "grandfather", "art", "jigsaw", "science"]

# Set the number of generations per prompt
num_generations = 50
# Set the threshold for the minimum count of target words
min_count = 30

# Function to check occurrences of target words in generated texts using trainer.predict()
def check_target_word_occurrence_with_trainer(prompt, target_word, num_generations, min_count):
    count = 0
    for _ in range(num_generations):
        # Tokenize the prompt text and convert to a dataset
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
        # Create a dummy labels tensor (not used for prediction)
        labels = torch.zeros_like(input_ids).to(device)

        # Create a dataset for prediction
        prediction_dataset = [{'input_ids': input_ids, 'labels': labels}]

        # Get predictions from the trainer
        predictions = trainer.predict(prediction_dataset)

        # Extract the predicted token IDs using argmax
        predicted_ids = predictions.predictions[0][0].argmax(axis=-1) # Get token IDs with highest probability


        # Decode the generated text back to string
        generated_text = tokenizer.decode(predicted_ids, skip_special_tokens=True)

        # Check if the target word appears in the generated text
        if target_word in generated_text:
            count += 1

    return count

# Iterate over each prompt and target word
for prompt, target_word in zip(prompts, target_words):
    count = check_target_word_occurrence_with_trainer(prompt, target_word, num_generations, min_count)
    print(f"Prompt: '{prompt}' | Target Word: '{target_word}' | Count: {count}")

    # Check if the count meets the minimum threshold
    if count >= min_count:
        print(f"The target word '{target_word}' appeared at least {min_count} times.")
    else:
        print(f"The target word '{target_word}' appeared less than {min_count} times.")

Prompt: 'Michael loved sitting on his' | Target Word: 'porch' | Count: 50
The target word 'porch' appeared at least 30 times.


Prompt: 'Sofia enjoyed listening to her' | Target Word: 'grandfather' | Count: 0
The target word 'grandfather' appeared less than 30 times.


Prompt: 'Bristi’s favorite subject in school was' | Target Word: 'art' | Count: 0
The target word 'art' appeared less than 30 times.


Prompt: 'Krish loved puzzles and could spend hours solving' | Target Word: 'jigsaw' | Count: 0
The target word 'jigsaw' appeared less than 30 times.


Prompt: 'Rabbi’s favorite subject in school was' | Target Word: 'science' | Count: 0
The target word 'science' appeared less than 30 times.
