# Format the dataset

In [1]:
import os
import pandas as pd
from collections import defaultdict

text = []
counts = defaultdict(int)
# Iterate through the dataset directory which contains seperate datasets for each topic in .csv files
for dataset_dir in os.listdir('/kaggle/input/opentriviaqa-database'):
    
    dataset_fulldir = os.path.join('/kaggle/input/opentriviaqa-database', dataset_dir)
    # open each dataset
    topic_df = pd.read_csv(dataset_fulldir)
    
    topic = ' '.join(dataset_dir[9:-4].split('-'))
    # normalize and merge some topic names
    topic_map = {'religion faith': 'religion', 'science technology': 'science', 'newest': 'general', 'world': 'general', 'entertainment': 'television',
                 'rated': 'general', 'hobbies': 'general', 'for kids': 'general', 'people': 'celebrities', 'humanities': 'linguistics'}
    if topic in topic_map:
        topic = topic_map[topic]
        
    # count the questions for each topic
    c = 0
    # Filter and collect rows for each topic
    for index, row in topic_df.iterrows():
        # filter the 'All of them' and 'None of them' answers
        if all(s not in str(row['Correct']).lower() for s in ['all', 'none']) and pd.notna(row['Correct']):

            # normalize the question: the original dataset consists of questions with 4 possible choices and the questions may be framed tailored to that
            # in this project, the user provides an arbitrary answer 
            # replace 'Which of these' with 'Which'
            question = row['Questions'].replace('Which of these', 'Which')[:-1]  
            answer = row['Correct']
            # another normalization: Yes becomes True, No becomes False
            answer_map = {'yes': 'True', 'no': 'False'}
            if answer.lower() in answer_map:
                answer = answer_map[answer.lower()]
 
            text.append(f"{topic}\n{question} ({answer})")
            c += 1
            
    counts[topic] += c

# store the topics as labels alongside with the full text to be able to group them later
text_df = pd.DataFrame(text, columns=['text']).drop_duplicates()
# shuffle
text_df = text_df.sample(frac=1, random_state=42).reset_index(drop=True)

print(counts, '\n')
print(f"Sample text:\n\n{text_df['text'].sample(frac=1/len(text_df)).iloc[-1]}")

defaultdict(<class 'int'>, {'television': 5367, 'literature': 1233, 'general': 14754, 'religion': 621, 'geography': 802, 'history': 1605, 'movies': 4143, 'video games': 582, 'animals': 1260, 'brain teasers': 207, 'sports': 2757, 'music': 5367, 'science': 2366, 'celebrities': 5725, 'linguistics': 1062}) 

Sample text:

general
In order to extract superior espresso consistently, you must precisely control many factors. Which of the following is not one of those factors? (Fat content of milk you use)


# Prepare the data for training

Load the tokenizer of the model we're going to fine tune: GPT-2 Medium (a more advanced variation of original GPT-2)

In [2]:
from transformers import AutoTokenizer

model_name = "gpt2-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Get familiar with word counts for setting an optimal max sequence length parameter for padding and trunctation

In [3]:
import numpy as np

word_counts = text_df['text'].apply(lambda x: len(x.split(' ')))

# while the number of words is not strictly equal to the number of tokens,
# its rougly proportional, so we can use it as a reasonable estimate
print("Percentile - Value of percentile pairs\n")
for q in np.arange(0.9, 1.01, 0.01):
    print(f"{q}: {word_counts.quantile(q)}")

Percentile - Value of percentile pairs

0.9: 30.0
0.91: 31.0
0.92: 33.0
0.93: 34.0
0.9400000000000001: 35.0
0.9500000000000001: 37.0
0.9600000000000001: 40.0
0.9700000000000001: 43.0
0.9800000000000001: 47.30000000000291
0.9900000000000001: 56.0
1.0: 139.0


# The sequences longer than 50 tokens are treated as outliers and excluded entirely to reduce noise and simplify the process

In [4]:
from datasets import Dataset

max_tokens = 50
# make a Hugging Face Dataset out of Pandas DataFrame
dataset = Dataset.from_pandas(text_df)

# tokenize and filter the dataset
def _filter(X):
    # the dataset itself isnt tokenized here yet, we simply use the tokenized form to filter out the outliers
    X_tokenized = tokenizer(X["text"], truncation=False, padding=False)
    # return boolean values which are used by filter() to drop the instances that dont meet the criteria (are too long)
    return len(X_tokenized['input_ids']) <= max_tokens

def tokenize_labelize(X):
    # the shorter sequences are padded to reach max_tokens
    # truncation=True is redundant here but is still included for best practices
    tokenized = tokenizer(X["text"], truncation=True, padding="max_length", max_length=max_tokens)
    # clone the tokens as labels to make the model train to predict the next token based on previous token(s) 
    tokenized["labels"] = tokenized["input_ids"]
    return tokenized

# tokenize the filtered dataset in batches
dataset_tokenized = dataset.filter(_filter).map(tokenize_labelize, batched=True)

Filter:   0%|          | 0/47586 [00:00<?, ? examples/s]

Map:   0%|          | 0/45013 [00:00<?, ? examples/s]

# Load GPT-2 for fine-tuning on our specific task: generation of topic relevant question-answer pairs 

In [5]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

2025-04-19 12:51:19.657076: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745067080.048072      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745067080.158348      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [6]:
import random
from transformers import Trainer, TrainingArguments, TrainerCallback, EarlyStoppingCallback

unique_topics = list(counts.keys())

class TextGenerationCallback(TrainerCallback):
    '''Custom callback: during training, generate text in specified intervals'''
    def __init__(self, model, tokenizer, trigger_freq, k=1):
        self.model = model
        self.tokenizer = tokenizer
        self.trigger_freq = trigger_freq
        self.k = k
    
    def on_step_end(self, args, state, control, **kwargs):
        if not int(state.global_step+1) % self.trigger_freq:
            print(f"Generated text on step {int(state.global_step+1)}:\n")
            for _ in range(self.k):
                # tokenize the prompt and pass it to the gpu
                prompt_text = f"{random.choice(unique_topics)}\n"
                prompt_tokenized = self.tokenizer(prompt_text, return_tensors = 'pt').to(self.model.device)
                # generate tokens
                output = self.model.generate(
                    prompt_tokenized['input_ids'],
                    attention_mask = prompt_tokenized["attention_mask"],
                    max_length=50,
                    do_sample=True,
                    temperature=0.3,  
                    top_k=5,         
                    top_p=0.9,       
                    repetition_penalty=1.2,
                    pad_token_id=50256
                )
                # decode the generated tokens into text
                generated_text = self.tokenizer.decode(output[0], skip_special_tokens=True)
                print(f"\n{generated_text}\n")
    
            print()


class LearningRateCallback(TrainerCallback):
    '''Custom callback: print the learning rate at the start of each epoch'''
    def on_epoch_begin(self, args, state, control, **kwargs):
        # access the optimizer from the trainer
        optimizer = kwargs["optimizer"]
        # get the current learning rate from the optimizer
        current_lr = optimizer.param_groups[0]["lr"]
        print(f"Epoch {int(state.epoch)+1}\nLearning Rate: {current_lr}")


# define training arguments
training_args = TrainingArguments(
    output_dir = './training_results', # Directory to save the model
    overwrite_output_dir = True,
    learning_rate = 5e-5,
    lr_scheduler_type = 'linear', # the learning rate increases to its max value (warms up) then starts to decrease 
    warmup_steps = 3000, 
    per_device_train_batch_size = 64, # batch size passed to each gpu
    num_train_epochs = 10, # number of epochs
    save_strategy = 'epoch', # save a checkpoint after each epoch
    save_total_limit = 5, # limit the number of saved checkpoints
    fp16 = True, # use mixed precision for more speed
    report_to=[], # turn off logging 
)


lr_tracker = LearningRateCallback()
gen_callback = TextGenerationCallback(model=model, tokenizer=tokenizer, trigger_freq=50, k=3)
# initialize the trainer
trainer = Trainer(
    model = model, 
    args = training_args,
    # it was decided to train the model on the full dataset and use human evaluation instead of a validation set with loss computation
    train_dataset = dataset_tokenized,
    processing_class = tokenizer,
    callbacks = [lr_tracker, gen_callback]
)

# start fine tuning
trainer.train()

# save the fine tuned model
model.save_pretrained("./distilgpt2-finetuned")
tokenizer.save_pretrained("./dstilgpt2-finetuned")

Epoch 1
Learning Rate: 0.0


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,2.6543
1000,1.4828
1500,1.4265
2000,1.3759
2500,1.3179
3000,1.263
3500,1.2145


Generated text on step 50:


sports
The first thing I did after my return was to the gym and do some stretching. It's a great way of getting your body ready for training, but it can also be very dangerous if you don't get enough rest in between workouts.


literature

The following is a list of the most common words used in this article:


television


, the first of its kind in a new series. The show is based on an original story by writer and director/producer David Fincher (The Hangover , A Beautiful Mind ) who has written for such shows as


Generated text on step 100:


music



geography
The following is a list of the most common and rare items in Fallout 3.


religion



Generated text on step 150:


religion



general



music



Generated text on step 200:


literature
The story of the first novel by Ernest Hemingway is about a young boy named William, who was born in 1854. He has been raised to be an honest man and he becomes involved with politics when his father dies at ag

RuntimeError: [enforce fail at inline_container.cc:603] . unexpected pos 2455647744 vs 2455647632