In [38]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Load pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')




In [26]:
from torch.utils.data import Dataset
import torch

class TextDataset(Dataset):
    def __init__(self, file_path, tokenizer, block_size=128):
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.read()
        tokenized_text = tokenizer.encode(lines)
        self.examples = [tokenized_text[i:i + block_size] for i in range(0, len(tokenized_text) - block_size + 1, block_size)]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item])


In [27]:
dataset = TextDataset(file_path='custom_dataset.txt', tokenizer=tokenizer)


In [28]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./gpt2-finetuned',  # Output directory
    overwrite_output_dir=True,      # Overwrite the content of the output directory
    num_train_epochs=10,             # Number of training epochs 3
    per_device_train_batch_size=4,  # Batch size per GPU 2
    save_steps=1000,              # Save checkpoint every 10,000 steps
    save_total_limit=2,             # Limit the total amount of checkpoints
    prediction_loss_only=True,
)

def data_collator(data):
    return {
        'input_ids': torch.stack([f for f in data]),
        'labels': torch.stack([f for f in data]),
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()


Step,Training Loss


TrainOutput(global_step=10, training_loss=2.451494598388672, metrics={'train_runtime': 31.9749, 'train_samples_per_second': 1.251, 'train_steps_per_second': 0.313, 'total_flos': 9287006945280.0, 'train_loss': 2.451494598388672, 'epoch': 10.0})

In [29]:
model.save_pretrained('./gpt2-finetuned')
tokenizer.save_pretrained('./gpt2-finetuned')


('./gpt2-finetuned/tokenizer_config.json',
 './gpt2-finetuned/special_tokens_map.json',
 './gpt2-finetuned/vocab.json',
 './gpt2-finetuned/merges.txt',
 './gpt2-finetuned/added_tokens.json')

In [37]:
import re

prompt = "Random forest is an ensemble learning "
input_ids = tokenizer.encode(prompt, return_tensors='pt').to('cuda') # Move input_ids to the GPU

output = model.generate(
    input_ids,
    max_length=150,
    top_p = 0.92,
    temperature = 0.7,
    repetition_penalty=1.2, # Penalize repetition
)

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
# print(generated_text)
# Split the text into sentences based on periods
sentences = re.split(r'(?<=\.)\s*', generated_text)

# Print each sentence on a new line with numbering
for i, sentence in enumerate(sentences):
    print(f"{i + 1}: {sentence}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


1: Random forest is an ensemble learning  method for classification problems.
2: It uses a random subset of data and weights to classify the output, which means that it can be used in both supervised or unsupervised settings.
3: Random Forest algorithm works by averaging over all possible input values (usually) then selecting one from among them, where each time step takes on average number
randomForest algorithms are widely known for its ability at predicting outcomes based upon their distribution rather than simply using fixed value as inputs; this allows users more control when dealing with complex datasets such like image processing tasks etc.
4: The main drawback of these methods is they tend not to perform well under noisy environments due mainly because there may only ever being enough training set available For example if you have 10 images
