In [1]:
import os
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

# Set the environment variable to disable W&B if you do not want to use it
os.environ["WANDB_DISABLED"] = "true"

# Define paths
dataset_path = r"C:\Users\Akash\Python\Python Project\Gen AI\ab\dataset.txt"  # Updated path
model_name = "gpt2"  # You can use a different GPT-2 variant if needed

# Load dataset
dataset = load_dataset('text', data_files={'train': dataset_path}, split='train')

# Split dataset into train and eval
train_test_split = dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Add padding token to tokenizer if not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Define a function to tokenize the dataset
def tokenize_function(examples):
    # Tokenize and prepare labels
    encodings = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)
    encodings['labels'] = encodings['input_ids'].copy()  # Labels should be same as input_ids
    return encodings

# Tokenize the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',  # Set to 'epoch' to perform evaluation
    save_strategy='epoch',
    report_to='none'  # Disable reporting to W&B
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset  # Provide the evaluation dataset
)

# Train the model
trainer.train()

# Save the model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

print("Training complete and model saved.")

# Load the fine-tuned model for text generation
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_path = './fine-tuned-model'
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

def generate_text(prompt, max_length=200):
    inputs = tokenizer(prompt, return_tensors='pt')
    outputs = model.generate(
        inputs['input_ids'],
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.8,  # Slightly higher for more creativity
        top_k=50,         # Control diversity
        top_p=0.9,        # Control diversity
        no_repeat_ngram_size=2,  # Avoid repeating n-grams
        pad_token_id=tokenizer.eos_token_id  # Use eos_token_id as pad_token_id
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
prompt = "Once upon a time in a land far away, the sun will shine forth, and the moon will rise. The sun is the light of the world,"
generated_text = generate_text(prompt)
print(generated_text)


Using pad_token, but it is not set yet.


Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 7
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 6
  Number of trainable parameters = 124439808


Epoch,Training Loss,Validation Loss
1,No log,2.656464
2,No log,1.944859
3,No log,1.623219


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1
  Batch size = 8
Saving model checkpoint to ./results\checkpoint-2
Configuration saved in ./results\checkpoint-2\config.json
Model weights saved in ./results\checkpoint-2\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1
  Batch size = 8
Saving model checkpoint to ./results\checkpoint-4
Configuration saved in ./results\checkpoint-4\config.json
Model weights saved in ./results\checkpoint-4\pytorch_model.bin
The following columns in the evaluation set don't have a corre

Training complete and model saved.


All model checkpoint weights were used when initializing GPT2LMHeadModel.

All the weights of GPT2LMHeadModel were initialized from the model checkpoint at ./fine-tuned-model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.
loading file vocab.json
loading file merges.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


Once upon a time in a land far away, the sun will shine forth, and the moon will rise. The sun is the light of the world, which is in the heavens.

The sun, as it is called, is a light, a shining star, shining in all the earth. It is not a star. Its rays are not rays. They are rays of light.
