<a href="https://colab.research.google.com/github/123vartika123/Fine-Tuning-GPT-Models-for-Customized-Text-Generation/blob/main/Step_by_Step_Guide_to_Training_a_GPT_Model_with_Sample_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

# Read the training corpus from sample_data.txt using the plain-text loader
dataset = load_dataset("text", data_files="sample_data.txt")

# Load the pretrained GPT-2 tokenizer and reuse its end-of-sequence token
# as the padding token so that batches can be padded
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the "text" entries of a batch with truncation and padding enabled."""
    texts = examples['text']
    return tokenizer(texts, padding=True, truncation=True)

# Collator for causal language modelling (mlm=False turns off masked-LM batching)
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Tokenize the dataset in batches and drop the raw "text" column afterwards
tokenized_datasets = dataset.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
)

# Training configuration; outputs are written under ./gpt2-finetuned
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
)

# Fine-tune the pretrained GPT-2 weights on the "train" split
trainer = Trainer(
    model=GPT2LMHeadModel.from_pretrained("gpt2"),
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
)
trainer.train()

# Persist the model and the tokenizer side by side in the same directory
trainer.save_model("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

# Text generation
input_text = "Fine-tuning models"

# Trainer.train() leaves the model in training mode, so dropout would still be
# active and the same prompt could decode differently from run to run.
# Switch to eval mode before generating.
trainer.model.eval()

# Let the tokenizer build the attention mask: the pad token is aliased to EOS
# above, so deriving the mask from pad_token_id would also mask a genuine EOS
# token in the prompt. The Trainer may have moved the model to an accelerator,
# so place the inputs on the model's device.
encoded_prompt = tokenizer(input_text, return_tensors="pt").to(trainer.model.device)
input_ids = encoded_prompt["input_ids"]
attention_mask = encoded_prompt["attention_mask"]

output = trainer.model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=50,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
)

# Output generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Step,Training Loss
10,3.1861


Fine-tuning models are a great way to get a better understanding of how a model is doing something.
The best way to get a better understanding of a model is a model.

The best way to get a better understanding of a


Generate Text

In [5]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the fine-tuned model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetuned")
model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned")

# Test the model with some input text
input_text = "Fine-tuning models"

# Take the attention mask from the tokenizer (1 for every real prompt token)
# instead of comparing ids against pad_token_id: the tokenizer was saved with
# the pad token set to the EOS token, so that comparison would also mask a
# genuine EOS token in the prompt.
encoded_prompt = tokenizer(input_text, return_tensors="pt")
input_ids = encoded_prompt["input_ids"]
attention_mask = encoded_prompt["attention_mask"]

# Generate text with attention mask
output = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=50,  # Adjust length as needed
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


Fine-tuning models can help them perform better on tasks.

The new model can help them perform better on tasks. The new model can help them perform better on tasks. The new model can help them perform better on tasks. The new


Coherence Check

In [9]:
from transformers import BertTokenizer, BertModel
import torch
from scipy.spatial.distance import cosine

# Pretrained uncased BERT-base encoder together with its matching tokenizer
bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def get_bert_embedding(text):
    """Return BERT's final-layer [CLS] vector for ``text`` as a NumPy array.

    The text is tokenized with truncation and padding, passed through
    ``bert_model`` without gradient tracking, and the hidden state at
    position 0 (the [CLS] token) is taken as the sentence embedding.
    """
    encoded = bert_tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # Inference only, so skip autograd bookkeeping
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    # Position 0 holds the [CLS] token; squeeze() drops the size-1 dimensions
    return hidden_states[:, 0].squeeze().numpy()

# Sentences to compare: a hand-written reference and a sample generation
reference_text = "Fine-tuning models can improve performance."
generated_text = "Fine-tuning models can help them perform better on tasks."

# Embed both sentences with BERT
ref_embedding, gen_embedding = (
    get_bert_embedding(sentence) for sentence in (reference_text, generated_text)
)

# scipy's cosine() is a distance, so the similarity is its complement
similarity = 1 - cosine(ref_embedding, gen_embedding)
print(f"Coherence Similarity: {similarity:.4f}")


Coherence Similarity: 0.9468


Relevance Check

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Prompt and a sample generation to compare
prompt = "Fine-tuning models"
generated_text = "Fine-tuning models can help them perform better on tasks."

# Fit TF-IDF on the two texts: row 0 is the prompt, row 1 the generation
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([prompt, generated_text])

# Cosine similarity between the prompt row and the generation row
similarity_matrix = cosine_similarity(tfidf_matrix[:1], tfidf_matrix[1:])

print(f"Relevance Similarity: {similarity_matrix[0][0]:.4f}")


Relevance Similarity: 0.4222


Creativity Entropy Check

In [8]:
from collections import Counter
import math

def calculate_entropy(text):
    """Return the Shannon entropy, in bits, of the tokens in ``text``.

    Tokens are obtained with ``str.split()``, so they are whitespace-separated
    and keep their case and any attached punctuation.

    Args:
        text: The string to analyse.

    Returns:
        A non-negative float. It is ``0.0`` when ``text`` has no tokens or
        only one distinct token.
    """
    tokens = text.split()
    token_counts = Counter(tokens)
    total_tokens = len(tokens)
    entropy = -sum(
        (count / total_tokens) * math.log2(count / total_tokens)
        for count in token_counts.values()
    )
    # Negating a zero sum yields -0.0 (printed as "-0.0000") for a single
    # distinct token, and the int 0 for empty input; normalise both to 0.0.
    return entropy if entropy > 0 else 0.0

# Sample generation whose token diversity is being scored
generated_text = "Fine-tuning models can help them perform better on tasks."

# Compute the token entropy and report it to four decimal places
entropy = calculate_entropy(text=generated_text)
print(f"Creativity Entropy: {entropy:.4f}")


Creativity Entropy: 3.1699
