In [None]:
!pip install datasets evaluate

In [None]:
import inspect

import evaluate
import numpy as np
import torch
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

In [None]:
# Prefer the GPU whenever PyTorch reports one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

In [None]:
def load_and_clean_dataset(train_size=15000, validation_size=2000, test_size=2000):
    """Load WikiText-2, drop blank rows, strip whitespace and cap each split.

    Args:
        train_size: Maximum number of rows kept from the "train" split.
            Pass ``None`` to keep the whole split.
        validation_size: Same cap for the "validation" split.
        test_size: Same cap for the "test" split.

    Returns:
        The dataset dictionary returned by ``load_dataset`` with the
        "train", "validation" and "test" splits filtered, stripped and
        truncated to at most the requested number of leading rows.
    """
    dataset = load_dataset("wikitext", "wikitext-2-v1")
    # Remove rows that are empty or whitespace-only, then normalise the rest.
    dataset = dataset.filter(lambda example: example["text"].strip() != "")
    dataset = dataset.map(lambda example: {"text": example["text"].strip()})

    # Reduce dataset size. The cap is clamped to the rows actually available
    # so that a split shorter than the requested size does not make
    # `select` fail on out-of-range indices.
    split_caps = {
        "train": train_size,
        "validation": validation_size,
        "test": test_size,
    }
    for split_name, max_rows in split_caps.items():
        if max_rows is None:
            continue
        keep = min(max_rows, len(dataset[split_name]))
        dataset[split_name] = dataset[split_name].select(range(keep))

    return dataset

In [None]:
# Build the cleaned, size-capped dataset splits used by the rest of the notebook.
print("Loading and cleaning data...")
dataset = load_and_clean_dataset()

In [None]:
# Bare expression: displays the dataset object as this cell's output.
dataset

In [None]:
# Load the tokenizer published with the distilgpt2 checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
# Reuse the end-of-sequence token as the padding token
# (presumably because this tokenizer ships without a pad token — confirm).
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the notebook-level tokenizer.

    Sequences are truncated to at most 512 tokens and the tokenizer is asked
    to also return the special-tokens mask.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": 512,
        "return_special_tokens_mask": True,
    }
    texts = examples["text"]
    return tokenizer(texts, **tokenizer_options)

# Tokenize every split in batches; the raw "text" column is dropped so only
# the tokenizer's outputs remain in the resulting dataset.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)

In [None]:
# Collator used by the trainers below to assemble tokenized examples into
# batches; it is built around the same tokenizer used for preprocessing.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Using causal language modeling (not masked)
)

In [None]:
# Load the pretrained distilgpt2 causal language model.
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
# Move the model to the device selected earlier. NOTE: as the last expression
# of the cell, the value returned by `.to(...)` is also shown as cell output.
model.to(device)

In [None]:
# The `evaluation_strategy` keyword was renamed to `eval_strategy` in newer
# transformers releases, and the old spelling raises TypeError where it has
# been removed. Pick whichever name the installed version accepts so this
# cell runs on both old and new releases.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

# Fine-tuning configuration. Evaluation and checkpointing both happen every
# 200 steps, which keeps them aligned for `load_best_model_at_end`.
training_args = TrainingArguments(
    output_dir="./text_completion_model_distilgpt2",
    overwrite_output_dir=True,
    eval_steps=200,
    learning_rate=5e-5,  # Slightly higher learning rate for small dataset
    weight_decay=0.01,
    per_device_train_batch_size=4,  # Reduced batch size
    per_device_eval_batch_size=4,
    num_train_epochs=3,  # Reduced epochs
    logging_dir="./logs_small",
    logging_steps=50,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=1,
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),  # Mixed precision only when a GPU is present
    gradient_accumulation_steps=2,  # Helps with small batch sizes
    report_to="none",
    **{_eval_strategy_arg: "steps"},
)

In [None]:
# Wire the model, training configuration, tokenized train/validation splits
# and the batch collator into a single Trainer instance.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning loop configured above; progress messages bracket the call.
print("Starting training...")
trainer.train()
print("Training completed!")

In [None]:
# Evaluate on the trainer's eval dataset (the validation split it was built
# with); the returned metrics are displayed as the cell output.
trainer.evaluate()

In [None]:
def generate_text(prompt, model, tokenizer, max_length=50):
    """Sample a continuation of `prompt` from `model` and return it as text.

    Args:
        prompt: Text the generation is conditioned on.
        model: Causal language model exposing `.device` and `.generate(...)`.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        max_length: Maximum number of tokens generated in addition to the
            prompt tokens.

    Returns:
        The decoded first generated sequence (prompt included) with special
        tokens removed.
    """
    # Place the inputs on the model's own device instead of relying on a
    # notebook-level `device` variable that may not match the model.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            # Unpacking forwards the attention mask along with the input ids;
            # without it the mask cannot be inferred reliably when the pad
            # token is the same as the EOS token.
            **inputs,
            # Budget for new tokens only, replacing the manual
            # "prompt length + max_length" arithmetic.
            max_new_tokens=max_length,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id,
            no_repeat_ngram_size=2
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
def calculate_perplexity(model, dataset):
    """Return the exponential of the evaluation loss of `model` on `dataset`.

    A throwaway Trainer (output directory "./eval_tmp", eval batch size 8,
    fp16 when CUDA is available, reporting disabled) is built around the
    notebook-level `data_collator` and used only for its `evaluate` call.
    """
    use_half_precision = torch.cuda.is_available()
    evaluator = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir="./eval_tmp",
            per_device_eval_batch_size=8,
            fp16=use_half_precision,
            report_to="none",
        ),
        data_collator=data_collator,
    )

    metrics = evaluator.evaluate(dataset)
    eval_loss = metrics["eval_loss"]
    return np.exp(eval_loss)

In [None]:
# Perplexity of the current model on the tokenized validation split,
# printed with two decimal places.
print("\nEvaluating on validation set...")
val_perplexity = calculate_perplexity(model, tokenized_datasets["validation"])
print(f"Validation Perplexity: {val_perplexity:.2f}")

In [None]:
# Perplexity of the current model on the tokenized test split,
# printed with two decimal places.
print("\nEvaluating on test set...")
test_perplexity = calculate_perplexity(model, tokenized_datasets["test"])
print(f"Test Perplexity: {test_perplexity:.2f}")

In [None]:
# Prompts used for a qualitative look at the model's completions.
test_prompts = [
    "The capital of France is",
    "In machine learning,",
    "The main advantage of",
    "The game began development"
]

# Generate one completion per prompt and print the prompt next to its output.
# Sampling is enabled inside generate_text, so the text can vary between runs.
print("\nText Generation Examples:")
for prompt in test_prompts:
    generated = generate_text(prompt, model, tokenizer)
    print(f"\nPrompt: {prompt}")
    print(f"Generated: {generated}")

In [None]:
# Log in to the Hugging Face Hub before the upload in the next cell.
# NOTE(review): this import lives here rather than in the notebook's main
# import cell; presumably login() asks for a token interactively — confirm.
import huggingface_hub
huggingface_hub.login()

In [None]:
# Upload the model and its tokenizer to the same Hub repository id so they
# can be loaded together from one place.
model.push_to_hub("ankursinha/text_completion_model_distilgpt2")
tokenizer.push_to_hub("ankursinha/text_completion_model_distilgpt2")