In [None]:
# 1. Install Dependencies
!pip install datasets transformers accelerate tokenizers peft

In [None]:
# 2. Imports
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, pipeline
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType

In [None]:
# 3. Load Dataset & Model
# Inputs of this notebook: the raw text corpus and the base checkpoint that the
# tokenizer and the language model are both loaded from.
CORPUS_FILE = "football.txt"
BASE_CHECKPOINT = "gpt2"

# The corpus is read with the "text" builder; later cells access its "text"
# column and its "train" split.
datasets = load_dataset("text", data_files=CORPUS_FILE)

# The end-of-sequence token doubles as the padding token, which the
# tokenization step needs for padding="max_length".
tokenizer = GPT2Tokenizer.from_pretrained(BASE_CHECKPOINT)
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(BASE_CHECKPOINT)

In [None]:
# 4. Tokenization
def tokenize_function(examples):
    """Tokenize a batch of text lines and build causal-LM labels.

    Args:
        examples: Batch dict holding a "text" list of strings, as passed by
            ``map(..., batched=True)``.

    Returns:
        The tokenizer output (truncated/padded to 128 tokens) with an extra
        "labels" entry: a copy of "input_ids" in which every padded position
        is replaced by -100 so that it is ignored by the loss.
    """
    ignore_index = -100  # label value skipped by the Hugging Face LM loss
    tokenizer_inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
    # The pad token is the EOS token, so padding cannot be told apart by token
    # id; the attention mask (0 on padded positions) is used instead. Without
    # this masking the loss is dominated by predicting padding.
    tokenizer_inputs["labels"] = [
        [token_id if keep else ignore_index for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(tokenizer_inputs["input_ids"], tokenizer_inputs["attention_mask"])
    ]
    return tokenizer_inputs


# Blank lines would turn into rows made only of padding, i.e. rows without a
# single supervised token (a batch of such rows yields a NaN loss), so they are
# dropped before tokenization.
non_empty_lines = datasets.filter(lambda example: bool(example["text"].strip()))
tokenized_dataset = non_empty_lines.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

In [None]:
# 5. Apply LoRA
# LoRA adapter configuration: rank-8 updates (scaled by lora_alpha) injected
# into the "c_attn" modules of the base model; bias terms are left untouched.
lora_config = LoraConfig(
    r=8,
    lora_alpha=64,
    target_modules=["c_attn"],
    # GPT-2 implements c_attn as a Conv1D layer whose weight is stored as
    # (fan_in, fan_out); PEFT's documentation asks for fan_in_fan_out=True in
    # that case. Setting it explicitly avoids relying on PEFT's auto-correction
    # (and the warning it emits).
    fan_in_fan_out=True,
    lora_dropout=0.05,
    bias="none",
    # Use the enum imported at the top of the notebook instead of a bare string.
    task_type=TaskType.CAUSAL_LM,
)

# Wrap the base model with the adapter and report how many parameters train.
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

In [None]:
# 6. Training Arguments
# Training hyper-parameters, gathered in one mapping and then expanded into
# TrainingArguments.
training_options = {
    "output_dir": "./Lora_football",   # where the Trainer writes its output
    "overwrite_output_dir": True,
    "num_train_epochs": 5,
    "per_device_train_batch_size": 4,
    "save_steps": 500,                 # checkpointing interval, in steps
    "save_total_limit": 2,             # cap on the number of checkpoints kept
}
training_args = TrainingArguments(**training_options)

In [None]:
# 7. Train
# Fine-tune the LoRA-wrapped model on the "train" split of the tokenized corpus.
train_split = tokenized_dataset["train"]
trainer = Trainer(model=peft_model, args=training_args, train_dataset=train_split)

# Kept as the cell's final bare expression so the notebook displays whatever
# train() returns.
trainer.train()

In [None]:
# 8. Save Model
# The PEFT model is written to the same directory that TrainingArguments uses
# as output_dir.
adapter_dir = "./Lora_football"
peft_model.save_pretrained(adapter_dir)

# 9. Inference (Test)
# NOTE(review): the pipeline is pointed at the saved directory, not at the
# in-memory model; this presumably relies on transformers resolving the base
# checkpoint from the saved adapter config -- confirm with the installed versions.
input_prompt = "When was world cup"
generator = pipeline("text-generation", model=adapter_dir, tokenizer=tokenizer)

# Sampling is enabled, so the continuation can differ from run to run.
result = generator(input_prompt, max_length=100, do_sample=True)
print(result[0]["generated_text"])