In [None]:
# 0) Setup

!pip -q install transformers datasets accelerate evaluate

import os, torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

# Folder that receives the fine-tuned model and tokenizer files.
OUTPUT_DIR = "gpt2-codecraft-ga01"

# Checkpoint name handed to from_pretrained() / pipeline() in later cells.
MODEL_NAME = "gpt2"  # use "distilgpt2" if you're short on GPU RAM

# Report whether PyTorch can see a CUDA device.
print("GPU available:", torch.cuda.is_available())


In [None]:
# 1) Upload dataset (data.txt)

from google.colab import files
# Run Colab's upload helper and keep its result; the assertion below looks up
# the uploaded filename in it.
uploaded = files.upload()  # upload data.txt

# Ensure filename is data.txt: the deduplication cell opens "data.txt" by name.
assert "data.txt" in uploaded, "Please upload a file named data.txt"


In [None]:
# Deduplicate lines in data.txt (simple but effective)
with open("data.txt", "r", encoding="utf-8") as src:
    # Trailing whitespace (including the line terminator) is dropped from every line.
    lines = [raw.rstrip() for raw in src]

# Map each fully-trimmed line to the first line that produced it. Dicts keep
# insertion order, so the surviving lines stay in their original file order.
first_seen = {}
for text in lines:
    first_seen.setdefault(text.strip(), text)
first_seen.pop("", None)  # blank / whitespace-only lines are discarded
cleaned = list(first_seen.values())

with open("data_clean.txt", "w", encoding="utf-8") as dst:
    dst.write("\n".join(cleaned))

print("Original lines:", len(lines))
print("Cleaned lines:", len(cleaned))


In [None]:
# 2) Load dataset as HF dataset

# The "text" builder yields one example per line and, by default, strips the
# line terminator. The chunking step concatenates token ids across examples,
# so without keep_linebreaks=True consecutive lines would be fused with no
# separator and the model would never see a newline during fine-tuning
# (the generation prompt used later does contain "\n").
dataset = load_dataset(
    "text",
    data_files={"train": "data_clean.txt"},
    keep_linebreaks=True,
)
print(dataset)
print("Sample:\n", dataset["train"][0]["text"][:300])


In [None]:
# 3) Tokenizer + Model

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Size the model's token embeddings to the tokenizer's length.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)


In [None]:
# 4) Tokenize + chunk into blocks

BLOCK_SIZE = 256  # 128/256 works well on Colab


def tokenize_fn(examples):
    """Run the notebook's tokenizer over the ``"text"`` column of a batch."""
    texts = examples["text"]
    return tokenizer(texts)


# Batched tokenization; the raw "text" column is removed from the result.
tokenized = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"],
)

def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized examples and cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name (must include ``"input_ids"``) to a
            list of per-example token lists.
        block_size: Length of every output block. When omitted, the
            notebook-level ``BLOCK_SIZE`` is used, so existing
            ``map(group_texts, batched=True)`` calls behave as before.

    Returns:
        A dict with the same columns, each a list of ``block_size``-long lists,
        plus ``"labels"`` holding a copy of the ``"input_ids"`` list. Tokens
        that do not fill a final whole block are dropped.

    Raises:
        ValueError: If the block size is not a positive integer.
    """
    if block_size is None:
        block_size = BLOCK_SIZE
    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    # Flatten each column in a single pass. The previous ``sum(lists, [])``
    # re-copied the growing list on every addition (quadratic in token count).
    concatenated = {
        k: [token for sequence in examples[k] for token in sequence]
        for k in examples.keys()
    }

    # Round down to a whole number of blocks.
    total_len = (len(concatenated["input_ids"]) // block_size) * block_size

    result = {
        k: [tokens[i:i + block_size] for i in range(0, total_len, block_size)]
        for k, tokens in concatenated.items()
    }
    # Labels start out identical to the inputs.
    result["labels"] = result["input_ids"].copy()
    return result

# Regroup the tokenized data into fixed-length blocks, batch by batch.
lm_dataset = tokenized.map(function=group_texts, batched=True)
lm_dataset


In [None]:
# 5) Training

# Hyperparameters for the single fine-tuning epoch.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,
    learning_rate=2e-5,
    warmup_steps=50,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,   # 2 per device x 8 accumulation steps = 16 per update
    fp16=torch.cuda.is_available(),  # fp16 is requested only when CUDA is available
    logging_steps=20,
    save_strategy="no",
    report_to="none"
)

# mlm=False: the collator is configured without masked-language-model masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=lm_dataset["train"]
)

trainer.train()


In [None]:
# 6) Save model

# Tokenizer files and model weights both go to OUTPUT_DIR; the generation cell
# later loads model and tokenizer from that same path.
tokenizer.save_pretrained(OUTPUT_DIR)
trainer.save_model(OUTPUT_DIR)
print("Saved to:", OUTPUT_DIR)


In [None]:
from transformers import pipeline, GenerationConfig

def build_generator(model_path_or_name):
    """Create a text-generation pipeline for one checkpoint.

    Args:
        model_path_or_name: Passed to ``pipeline`` as both ``model`` and
            ``tokenizer``.

    Returns:
        The pipeline object, built with ``device=0`` when CUDA is available
        and ``device=-1`` otherwise.
    """
    if torch.cuda.is_available():
        device_index = 0
    else:
        device_index = -1

    return pipeline(
        "text-generation",
        model=model_path_or_name,
        tokenizer=model_path_or_name,
        device=device_index
    )

def generate(gen, prompt, max_new_tokens=120):
    """Sample one continuation of ``prompt`` from a text-generation pipeline.

    Args:
        gen: Callable pipeline; it is invoked with the prompt plus sampling
            keyword arguments and must return a list of dicts containing
            ``"generated_text"``.
        prompt: Text to continue.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The ``"generated_text"`` value of the first result.
    """
    # Take the padding id from the pipeline's own tokenizer instead of the
    # hard-coded 50256, which is only correct for GPT-2-family vocabularies.
    # Fall back to 50256 when the object exposes no tokenizer / EOS id.
    tok = getattr(gen, "tokenizer", None)
    pad_token_id = getattr(tok, "eos_token_id", None)
    if pad_token_id is None:
        pad_token_id = 50256

    outputs = gen(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.15,
        no_repeat_ngram_size=3,
        pad_token_id=pad_token_id
    )
    return outputs[0]["generated_text"]

prompt = "Topic: Transformers\nWrite a concise note in 5-7 lines:\n"

base_gen = build_generator(MODEL_NAME)
ft_gen = build_generator(OUTPUT_DIR)

# generate() samples (do_sample=True). Reseeding before each call makes the
# comparison repeatable across runs and starts both models from the same
# random state.
GENERATION_SEED = 42

print("=== BASE GPT-2 ===")
torch.manual_seed(GENERATION_SEED)
print(generate(base_gen, prompt))

print("\n=== FINE-TUNED GPT-2 ===")
torch.manual_seed(GENERATION_SEED)
print(generate(ft_gen, prompt))


In [None]:
# 8) Export artifacts to download / GitHub

# Shell command: IPython substitutes {OUTPUT_DIR} with the Python variable's
# value, then zip recursively (-r) archives that folder.
!zip -r CODECRAFT_GA_01_artifacts.zip {OUTPUT_DIR}
print("Zipped artifacts.")


In [None]:
import os, shutil

OUTPUT_DIR = "gpt2-codecraft-ga01"
SUBMIT_DIR = "CODECRAFT_GA_01_submit"

# Start from an empty submit folder on every run.
if os.path.exists(SUBMIT_DIR):
    shutil.rmtree(SUBMIT_DIR)
os.makedirs(SUBMIT_DIR, exist_ok=True)

# Only these top-level files are copied; anything else in OUTPUT_DIR
# (including sub-directories such as checkpoints) is left behind.
keep_files = {
    "config.json",
    "generation_config.json",
    "model.safetensors",
    "tokenizer.json",
    "tokenizer_config.json",
    "special_tokens_map.json",
    "vocab.json",
    "merges.txt"
}

# Pick the whitelisted regular files first, then copy them (with metadata).
wanted = [
    name
    for name in os.listdir(OUTPUT_DIR)
    if name in keep_files and os.path.isfile(os.path.join(OUTPUT_DIR, name))
]
for name in wanted:
    shutil.copy2(os.path.join(OUTPUT_DIR, name), os.path.join(SUBMIT_DIR, name))

print("Submit folder files:", os.listdir(SUBMIT_DIR))


In [None]:
# Shell command: recursively (-r) zip the CODECRAFT_GA_01_submit folder; the
# folder name is written literally and matches SUBMIT_DIR in the previous cell.
!zip -r CODECRAFT_GA_01_submit.zip CODECRAFT_GA_01_submit