In [1]:
# 0) Setup

!pip -q install transformers datasets accelerate evaluate

import os, torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

MODEL_NAME = "gpt2"  # use "distilgpt2" if you're short on GPU RAM
OUTPUT_DIR = "gpt2-codecraft-ga01"

print("GPU available:", torch.cuda.is_available())


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hGPU available: True


In [2]:
# 1) Upload dataset (data.txt)

from google.colab import files
uploaded = files.upload()  # upload data.txt

# Ensure filename is data.txt
assert "data.txt" in uploaded, "Please upload a file named data.txt"


Saving data.txt to data.txt


In [3]:
# 2) Load dataset as HF dataset

dataset = load_dataset("text", data_files={"train": "data.txt"})
print(dataset)
print("Sample:\n", dataset["train"][0]["text"][:300])


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 524
    })
})
Sample:
 Topic: Prompt Engineering Basics


In [4]:
# 3) Tokenizer + Model

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# GPT-2 has no pad token by default; set it safely
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.resize_token_embeddings(len(tokenizer))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/148 [00:00<?, ?it/s]

GPT2LMHeadModel LOAD REPORT from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50257, 768)

In [5]:
# 4) Tokenize + chunk into blocks

BLOCK_SIZE = 256  # 128/256 works well on Colab

def tokenize_fn(examples):
    return tokenizer(examples["text"])

tokenized = dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

def group_texts(examples):
    # concatenate then split into BLOCK_SIZE
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_len = len(concatenated["input_ids"])
    total_len = (total_len // BLOCK_SIZE) * BLOCK_SIZE
    result = {
        k: [t[i:i+BLOCK_SIZE] for i in range(0, total_len, BLOCK_SIZE)]
        for k, t in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_dataset = tokenized.map(group_texts, batched=True)
lm_dataset


Map:   0%|          | 0/524 [00:00<?, ? examples/s]

Map:   0%|          | 0/524 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 18
    })
})

In [6]:
# 5) Training

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,   # effective batch size = 16
    num_train_epochs=3,
    learning_rate=5e-5,
    warmup_steps=50,
    logging_steps=20,
    save_strategy="epoch",
    fp16=torch.cuda.is_available(),
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    data_collator=data_collator
)

trainer.train()


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=6, training_loss=4.960218111673991, metrics={'train_runtime': 38.7767, 'train_samples_per_second': 1.393, 'train_steps_per_second': 0.155, 'total_flos': 7054884864000.0, 'train_loss': 4.960218111673991, 'epoch': 3.0})

In [7]:
# 6) Save model

trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Saved to:", OUTPUT_DIR)


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Saved to: gpt2-codecraft-ga01


In [8]:
# 7) Generate text: base vs fine-tuned

from transformers import pipeline

def generate_with(model_path_or_name, prompt, max_new_tokens=120):
    gen = pipeline(
        "text-generation",
        model=model_path_or_name,
        tokenizer=model_path_or_name,
        device=0 if torch.cuda.is_available() else -1
    )
    out = gen(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.9,
        top_p=0.95
    )[0]["generated_text"]
    return out

prompt = "Topic: Transformers\n"
print("=== BASE GPT-2 ===")
print(generate_with(MODEL_NAME, prompt))

print("\n=== FINE-TUNED GPT-2 ===")
print(generate_with(OUTPUT_DIR, prompt))


=== BASE GPT-2 ===


Loading weights:   0%|          | 0/148 [00:00<?, ?it/s]

GPT2LMHeadModel LOAD REPORT from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Passing `generation_config` together with generation-related arguments=({'top_p', 'max_new_tokens', 'do_sample', 'temperature'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=120) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Topic: Transformers

Publisher: Marvel Comics

Publication Date: 1987

Retail Price: $16.99

TPB: $19.99


The story of the Transformers in the first two volumes of Transformers: Generations tells the story of a young man called Maximus, who is being drawn into the life of a young woman named Jelena. Jelena joins a boy named Optimus in the future. The boy starts to question his relationship with Maximus and Maximus is forced to choose between being a good friend or the other way around. Written by: Mark Millar

=== FINE-TUNED GPT-2 ===


Loading weights:   0%|          | 0/148 [00:00<?, ?it/s]

Both `max_new_tokens` (=120) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Topic: Transformers

Topics: Autobots, Transformers, Transformers 2, Transformers 3


Boom Boom

Topics: Doom, Doom 2, Doom 3


Beefy

Topics: Beefy


Beefy

Topics: Beefy


Beefy

Topics: Beefy


Beast Wars

Topics: Beast Wars


Beast Wars

Topics: Beast Wars


Beast Wars

Topics: Beast Wars


Beefy

Topics: Beefy


Beefy

Topics: Beefy


Beast Wars




In [9]:
# 8) Export artifacts to download / GitHub

!zip -r CODECRAFT_GA_01_artifacts.zip {OUTPUT_DIR}
print("Zipped artifacts.")


  adding: gpt2-codecraft-ga01/ (stored 0%)
  adding: gpt2-codecraft-ga01/tokenizer.json (deflated 82%)
  adding: gpt2-codecraft-ga01/training_args.bin (deflated 53%)
  adding: gpt2-codecraft-ga01/tokenizer_config.json (deflated 48%)
  adding: gpt2-codecraft-ga01/model.safetensors (deflated 7%)
  adding: gpt2-codecraft-ga01/config.json (deflated 52%)
  adding: gpt2-codecraft-ga01/checkpoint-2/ (stored 0%)
  adding: gpt2-codecraft-ga01/checkpoint-2/tokenizer.json (deflated 82%)
  adding: gpt2-codecraft-ga01/checkpoint-2/trainer_state.json (deflated 56%)
  adding: gpt2-codecraft-ga01/checkpoint-2/training_args.bin (deflated 53%)
  adding: gpt2-codecraft-ga01/checkpoint-2/tokenizer_config.json (deflated 48%)
  adding: gpt2-codecraft-ga01/checkpoint-2/rng_state.pth (deflated 26%)
  adding: gpt2-codecraft-ga01/checkpoint-2/optimizer.pt (deflated 8%)
  adding: gpt2-codecraft-ga01/checkpoint-2/scaler.pt (deflated 64%)
  adding: gpt2-codecraft-ga01/checkpoint-2/model.safetensors (deflated 7%)
 

In [10]:
import os, shutil

OUTPUT_DIR = "gpt2-codecraft-ga01"
SUBMIT_DIR = "CODECRAFT_GA_01_submit"

# clean old submit folder
if os.path.exists(SUBMIT_DIR):
    shutil.rmtree(SUBMIT_DIR)

os.makedirs(SUBMIT_DIR, exist_ok=True)

# Copy only final model + tokenizer files from OUTPUT_DIR (NOT checkpoints)
keep_files = {
    "config.json",
    "generation_config.json",
    "model.safetensors",
    "tokenizer.json",
    "tokenizer_config.json",
    "special_tokens_map.json",
    "vocab.json",
    "merges.txt"
}

for f in os.listdir(OUTPUT_DIR):
    src = os.path.join(OUTPUT_DIR, f)
    if os.path.isfile(src) and f in keep_files:
        shutil.copy2(src, os.path.join(SUBMIT_DIR, f))

print("Submit folder files:", os.listdir(SUBMIT_DIR))


Submit folder files: ['tokenizer.json', 'tokenizer_config.json', 'model.safetensors', 'config.json', 'generation_config.json']


In [11]:
!zip -r CODECRAFT_GA_01_submit.zip CODECRAFT_GA_01_submit

  adding: CODECRAFT_GA_01_submit/ (stored 0%)
  adding: CODECRAFT_GA_01_submit/tokenizer.json (deflated 82%)
  adding: CODECRAFT_GA_01_submit/tokenizer_config.json (deflated 48%)
  adding: CODECRAFT_GA_01_submit/model.safetensors (deflated 7%)
  adding: CODECRAFT_GA_01_submit/config.json (deflated 52%)
  adding: CODECRAFT_GA_01_submit/generation_config.json (deflated 25%)
