In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ml-data/Corpus/ML_corpus.txt


In [None]:
# ---------------------------------------------------------
# FIX ENVIRONMENT BEFORE ANYTHING ELSE
# ---------------------------------------------------------

# 1. Fix protobuf issue (transformers bug on Kaggle)
!pip install protobuf==4.25.3 --quiet --force-reinstall

# 2. Remove TensorFlow to avoid cuDNN/cuFFT/cuBLAS spam + protobuf conflict
!pip uninstall -y tensorflow tensorflow-cpu tensorflow-gpu keras

# 3. Fix pyarrow to correct version (Kaggle uses pyarrow-19.x)
!pip install pyarrow==19.0.0 --quiet --force-reinstall --no-deps

# 4. Install ONLY the necessary libraries WITHOUT upgrading pyarrow again
!pip install transformers==4.53.3 datasets==2.19.1 accelerate tokenizers==0.21.2 --quiet

In [1]:
import os
from tokenizers import ByteLevelBPETokenizer
from transformers import GPT2TokenizerFast

CORPUS_PATH = "/kaggle/input/ml-data/Corpus/ML_corpus.txt"
TOKENIZER_DIR = "./custom_tokenizer"

print("Corpus exists:", os.path.exists(CORPUS_PATH))

Corpus exists: True


In [3]:
if not os.path.exists(TOKENIZER_DIR):
    os.makedirs(TOKENIZER_DIR)

print("Training custom tokenizer...")

tokenizer_raw = ByteLevelBPETokenizer()

tokenizer_raw.train(
    files=CORPUS_PATH,
    vocab_size=24000,
    min_frequency=2,
    special_tokens=["<|pad|>","<|bos|>","<|eos|>","<|unk|>"]
)

tokenizer_raw.save_model(TOKENIZER_DIR)

tokenizer = GPT2TokenizerFast.from_pretrained(
    TOKENIZER_DIR,
    pad_token="<|pad|>",
    bos_token="<|bos|>",
    eos_token="<|eos|>",
    unk_token="<|unk|>"
)

print("Custom tokenizer vocab size:", tokenizer.vocab_size)

Training custom tokenizer...



Custom tokenizer vocab size: 24000


In [11]:
from transformers import GPT2Config, GPT2LMHeadModel

CONTEXT_SIZE = 384

config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=384,
    n_ctx=384,
    n_embd=384,
    n_layer=10,
    n_head=6
)

model = GPT2LMHeadModel(config)
model.resize_token_embeddings(tokenizer.vocab_size)

print("Model parameters (M):", round(sum(p.numel() for p in model.parameters())/1e6, 3))

Model parameters (M): 27.109


In [5]:
from datasets import load_dataset

dataset = load_dataset("text", data_files={"train": CORPUS_PATH})
dataset = dataset["train"].train_test_split(test_size=0.01, seed=42)

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=CONTEXT_SIZE,
        return_overflowing_tokens=False,
    )

tokenized = dataset.map(
    tokenize,
    batched=True,
    num_proc=1,
    remove_columns=["text"]
)

tokenized = tokenized.filter(lambda x: len(x["input_ids"]) > 0)

Generating train split: 0 examples [00:00, ? examples/s]

Map (num_proc=1):   0%|          | 0/1756453 [00:00<?, ? examples/s]

Map (num_proc=1):   0%|          | 0/17742 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1756453 [00:00<?, ? examples/s]

Filter:   0%|          | 0/17742 [00:00<?, ? examples/s]

In [6]:
import numpy as np

train_max = np.max([np.max(x) for x in tokenized["train"]["input_ids"]])
test_max = np.max([np.max(x) for x in tokenized["test"]["input_ids"]])

print("vocab size:", tokenizer.vocab_size)
print("max train id:", train_max)
print("max test id:", test_max)

vocab size: 24000
max train id: 23999
max test id: 23999


In [12]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [13]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./ml_gpt_model",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    learning_rate=5e-4,
    warmup_steps=200,
    weight_decay=0.1,

    logging_steps=500,

    eval_strategy="steps",
    eval_steps=10000,
    per_device_eval_batch_size=16,
    prediction_loss_only=True,

    save_strategy="no",
    bf16=True,
    report_to="none"
)

In [14]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"]
)

In [15]:
trainer.train()

Step,Training Loss,Validation Loss
10000,1.8416,3.623037
20000,1.7211,3.401062
30000,1.6527,3.270133




TrainOutput(global_step=37810, training_loss=1.796148332960547, metrics={'train_runtime': 33876.3197, 'train_samples_per_second': 71.428, 'train_steps_per_second': 1.116, 'total_flos': 8.667233317239091e+16, 'train_loss': 1.796148332960547, 'epoch': 2.0})

In [16]:
trainer.save_model("./ml_gpt_model")
tokenizer.save_pretrained("./ml_gpt_model")

('./ml_gpt_model/tokenizer_config.json',
 './ml_gpt_model/special_tokens_map.json',
 './ml_gpt_model/vocab.json',
 './ml_gpt_model/merges.txt',
 './ml_gpt_model/added_tokens.json',
 './ml_gpt_model/tokenizer.json')

In [17]:
!zip -r model.zip ./ml_gpt_model
from IPython.display import FileLink
FileLink('model.zip')

  adding: ml_gpt_model/ (stored 0%)
  adding: ml_gpt_model/special_tokens_map.json (deflated 78%)
  adding: ml_gpt_model/generation_config.json (deflated 24%)
  adding: ml_gpt_model/tokenizer.json (deflated 82%)
  adding: ml_gpt_model/training_args.bin (deflated 52%)
  adding: ml_gpt_model/model.safetensors (deflated 7%)
  adding: ml_gpt_model/tokenizer_config.json (deflated 73%)
  adding: ml_gpt_model/vocab.json (deflated 60%)
  adding: ml_gpt_model/config.json (deflated 51%)
  adding: ml_gpt_model/merges.txt (deflated 57%)
