In [1]:
!pip install -q datasets nltk tqdm

In [4]:
from datasets import load_dataset

# Use streaming=True to avoid full download
dataset = load_dataset("HuggingFaceFW/fineweb", split="train", streaming=True)

# Take only the first 100k samples into memory
sample = []
for i, row in enumerate(dataset):
    if i >= 100_000:  # stop after 100k
        break
    sample.append(row["text"])

print("Pulled {len(sample)} samples safely without downloading entire dataset.")


Resolving data files:   0%|          | 0/27468 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/27468 [00:00<?, ?it/s]

Pulled {len(sample)} samples safely without downloading entire dataset.


In [5]:
import nltk, os, json
from tqdm import tqdm
from nltk.tokenize import word_tokenize
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Clean text (remove extra spaces)
cleaned = [" ".join(t.split()) for t in sample if isinstance(t, str) and len(t.strip()) > 0]

In [8]:
import nltk
nltk.download("punkt")
nltk.download("punkt_tab")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [9]:
# Quick word-count sanity check
lengths = [len(word_tokenize(t)) for t in tqdm(cleaned, desc="Counting words")]
print(f"Average words/document: {sum(lengths)/len(lengths):.2f}")
print(f"Max words/document: {max(lengths)}")

Counting words: 100%|██████████| 100000/100000 [03:58<00:00, 418.65it/s]

Average words/document: 567.57
Max words/document: 104698





In [11]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors
from tokenizers.normalizers import NFKC
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from tokenizers.processors import TemplateProcessing
from tokenizers import decoders
from tqdm import tqdm
import json, os

# Load dataset
# data_path = "out"  # path where fineweb_shard_*.jsonl files are stored
# texts = []
# for filename in os.listdir(data_path):
#     if filename.endswith(".jsonl"):
#         with open(os.path.join(data_path, filename), "r", encoding="utf-8") as f:
#             for line in f:
#                 obj = json.loads(line)
#                 texts.append(obj["text"])

# Use the 'sample' variable loaded in the previous cell
texts = sample

print(f"Loaded {len(texts)} documents for tokenizer training.")

# Initialize tokenizer
tokenizer = Tokenizer(models.BPE())
tokenizer.normalizer = NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Trainer configuration
trainer = BpeTrainer(
    vocab_size=32000,
    min_frequency=2,
    special_tokens=["<pad>", "<unk>", "<s>", "</s>", "<mask>"]
)

# Train tokenizer
tokenizer.train_from_iterator(tqdm(texts, desc="Training tokenizer"), trainer=trainer)
print("Tokenizer training complete.")

# Add post-processing for BOS/EOS tokens
tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    pair="<s> $A </s> </s> $B </s>",
    special_tokens=[
        ("<s>", tokenizer.token_to_id("<s>")),
        ("</s>", tokenizer.token_to_id("</s>")),
    ],
)

# Save tokenizer files
os.makedirs("slm_tokenizer", exist_ok=True)
tokenizer.save("slm_tokenizer/tokenizer.json")
print("Tokenizer saved at ./slm_tokenizer/tokenizer.json")

# Quick validation
encoded = tokenizer.encode("FineWeb dataset is awesome for training small LLMs!")
print("Sample tokens:", encoded.tokens)
print("Token IDs:", encoded.ids)

Loaded 100000 documents for tokenizer training.


Training tokenizer: 100%|██████████| 100000/100000 [00:09<00:00, 11035.77it/s]


Tokenizer training complete.
Tokenizer saved at ./slm_tokenizer/tokenizer.json
Sample tokens: ['<s>', 'Fine', 'Web', 'datas', 'et', 'is', 'awesome', 'for', 'training', 'small', 'LL', 'Ms', '!', '</s>']
Token IDs: [2, 12478, 4364, 27346, 2669, 2636, 7423, 2652, 4657, 3541, 7202, 7857, 5, 3]


In [15]:
# TRAIN SMALL LANGUAGE MODEL

from transformers import (
    AutoTokenizer,
    GPT2Config,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    PreTrainedTokenizerFast # Import PreTrainedTokenizerFast
)
from datasets import Dataset
import torch, json, os
from tokenizers import Tokenizer # Import Tokenizer from tokenizers library

# Load your trained tokenizer from the saved file
tokenizer_file = "slm_tokenizer/tokenizer.json"
if not os.path.exists(tokenizer_file):
    raise FileNotFoundError(f"Tokenizer file not found at {tokenizer_file}")

# Load the tokenizer using the tokenizers library
custom_tokenizer = Tokenizer.from_file(tokenizer_file)

# Wrap the custom tokenizer in a transformers compatible tokenizer
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=custom_tokenizer,
    # Add special tokens if they are not automatically recognized
    pad_token="<pad>",
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    mask_token="<mask>"
)


# Prepare dataset
# Use the 'sample' variable loaded in the previous cell instead of reading from 'out' directory
# data = []
# for shard_file in os.listdir("out"):
#     if shard_file.endswith(".jsonl"):
#         with open(os.path.join("out", shard_file), "r", encoding="utf-8") as f:
#             for line in f:
#                 data.append(json.loads(line)["text"])
# dataset = Dataset.from_dict({"text": data})

# Use the cleaned data from the previous cell
dataset = Dataset.from_dict({"text": cleaned})


def tokenize_function(examples):
    # Ensure that the text is a string before tokenizing
    texts_to_tokenize = [text if isinstance(text, str) else "" for text in examples["text"]]
    return tokenizer(texts_to_tokenize, truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Split train/validation
split = tokenized_dataset.train_test_split(test_size=0.05)
train_dataset = split["train"]
eval_dataset = split["test"]

# Define GPT-like model configuration
config = GPT2Config(
    vocab_size=len(tokenizer),
    n_positions=128,
    n_ctx=128,
    n_embd=256,
    n_layer=4,
    n_head=4,
    bos_token_id=tokenizer.convert_tokens_to_ids("<s>"),
    eos_token_id=tokenizer.convert_tokens_to_ids("</s>"),
    pad_token_id=tokenizer.convert_tokens_to_ids("<pad>"),
)

model = GPT2LMHeadModel(config)

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training setup
training_args = TrainingArguments(
    output_dir="slm_model",
    eval_strategy="epoch", # Changed from evaluation_strategy
    learning_rate=2e-4,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,  # try 1-3 for now
    save_total_limit=2,
    logging_steps=50,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer, # Use the wrapped tokenizer
    data_collator=data_collator,
)

trainer.train()

print("Training complete. Model saved in ./slm_model/")

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,5.64,5.53194
2,5.3294,5.308854


Training complete. Model saved in ./slm_model/


In [17]:
from transformers import pipeline, AutoModelForCausalLM, PreTrainedTokenizerFast
import os
from tokenizers import Tokenizer

# Define the path to the trained model checkpoint and tokenizer
model_path = "./slm_model/checkpoint-47500" # Use the path to the latest checkpoint
tokenizer_file = "./slm_tokenizer/tokenizer.json"

# Load the tokenizer using the tokenizers library and wrap it
if not os.path.exists(tokenizer_file):
    raise FileNotFoundError(f"Tokenizer file not found at {tokenizer_file}")
custom_tokenizer = Tokenizer.from_file(tokenizer_file)
tokenizer = PreTrainedTokenizerFast(tokenizer_object=custom_tokenizer,
                                     pad_token="<pad>",
                                     bos_token="<s>",
                                     eos_token="</s>",
                                     unk_token="<unk>",
                                     mask_token="<mask>")


# Load the model
model = AutoModelForCausalLM.from_pretrained(model_path)


# Create the pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Generate text
print(generator("The future of hospitality AI is", max_length=50, num_return_sequences=1))

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'generated_text': 'The future of hospitality AI is You can create a new digital and professional and elegant and unique design products. This product is designed for our latest design and innovative and design, and design and provides an online quality quality, durability and advanced quality of the style. The design of the design is based on an ideal way of a variety of style. With our unique collection of our range of quality and innovative, the product is a wide collection of professional design, we also offer an exceptional and unique range of products, offering some of our clients, our customers with our creative, and products. - The design and design of our products are'}]


In [18]:
!ls -R ./slm_model

./slm_model:
checkpoint-47000  checkpoint-47500  runs

./slm_model/checkpoint-47000:
config.json		rng_state.pth		 tokenizer.json
generation_config.json	scheduler.pt		 trainer_state.json
model.safetensors	special_tokens_map.json  training_args.bin
optimizer.pt		tokenizer_config.json

./slm_model/checkpoint-47500:
config.json		rng_state.pth		 tokenizer.json
generation_config.json	scheduler.pt		 trainer_state.json
model.safetensors	special_tokens_map.json  training_args.bin
optimizer.pt		tokenizer_config.json

./slm_model/runs:
Oct22_06-28-57_d4363890fad3  Oct22_06-42-31_d4363890fad3

./slm_model/runs/Oct22_06-28-57_d4363890fad3:
events.out.tfevents.1761114540.d4363890fad3.1010.0

./slm_model/runs/Oct22_06-42-31_d4363890fad3:
events.out.tfevents.1761115351.d4363890fad3.1010.1


In [20]:
import math

eval_results = trainer.evaluate()

# Compute Perplexity
perplexity = math.exp(eval_results["eval_loss"])
print(f"\n🔹 Evaluation Loss: {eval_results['eval_loss']:.4f}")
print(f"🔹 Perplexity (PPL): {perplexity:.2f}")


🔹 Evaluation Loss: 5.3089
🔹 Perplexity (PPL): 202.12


Evaluation Loss — 5.3089

This is the average cross-entropy loss across the validation set.
It represents how well the model predicts the next token in a sequence.
Lower loss = better model fit.

Interpretation:
A loss of around 5.3 means that on average, the model is somewhat uncertain when predicting the next word — it often assigns only a moderate probability to the correct next token.
For a small language model (SLM) trained on 100K samples, this is quite reasonable.

Context:
In large-scale LLMs like GPT-2 or Phi-2, evaluation loss typically falls between 1.5–3.0 after training on billions of tokens.
Since the model is trained on a much smaller dataset and for only 2 epochs, a loss in the 5–6 range indicates that learning has occurred but there’s still substantial room for improvement.