# WikiText-103 with Hugging Face + Needle

This notebook does the following:

1. Uses **Hugging Face `datasets`** to download WikiText-103 (`wikitext-103-v1`).
2. Writes the splits into `wiki.train.tokens`, `wiki.valid.tokens`, `wiki.test.tokens` in a local folder.
3. Uses your existing **`needle.data.datasets.wikitext_dataset`** `Corpus` + `batchify` utilities.
4. Trains and evaluates two language models (a dense Transformer and a top-k MoE Transformer) using your **`train_wikitext`** and **`evaluate_wikitext`** functions from `apps/simple_ml.py`.

In [1]:
# Code to set up the assignment
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd /content/drive/MyDrive/10714/project
!pip3 install pybind11

Mounted at /content/drive
/content/drive/MyDrive/10714/project
Collecting pybind11
  Downloading pybind11-3.0.1-py3-none-any.whl.metadata (10.0 kB)
Downloading pybind11-3.0.1-py3-none-any.whl (293 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.6/293.6 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pybind11
Successfully installed pybind11-3.0.1


In [2]:
# Run the Makefile in the current directory (the project folder selected by the
# `%cd` above) — presumably this compiles needle's native CPU/CUDA backends.
!make

  Compatibility with CMake < 3.10 will be removed from a future version of
  CMake.

  Update the VERSION argument <min> value.  Or, use the <min>...<max> syntax
  to tell CMake that the project requires at least <min> but has been updated
  to work with policies introduced by <max> or earlier.

[0m
-- Found pybind11: /usr/local/lib/python3.12/dist-packages/pybind11/include (found version "3.0.1")
-- Found cuda, building cuda backend
Fri Nov 28 04:26:59 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        | 

In [3]:
# Export environment variables for this kernel and any subprocess it spawns.
# NOTE(review): PYTHONPATH is only read at interpreter start-up, so it mainly
# helps `!python ...` subprocesses; the in-kernel import path is extended via
# sys.path in the following cells.
%set_env PYTHONPATH ./python
# NOTE(review): presumably selects needle's own NDArray backend — confirm in needle.
%set_env NEEDLE_BACKEND nd

env: PYTHONPATH=./python
env: NEEDLE_BACKEND=nd


In [4]:
import sys
# Make the packages under ./python importable from this kernel
# (the path is relative to the current working directory).
sys.path.append('./python')

In [5]:
import os, math, time, sys
import numpy as np

# Extend the import path (relative to the current working directory) with both
# the `python/` and `apps/` folders before importing anything from the project.
sys.path.append("python")
sys.path.append("apps")

import needle as ndl
import needle.nn as nn
from needle import Tensor

# Training / evaluation helpers
from apps.simple_ml import train_wikitext, evaluate_wikitext

# Language model definition used by both models below.
# (An earlier variant imported it from apps.models; adjust if your class lives elsewhere.)
from needle.nn.nn_lm import LanguageModel

# WikiText dataset helpers (Corpus, batchify), referenced below as `wt`
from needle.data.datasets import wikitext_dataset as wt

# All tensors and models below are created on this device.
# Swap in ndl.cpu() to run without a GPU backend.
device = ndl.cuda()


Using needle backend


## Download WikiText-103 using Hugging Face `datasets`

This defines a small helper that

- calls `load_dataset("wikitext", "wikitext-103-v1")`
- writes `wiki.train.tokens`, `wiki.valid.tokens`, `wiki.test.tokens`
  into `data_dir`.

In [6]:
def download_wikitext103_hf(data_dir: str = "./wikitext-103", overwrite: bool = False) -> str:
    """Download WikiText-103 via Hugging Face `datasets` and write it as text files.

    Creates three files in `data_dir`:
        - wiki.train.tokens
        - wiki.valid.tokens
        - wiki.test.tokens

    Each split is first written to a sibling ``<name>.tmp`` file and then moved
    into place with ``os.replace``. A run that is interrupted mid-write therefore
    never leaves a truncated ``.tokens`` file behind that a later call would
    mistake for a complete download.

    Args:
        data_dir: Directory that receives the ``.tokens`` files (created if missing).
        overwrite: When False and all three files already exist, nothing is
            downloaded. When True the files are regenerated.

    Returns:
        data_dir (str): directory containing the .tokens files.

    Raises:
        RuntimeError: If the `datasets` package is not installed.
    """
    os.makedirs(data_dir, exist_ok=True)

    train_f = os.path.join(data_dir, "wiki.train.tokens")
    valid_f = os.path.join(data_dir, "wiki.valid.tokens")
    test_f  = os.path.join(data_dir, "wiki.test.tokens")

    if not overwrite and all(os.path.exists(p) for p in (train_f, valid_f, test_f)):
        print(f"[wikitext] Files already exist in {data_dir}, skipping download.")
        return data_dir

    # Imported lazily so the notebook still loads when `datasets` is absent
    # and the files are already on disk.
    try:
        from datasets import load_dataset
    except ImportError as e:
        raise RuntimeError(
            "Hugging Face `datasets` is not installed. "
            "Install it with `pip install datasets`."
        ) from e

    print("[wikitext] Downloading WikiText-103 via Hugging Face datasets...")
    ds = load_dataset("wikitext", "wikitext-103-v1")

    def _write_split(split_name: str, out_path: str) -> None:
        """Write one split to `out_path`, one row per line, atomically."""
        tmp_path = out_path + ".tmp"
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                for row in ds[split_name]:
                    # HF can give None for empty lines
                    text = row["text"] if row["text"] is not None else ""
                    f.write(text.rstrip() + "\n")
            # Only publish the file once it has been written completely.
            os.replace(tmp_path, out_path)
        finally:
            # After a successful replace the temp file is gone; on failure,
            # remove the partial output so it cannot be picked up later.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    _write_split("train", train_f)
    _write_split("validation", valid_f)
    _write_split("test", test_f)

    print(f"[wikitext] Saved splits to {data_dir}")
    return data_dir


## `Corpus` and batchify

We now:

1. Call the downloader (only downloads the first time).
2. Use `wt.Corpus` + `wt.batchify` to get language-model training data.

In [7]:
# Folder holding the wiki.*.tokens files (change to relocate the dataset).
data_dir = "./wikitext-103"

# Step 1: fetch the splits; a no-op once all three files are on disk.
download_wikitext103_hf(data_dir, overwrite=False)

# Step 2: build the Corpus.
# max_lines=None reads everything (use a small int for quick debugging);
# use_subword=False must match what your wikitext_dataset implementation supports.
corpus = wt.Corpus(data_dir, max_lines=None, use_subword=False)

vocab_size = corpus.vocab_size
print("Vocab size:", vocab_size)

batch_size = 1


def _to_batches(token_ids):
    """Batchify one split with the notebook-wide batch size, device and dtype."""
    return wt.batchify(token_ids, batch_size, device=device, dtype="float32")


train_data = _to_batches(corpus.train)
valid_data = _to_batches(corpus.valid)
test_data = _to_batches(corpus.test)

for _label, _split in (
    ("Train data shape:", train_data),
    ("Valid data shape:", valid_data),
    ("Test  data shape:", test_data),
):
    print(_label, _split.shape)


[wikitext] Files already exist in ./wikitext-103, skipping download.
Vocab size: 267735
Train data shape: (103227021, 1)
Valid data shape: (217646, 1)
Test  data shape: (245569, 1)


## Language model

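We build two models from the `LanguageModel` class in `needle.nn.nn_lm`: a dense Transformer (`block_type="Transformer"`) and a top-k mixture-of-experts variant (`block_type="TopkMoETransformer"`).
The constructor takes `vocab_size`, `embedding_size`, `hidden_size`, `n_layers`, `device`, `dtype`, plus the other keyword arguments set in the configuration cells below.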

Change the constructor / class name if your implementation differs.

In [8]:
# Hyperparameters for the dense Transformer language model. Every name here is
# passed by keyword to LanguageModel(...) in the next cell.
embedding_size = 512
max_position_embeddings = 1024
learnable_word_embeddings = True
n_layers = 2
block_type = "Transformer"
hidden_size = 512
# num_head * dim_head = 8 * 64 = 512, i.e. equal to hidden_size.
num_head = 8
dim_head = 64
dropout = 0.1
causal = True
batch_first = True
# Model-side sequence length is tied to the positional-embedding limit (1024);
# note the training cells below use their own, shorter `seq_len`.
sequence_len = max_position_embeddings
resid_dropout = 0.
layer_norm_epsilon = 1e-5
pad_vocab_size_multiple = 8
label_smoothing = 0.
tie_word_embeddings = True

# MoE-only settings; left disabled here because the dense model does not pass them.
# num_experts = 4
# topk = 4

In [9]:
# Gather the dense-Transformer settings from the configuration cell above into
# one mapping, then build the model from it.
transformer_lm_kwargs = dict(
    embedding_size=embedding_size,
    vocab_size=vocab_size,
    max_position_embeddings=max_position_embeddings,
    learnable_word_embeddings=learnable_word_embeddings,
    n_layers=n_layers,
    block_type=block_type,
    hidden_size=hidden_size,
    num_head=num_head,
    dim_head=dim_head,
    dropout=dropout,
    causal=causal,
    batch_first=batch_first,
    sequence_len=sequence_len,
    resid_dropout=resid_dropout,
    layer_norm_epsilon=layer_norm_epsilon,
    pad_vocab_size_multiple=pad_vocab_size_multiple,
    label_smoothing=label_smoothing,
    tie_word_embeddings=tie_word_embeddings,
    device=device,
    dtype="float32",
)
transformer_lm = LanguageModel(**transformer_lm_kwargs)

print(transformer_lm)

<needle.nn.nn_lm.LanguageModel object at 0x796766c00740>


In [10]:
# Hyperparameters for the top-k mixture-of-experts Transformer. Every name here
# is passed by keyword to LanguageModel(...) in the next cell.
# WARNING: this cell rebinds the same global names used to build `transformer_lm`
# (notably `block_type`), so re-running the dense-model construction cell after
# this one would pick up these values instead.
embedding_size = 512
max_position_embeddings = 1024
learnable_word_embeddings = True
n_layers = 2
block_type = "TopkMoETransformer"
hidden_size = 512
# num_head * dim_head = 8 * 64 = 512, i.e. equal to hidden_size.
num_head = 8
dim_head = 64
dropout = 0.1
causal = True
batch_first = True
# Model-side sequence length is tied to the positional-embedding limit (1024).
sequence_len = max_position_embeddings
resid_dropout = 0.
layer_norm_epsilon = 1e-5
pad_vocab_size_multiple = 8
label_smoothing = 0.
tie_word_embeddings = True

# MoE-specific settings — presumably 4 experts per block with the top 2
# selected per token; confirm against the LanguageModel implementation.
num_experts = 4
topk = 2

In [11]:
# Gather the MoE-Transformer settings from the configuration cell above into
# one mapping (including the expert-routing options), then build the model.
transformer_moe_kwargs = dict(
    embedding_size=embedding_size,
    vocab_size=vocab_size,
    max_position_embeddings=max_position_embeddings,
    learnable_word_embeddings=learnable_word_embeddings,
    n_layers=n_layers,
    block_type=block_type,
    hidden_size=hidden_size,
    num_head=num_head,
    dim_head=dim_head,
    dropout=dropout,
    causal=causal,
    batch_first=batch_first,
    sequence_len=sequence_len,
    resid_dropout=resid_dropout,
    layer_norm_epsilon=layer_norm_epsilon,
    pad_vocab_size_multiple=pad_vocab_size_multiple,
    label_smoothing=label_smoothing,
    tie_word_embeddings=tie_word_embeddings,
    device=device,
    dtype="float32",
    num_experts=num_experts,
    topk=topk,
)
transformer_moe = LanguageModel(**transformer_moe_kwargs)

print(transformer_moe)

<needle.nn.nn_lm.LanguageModel object at 0x796766c5be60>


## Train on WikiText-103 -- transformer_lm

Call the `train_wikitext` function from `apps/simple_ml.py`.
Feel free to tweak `n_epochs`, `lr`, optimizer, etc.

In [12]:
# Training hyperparameters for the dense Transformer run.
seq_len = 40         # BPTT length
n_epochs = 1
learning_rate = 1e-3
weight_decay = 0.0
clip = 0.25

# Keyword arguments forwarded verbatim to train_wikitext.
lm_train_kwargs = dict(
    seq_len=seq_len,
    n_epochs=n_epochs,
    optimizer=ndl.optim.SGD,   # or ndl.optim.Adam
    lr=learning_rate,
    weight_decay=weight_decay,
    loss_fn=nn.SoftmaxLoss,
    clip=clip,
    device=device,
    dtype="float32",
)

# Wall-clock timing around the whole training call.
start_time = time.time()
train_acc, train_loss = train_wikitext(transformer_lm, train_data, **lm_train_kwargs)
end_time = time.time()

print(f"Training finished in {end_time - start_time:.2f} seconds.")
print(f"Final train loss: {train_loss:.4f}, train acc: {train_acc:.4f}")
print(f"Train perplexity: {math.exp(train_loss):.4f}")


Training WikiText-103:   0%|          | 0/1 [06:00<?, ?it/s]


RuntimeError: out of memory

## Evaluate on validation and test -- transformer_lm


In [None]:
# Settings shared by the validation and test evaluations of the dense model.
lm_eval_kwargs = dict(
    seq_len=seq_len,
    loss_fn=nn.SoftmaxLoss,
    device=device,
    dtype="float32",
)

# Validation split
val_acc, val_loss = evaluate_wikitext(transformer_lm, valid_data, **lm_eval_kwargs)
print(f"Valid loss: {val_loss:.4f}, valid acc: {val_acc:.4f}")
print(f"Valid perplexity: {math.exp(val_loss):.4f}")

# Test split
test_acc, test_loss = evaluate_wikitext(transformer_lm, test_data, **lm_eval_kwargs)
print(f"Test loss: {test_loss:.4f}, test acc: {test_acc:.4f}")
print(f"Test perplexity: {math.exp(test_loss):.4f}")


## Train on WikiText-103 -- transformer_moe

Call the `train_wikitext` function from `apps/simple_ml.py`.
Feel free to tweak `n_epochs`, `lr`, optimizer, etc.

In [None]:
# Training hyperparameters for the MoE Transformer run.
seq_len = 40         # BPTT length
n_epochs = 1
learning_rate = 1e-3
weight_decay = 0.0
clip = 0.25

# Keyword arguments forwarded verbatim to train_wikitext.
moe_train_kwargs = dict(
    seq_len=seq_len,
    n_epochs=n_epochs,
    optimizer=ndl.optim.SGD,   # or ndl.optim.Adam
    lr=learning_rate,
    weight_decay=weight_decay,
    loss_fn=nn.SoftmaxLoss,
    clip=clip,
    device=device,
    dtype="float32",
)

# Wall-clock timing around the whole training call.
start_time = time.time()
train_acc, train_loss = train_wikitext(transformer_moe, train_data, **moe_train_kwargs)
end_time = time.time()

print(f"Training finished in {end_time - start_time:.2f} seconds.")
print(f"Final train loss: {train_loss:.4f}, train acc: {train_acc:.4f}")
print(f"Train perplexity: {math.exp(train_loss):.4f}")

## Evaluate on validation and test -- transformer_moe

In [None]:
# Settings shared by the validation and test evaluations of the MoE model.
moe_eval_kwargs = dict(
    seq_len=seq_len,
    loss_fn=nn.SoftmaxLoss,
    device=device,
    dtype="float32",
)

# Validation split
val_acc, val_loss = evaluate_wikitext(transformer_moe, valid_data, **moe_eval_kwargs)
print(f"Valid loss: {val_loss:.4f}, valid acc: {val_acc:.4f}")
print(f"Valid perplexity: {math.exp(val_loss):.4f}")

# Test split
test_acc, test_loss = evaluate_wikitext(transformer_moe, test_data, **moe_eval_kwargs)
print(f"Test loss: {test_loss:.4f}, test acc: {test_acc:.4f}")
print(f"Test perplexity: {math.exp(test_loss):.4f}")
