In [1]:
import torch
from tokenizer import Tokenizer

In [2]:
with open("./resources/harrypotter.txt", 'r') as f:
    text = f.read()

token = Tokenizer.load_json(r"resources/TokenizerModel.json")
vocab_size = len(token.vocab)

Tokenizer loaded from resources/TokenizerModel.json


In [3]:
token.encode("Hello")

[72, 101, 285, 111]

In [3]:
data = torch.tensor(token.encode(text), dtype=torch.long)
data.shape, data.dtype

(torch.Size([3109946]), torch.int64)

In [4]:
# Split up the data into train and validation sets
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [5]:
# What's the length of characters does transformer look at once
block_size = 8
print(train_data[:block_size+1])

inputs = train_data[:block_size]
tragets = train_data[1:block_size+1]
for t in range(block_size):
    print(f"When input is {inputs[:t+1]}, the target will be {tragets[t]}")
    

tensor([ 84,  72,  69, 394,  79,  89,  32,  87,  72])
When input is tensor([84]), the target will be 72
When input is tensor([84, 72]), the target will be 69
When input is tensor([84, 72, 69]), the target will be 394
When input is tensor([ 84,  72,  69, 394]), the target will be 79
When input is tensor([ 84,  72,  69, 394,  79]), the target will be 89
When input is tensor([ 84,  72,  69, 394,  79,  89]), the target will be 32
When input is tensor([ 84,  72,  69, 394,  79,  89,  32]), the target will be 87
When input is tensor([ 84,  72,  69, 394,  79,  89,  32,  87]), the target will be 72


In [6]:
batch_size = 4
block_size = 8
def get_batch(data:torch.Tensor):
    # Generate random start indexs from 0 to len(data) - block_size 
    # Here minus block_size is to avoid going out of bound
    ix = torch.randint(len(data) - block_size, (batch_size,))

    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch(train_data)
print('input:')
print(xb.shape)
print(xb)
print('target:')
print(yb.shape)
print(yb)

input:
torch.Size([4, 8])
tensor([[291, 328, 373, 295, 108, 274, 101, 272],
        [304, 272,  72, 372, 114, 294, 312, 337],
        [105, 309,  32, 301, 152, 116, 257,  32],
        [101, 109, 335, 283, 115,  32,  33,  89]])
target:
torch.Size([4, 8])
tensor([[328, 373, 295, 108, 274, 101, 272,  79],
        [272,  72, 372, 114, 294, 312, 337, 388],
        [309,  32, 301, 152, 116, 257,  32,  87],
        [109, 335, 283, 115,  32,  33,  89, 101]])


In [7]:
from model import BiagramLanguageModel

blm = BiagramLanguageModel(vocab_size=vocab_size)
xb, yb = get_batch(train_data)
logits, loss = blm(xb, yb)
print(logits.shape)
print(loss)

idx = torch.zeros((1,1), dtype=torch.long)
print(token.decode(blm.generate(idx, max_new_tokens=100)[0].tolist()))

torch.Size([32, 400])
tensor(6.5313, grad_fn=<NllLossBackward0>)
 �v�esom� uer beĭ/f� his� an� kro hѼight� b���ceight�� with���:�agerq�L�le�� g� l�̺�Ped�’ct#Z P\s�ag5�red� wh��ܴ�1 re+) be�{� youheithCF n


In [8]:
sentences = "Hello, what's your name?"
inputs = token.encode(sentences)
print(token.decode((blm.generate(idx, max_new_tokens=30)[0].tolist())))

 �4ll me BH  andic�1 c wasut h andisYp�her���Ʉai\


In [9]:
from tqdm import tqdm

optimizer = torch.optim.AdamW(blm.parameters(), lr = 1e-3)

def get_batch(data:torch.Tensor, batch_size, block_size):
    # Generate random start indexs from 0 to len(data) - block_size 
    # Here minus block_size is to avoid going out of bound
    ix = torch.randint(len(data) - block_size, (batch_size,))

    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

def train(n_epochs, batch_size, block_size, 
          model:torch.nn.Module,
          optimizer:torch.optim.Optimizer):
    for step in tqdm(range(n_epochs), desc="Training progress"):
        # sample a batch of data
        xb, yb = get_batch(train_data, batch_size=4, block_size=8)

        # evaluate the loss
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    print(loss.item())

train(10000,
      batch_size=32,
      block_size=8,
      model=blm,
      optimizer=optimizer
      )

Training progress: 100%|██████████| 10000/10000 [00:05<00:00, 1953.28it/s]

4.184420108795166





In [None]:
sentences = "This ebook is for"
inputs = torch.tensor(token.encode(sentences)).unsqueeze(0)
print(token.decode((blm.generate(inputs, max_new_tokens=500)[0].tolist())))
print(blm.generate(inputs, max_new_tokens=500)[0].tolist())

This ebook is for�ipp o coMftervenirrut of as�'�*K�r bitrier�� herter8V�� m��� finut have be an�L u�cL�� m- lit��gh youddore͗ we:t him motis�� you to be BXNiceve sa me asked H n�ce hnd�uthing wif� him ?Lew継 kill in an q��\ fep up Harry sillgw m� trou animxyletr bill looking the oP� old con�et meet Harry Illow wh�id it w ve pre beway be he sus hes y:se hooday inst sa� rest���ސ�ter��䦼I�C`ropperm a sken hair�� moose on withk in Harryif to Pran th~�way neat Duf .I ?ll muppnt it their�j��*�K�� meithҶ3�he<�Į Peluse aclegain was������ shx !�amxchinitionum�� he8 be an� Sirels are A’B��v at fuded themion wh��_ son in s[U Pom on the scoar mean I dre dow j�ot he�ifri�_ce shabotore hą�oringsorfries� that do and saI CKj��s ont ne ce fe con� Sat befetly and P	or B��>~ l not to d# wh�z�� Dleeeed that�!Wat he�aced only tum�ve st*etheretidg wo may pas�ʊEDum���Siesd�S than
[84, 104, 278, 316, 98, 385, 32, 278, 355, 337, 357, 60, 309, 62, 69, 280, 265, 53, 320, 268, 272, 72, 387, 12, 21, 124

In [5]:
import torch.nn as nn
embed = nn.Embedding(5, 3)
# example
embed(torch.tensor([1,2,3,4,3,4]))

tensor([[-0.0759,  1.5938,  0.1153],
        [-0.3277, -0.3793,  0.3081],
        [-1.1024,  2.5051, -0.7540],
        [ 0.2020, -0.3995,  0.7628],
        [-1.1024,  2.5051, -0.7540],
        [ 0.2020, -0.3995,  0.7628]], grad_fn=<EmbeddingBackward0>)

In [7]:
q = torch.randn(4, 6, 8) # (B,T,C)
k = torch.randn(4, 6, 8) # (B,T,C)
C = k.shape[-1]
wei = q @ k.transpose(-2,-1) * C**-0.5 # (B,T,T)
wei

tensor([[[ 1.3294,  0.8022, -0.6855,  0.2571,  0.5200, -0.5199],
         [-0.2709, -1.1602, -0.1610, -1.6385, -0.9295,  0.3160],
         [-0.1445, -0.5552,  1.7330, -1.2333,  1.1522,  1.2377],
         [ 0.8068,  0.6399,  0.8289, -1.5993,  0.3871, -1.0086],
         [-0.7899, -0.9224, -0.0248, -1.0654, -0.1568, -0.5824],
         [ 1.6487,  1.4105,  0.3621, -1.4118,  0.0914, -0.7483]],

        [[ 0.7772,  0.5039, -0.3574,  0.2507,  0.0090,  0.9904],
         [-1.1812,  0.3805,  0.4643, -0.0598, -1.2028,  1.9621],
         [-0.0152, -1.9817,  0.1791,  0.5025, -0.2111,  0.5449],
         [ 1.1176,  0.9110, -0.7666, -1.6586, -1.0617,  0.8247],
         [-2.3466, -1.1788,  0.3733,  1.8958,  1.6801, -1.0998],
         [ 0.2486,  0.6390,  0.0359, -1.4515, -0.5020, -0.8609]],

        [[-1.3897,  1.1676,  2.1072, -0.4267,  1.2751,  0.0769],
         [-0.2471, -1.7468, -0.4100, -1.8616,  1.6966, -0.8573],
         [-1.2635,  0.2678,  0.2165,  0.9295, -0.7434,  0.4831],
         [ 0.7771,  3

In [44]:
import unicodedata
import os

def clean_text_nuclear(input_path, output_path):
    print(f"Reading {input_path}...")
    
    # Read the file (ignoring errors so it doesn't crash on bad encoding)
    with open(input_path, 'r', encoding='utf-8', errors='ignore') as f:
        text = f.read()

    original_len = len(text)

    # 1. Normalize Unicode (turns "café" into "cafe" + accent)
    text = unicodedata.normalize('NFKD', text)
    
    # 2. The Nuclear Option: Encode to ASCII, discarding anything that doesn't fit
    # This turns "Harry’s" into "Harrys" (or strips the quote if it's weird)
    # It removes ALL multi-byte characters.
    text = text.encode('ascii', 'ignore').decode('ascii')

    # 3. Optional: Restore basic punctuation if lost, but usually not needed for ASCII
    # (The encode('ascii') step is usually enough)

    new_len = len(text)
    print(f"Cleaned! Removed {original_len - new_len} bad characters.")
    
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(text)
    print(f"Saved clean file to: {output_path}")

# --- RUN IT ---
source_file = "./resources/harrypotter.txt"       # Your bad file
clean_file = "./resources/harrypotter_clean.txt"  # The new good file

clean_text_nuclear(source_file, clean_file)

Reading ./resources/harrypotter.txt...
Cleaned! Removed 480 bad characters.
Saved clean file to: ./resources/harrypotter_clean.txt


In [12]:
# Convert text to pure ASCII
with open("./resources/harrypotter.txt", 'r', encoding='utf-8') as f:
    text = f.read()

# Replace common multi-byte chars with single-byte equivalents
text = text.replace("’", "'")
text = text.replace("“", '"')
text = text.replace("”", '"')
text = text.replace("—", "-")
# Force convert the rest to ASCII, ignoring errors
text = text.encode("ascii", "ignore").decode("ascii")

with open("./resources/harrypotter_clean.txt", 'w', encoding='utf-8') as f:
    f.write(text)

print("Cleaned file saved to ./resources/harrypotter_clean.txt")

Cleaned file saved to ./resources/harrypotter_clean.txt


In [45]:
import pandas as pd

# Replace with your actual csv filename
csv_file = "./data/train.csv" 

# Read just the first 5 rows to see the columns
df = pd.read_csv(csv_file, nrows=5)
print("Columns found:", df.columns.tolist())

Columns found: ['text', 'meta']


In [None]:
import pandas as pd
from tqdm import tqdm
import os

# ---------------- CONFIGURATION ----------------
# Change this to match your actual file name
INPUT_CSV = "/media/danny/SuperMoose/Data/nanoGPT/train.csv"
OUTPUT_TXT = "/media/danny/SuperMoose/Data/nanoGPT/pile_clean.txt"
TEXT_COLUMN = "text"  
CHUNK_SIZE = 10000    # Reads 10,000 rows at a time (saves RAM)
# -----------------------------------------------

def normalize_text(text):
    """
    Cleans the text to prevent 'Unknown Character' errors.
    """
    if not isinstance(text, str):
        return ""
    
    # 1. Fix the specific characters seen in your snippet
    text = text.replace('“', '"').replace('”', '"') # Fix: “Survival” -> "Survival"
    text = text.replace("‘", "'").replace("’", "'") # Fix: There’s -> There's
    text = text.replace('—', '-')
    
    # 2. Force ASCII (The Nuclear Option)
    # This removes any other weird symbols (emojis, chinese chars, etc)
    # that your Bigram model cannot handle.
    return text.encode('ascii', 'ignore').decode('ascii')

def process_csv_to_txt():
    if not os.path.exists(INPUT_CSV):
        print(f"Error: Could not find {INPUT_CSV}")
        return

    # 1. Count total rows (just for the progress bar display)
    print("Counting total rows (this might take a moment)...")
    try:
        with open(INPUT_CSV, "r", encoding="utf-8", errors="ignore") as f:
            total_rows = sum(1 for _ in f) - 1
    except:
        total_rows = None # If counting fails, just proceed without total
        print("Could not count rows, proceeding anyway...")

    # 2. Process and Save
    print(f"Converting '{TEXT_COLUMN}' from CSV to TXT...")
    
    with open(OUTPUT_TXT, "w", encoding="utf-8") as f_out:
        # Read CSV in chunks using Pandas
        with pd.read_csv(INPUT_CSV, chunksize=CHUNK_SIZE, on_bad_lines='skip') as reader:
            
            pbar = tqdm(total=total_rows, desc="Processing")
            
            for chunk in reader:
                if TEXT_COLUMN not in chunk.columns:
                    print(f"Error: Column '{TEXT_COLUMN}' not found in this chunk!")
                    continue
                
                # Convert content to string and clean it
                clean_lines = chunk[TEXT_COLUMN].astype(str).apply(normalize_text)
                
                # Write to file (join documents with two newlines)
                f_out.write("\n\n".join(clean_lines) + "\n\n")
                
                pbar.update(len(chunk))
            
            pbar.close()

    print("\nSUCCESS!")
    print(f"Clean text saved to: {OUTPUT_TXT}")
    print("You can now train your tokenizer on this file.")

if __name__ == "__main__":
    process_csv_to_txt()

Counting total rows (this might take a moment)...
Converting 'text' from CSV to TXT...


Processing:   2%|▏         | 100000/6558358 [00:04<04:56, 21747.45it/s]


SUCCESS!
Clean text saved to: ./resources/pile_clean.txt
You can now train your tokenizer on this file.



