# Step 1: Data Loading and Preprocessing

## 1.1 | Imports & Setup

In [6]:
import pandas as pd
from pathlib import Path
import json
import numpy as np
import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))
from tensorflow.keras import layers, models
from pathlib import Path
from tensorflow.keras.callbacks import ModelCheckpoint, BackupAndRestore, EarlyStopping
from google.colab import drive
import glob, os, re
from pathlib import Path

[]


## 1.2 | Load the Raw Dataset

In [7]:
!pip install gdown
!gdown --id 1pS1hl9Iw5Y1jaFIQvQjzMyH-7P_2mQpF --output freecodecamp_casual_chatroom.csv

data_path = Path("freecodecamp_casual_chatroom.csv")
df = pd.read_csv(data_path)

# Quick sanity-check
display(df.head())


Downloading...
From (original): https://drive.google.com/uc?id=1pS1hl9Iw5Y1jaFIQvQjzMyH-7P_2mQpF
From (redirected): https://drive.google.com/uc?id=1pS1hl9Iw5Y1jaFIQvQjzMyH-7P_2mQpF&confirm=t&uuid=ad76c0e2-5e74-4a42-9cbe-04d232291dde
To: /content/freecodecamp_casual_chatroom.csv
100% 2.69G/2.69G [00:33<00:00, 79.3MB/s]


  df = pd.read_csv(data_path)


Unnamed: 0.1,Unnamed: 0,editedAt,fromUser,fromUser.avatarUrl,fromUser.avatarUrlMedium,fromUser.avatarUrlSmall,fromUser.displayName,fromUser.gv,fromUser.id,fromUser.removed,...,issues,mentions,meta,readBy,sent,status,text,unread,urls,v
0,0,,,,https://avatars2.githubusercontent.com/u/21996...,https://avatars2.githubusercontent.com/u/21996...,Charles Watson,3.0,546fd99cdb8155e6700d6ec8,,...,[],[],[],22,2014-12-31T23:01:35.647Z,,no legumes either,False,[],1
1,1,,,,https://avatars0.githubusercontent.com/u/10361...,https://avatars0.githubusercontent.com/u/10361...,janetwalters008,,54a47e0cdb8155e6700e486e,,...,[],[],[],21,2014-12-31T23:02:51.600Z,,That bullet proof coffee sounds insane.,False,[],1
2,2,,,,https://avatars0.githubusercontent.com/u/10361...,https://avatars0.githubusercontent.com/u/10361...,janetwalters008,,54a47e0cdb8155e6700e486e,,...,[],[],[],21,2014-12-31T23:03:14.221Z,,That guy has huge eyes.,False,[],1
3,3,,,,https://avatars2.githubusercontent.com/u/21996...,https://avatars2.githubusercontent.com/u/21996...,Charles Watson,3.0,546fd99cdb8155e6700d6ec8,,...,[],"[{'userIds': [], 'userId': '54a47e0cdb8155e670...",[],22,2014-12-31T23:03:20.182Z,,@janetwalters008 It is. but it works. some peo...,False,[],1
4,4,,,,https://avatars0.githubusercontent.com/u/42194...,https://avatars0.githubusercontent.com/u/42194...,Paul Gilliam,3.0,54a44bbbdb8155e6700e47de,,...,[],[],[],22,2014-12-31T23:03:38.388Z,,They guy that came up with the idea is kind of...,False,[],1


## 1.3 | Basic Cleaning

#### Keep only non-bot rows, drop missing messages, and lowercase everything.


In [8]:
# 1.3-a) identify bot rows  ➜  tweak the substring as needed
bot_mask = (
    df["fromUser.username"]
      .fillna("")                     # make sure NaNs don’t error out
      .str.contains("bot", case=False)
)

# 1.3-b) extract & clean the messages
messages = (
    df.loc[~bot_mask, "text"]         # keep rows that are NOT bots
      .dropna()                       # remove NaNs
      .str.lower()                    # normalize case
)

print(f"Kept {len(messages):,} messages after bot-filtering")


Kept 4,615,073 messages after bot-filtering


## 1.4 | Build One Large Corpus String
#### Join all cleaned messages into a single text blob—handy for character-level work.

In [9]:
corpus_text = "\n".join(messages.tolist())
print(f"Corpus length (characters): {len(corpus_text):,}")


Corpus length (characters): 273,562,174


## 1.5 | Extract Unique Characters
#### Create a sorted list of every character that appears at least once.

In [10]:
unique_chars = sorted(set(corpus_text))
print(f"Detected {len(unique_chars)} unique characters:")
print(unique_chars[:120])   # preview first few


Detected 4797 unique characters:
['\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\x08', '\t', '\n', '\x0b', '\x0c', '\r', '\x0e', '\x0f', '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17', '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\x7f', '\x81', '\x8d', '\x90', '\x92', '\x95', '\xa0', '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', 'ª', '«', '¬', '\xad']


## 1.6 | Create Char ⇄ Index Mappings
#### Two dictionaries: char_indices (char→int) and indices_char (int→char).

In [11]:
char_indices = {ch: i for i, ch in enumerate(unique_chars)}
indices_char = {i: ch for ch, i in char_indices.items()}


# Step 2: Sequence Generation and Vectorization

## 2.1 | Choose Hyper-parameters maxlen & step
#### ``maxlen`` is the length (in characters) of each training sequence;
#### ``step`` controls how far the sliding window moves.
#### Feel free to tweak; 40/3 is a classic starting point for chat-like data.

In [12]:
maxlen      = 40        # keep or tweak as you like
step        = 8         # larger stride ⇒ ~⅓ training samples vs. step=3
batch_size  = 4096      # on a Colab T4/V100 this still fits easily (< 3 GB)

print(f"Config  • maxlen={maxlen}  • step={step}  • batch_size={batch_size}")


Config  • maxlen=40  • step=8  • batch_size=4096


## 2.2 | Memory-Friendly Data Generator
#### Build a *tf.keras.utils.Sequence* that yields one batch at a time ― nothing huge stays in RAM.

In [13]:
class CharSequenceGenerator(tf.keras.utils.Sequence):
    """
    Generates (X, y) batches for character-level language modelling.
    X shape: (batch_size, maxlen)   dtype uint16
    y shape: (batch_size,)          dtype uint16
    """
    def __init__(self, text, char_indices, maxlen, step, batch_size=1024):
        self.text         = text
        self.char_indices = char_indices
        self.maxlen       = maxlen
        self.step         = step
        self.batch_size   = batch_size

        # -- pre-compute all start positions once
        self.starts = np.arange(0, len(text) - maxlen, step, dtype=np.int32)

    def __len__(self):
        return int(np.ceil(len(self.starts) / self.batch_size))

    def __getitem__(self, idx):
        batch_starts = self.starts[
            idx * self.batch_size : (idx + 1) * self.batch_size]

        X = np.zeros((len(batch_starts), self.maxlen), dtype=np.uint16)
        y = np.zeros((len(batch_starts),),              dtype=np.uint16)

        for i, s in enumerate(batch_starts):
            window     = self.text[s : s + self.maxlen]
            next_char  = self.text[s + self.maxlen]

            X[i] = [self.char_indices[ch] for ch in window]
            y[i] = self.char_indices[next_char]

        return X, y


## 2.3 | Instantiate the Generator
#### Creates ~zero extra RAM; ready for Keras fit().


In [14]:
train_gen = CharSequenceGenerator(
    text         = corpus_text,
    char_indices = char_indices,
    maxlen       = maxlen,
    step         = step,
    batch_size   = batch_size
)

print(f"✅  train_gen ready → {len(train_gen):,} batches per epoch")

✅  train_gen ready → 8,349 batches per epoch


## 2.4 | Sanity-check Shapes & Preview with one batch
#### Confirm dimensions and peek at one encoded sample.

In [15]:
Xb, yb = train_gen[0]
print("Batch shapes :", Xb.shape, yb.shape)      # expect (batch_size, maxlen)

sample_text = ''.join(indices_char[int(i)] for i in Xb[0][:80])
print("First sequence :", repr(sample_text))
print("Next-char label:", indices_char[int(yb[0])])


Batch shapes : (4096, 40) (4096,)
First sequence : 'no legumes either\nthat bullet proof coff'
Next-char label: e


# Step 3: Implement, Train, and Save the LSTM Model
#### A compact character-level LSTM with an embedding, two stacked LSTMs, dropout, and a softmax output.

## 3.1 | Model Architecture
#### A compact character-level LSTM with an embedding, two stacked LSTMs, dropout, and a softmax output.

In [16]:
vocab_size  = len(unique_chars)
embed_dim   = 128        # bigger than before
lstm_units  = 256        # enough capacity, still fits in GPU RAM

model = models.Sequential([
    layers.Input(shape=(maxlen,)),                 # (batch, maxlen)
    layers.Embedding(vocab_size, embed_dim),       # (batch, maxlen, embed)

    layers.LSTM(lstm_units, return_sequences=True),
    layers.Dropout(0.25),

    layers.LSTM(lstm_units),
    layers.Dropout(0.25),

    layers.Dense(vocab_size, activation="softmax") # (batch, vocab)
])

model.summary()


## 3.2 | Compile the Model
Use *Adam*, *sparse_categorical_crossentropy*, and *track accuracy*.

In [17]:
model.compile(
    optimizer = tf.keras.optimizers.Adam(),
    loss      = "sparse_categorical_crossentropy",
    metrics   = ["accuracy"]
)


## 3.3 | Training Callbacks
#### Save weights after each epoch & stop early if val loss stops improving.

In [18]:
# make sure a directory exists
Path("checkpoints").mkdir(exist_ok=True)

# (A) save **full** model after every epoch
full_ckpt_cb = ModelCheckpoint(
    filepath          = "checkpoints/lstm_char_epoch{epoch:02d}.keras",
    save_weights_only = False,          # <-- FULL model
    save_best_only    = False,          # keep every epoch
    monitor           = "loss",         # use "val_loss" if you have a val-set
    verbose           = 1
)

# (B) optional safety net: auto-resume if Colab disconnects
backup_cb = BackupAndRestore(
    backup_dir = "checkpoints/backup",  # stores periodic snapshots
    save_freq  = "epoch"
)

# (C) early stopping (unchanged)
earlystop_cb = EarlyStopping(
    monitor              = "loss",
    patience             = 3,
    restore_best_weights = True
)


drive.mount('/content/drive')

ckpt_dir = '/content/drive/MyDrive/char_lstm_ckpts'

checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath = f"{ckpt_dir}/lstm_epoch{{epoch:02d}}.keras",
    save_weights_only = False
)


ValueError: mount failed

## 3.4 | Train 🚀
#### Fits directly on the memory-friendly generator.


In [None]:
# ------------------------------------------------------------------
# ⬇️ 1) Toggle: set RESUME = True if you want to continue training
RESUME = True            # ← change to True to resume from latest .keras
# ------------------------------------------------------------------

CHECKPOINT_DIR = Path("checkpoints")
CHECKPOINT_DIR.mkdir(exist_ok=True)

# ------------------------------------------------------------------
# find the latest .keras file (e.g., lstm_char_epoch07.keras)
latest_ckpt = None
initial_epoch = 0

if RESUME:
    ckpt_files = sorted(
        glob.glob(str(CHECKPOINT_DIR / "lstm_char_epoch*.keras"))
    )
    if ckpt_files:
        latest_ckpt = ckpt_files[-1]
        # extract epoch number with regex
        m = re.search(r"epoch(\d+)\.keras", latest_ckpt)
        initial_epoch = int(m.group(1)) if m else 0
        print(f"🔄  Resuming from {latest_ckpt} (initial_epoch={initial_epoch})")
        model = tf.keras.models.load_model(latest_ckpt)
    else:
        print("⚠️  No .keras checkpoint found — starting fresh")

# ------------------------------------------------------------------
# training
TOTAL_EPOCHS = 10   # final epoch index you want to reach

history = model.fit(
    train_gen,
    epochs        = TOTAL_EPOCHS,
    initial_epoch = initial_epoch,
    callbacks     = [full_ckpt_cb, backup_cb, earlystop_cb]
)


## 3.5 | Save Final Weights & Model
#### Weights (.h5) and the entire model (.keras) for easy reload later.

In [None]:
model.save_weights("lstm_char_final_weights.h5")
model.save("lstm_char_model.keras")   # full model (arch + weights)

print("Weights saved to lstm_char_final_weights.h5")
print("Full model saved to lstm_char_model.keras")
