In [None]:
5# -------------------------------
# 1️⃣ Uninstall conflicting versions
# -------------------------------
!pip uninstall -y torch torchtext torchvision torchaudio numpy

# -------------------------------
# 2️⃣ Install compatible versions
# For vanilla RNN + IMDB
# torch 2.3.0, torchtext 0.18.0, torchvision/torchaudio matching
# numpy 1.26.4 (avoids PyTorch errors)
# -------------------------------
!pip install torch==2.3.0 torchtext==0.18.0 torchvision==0.18.0 torchaudio==2.3.0 numpy==1.26.4 --quiet

# -------------------------------
# 3️⃣ Restart runtime (required to load new versions)
# -------------------------------
import os
os.kill(os.getpid(), 9)  # This forces Colab to restart


Found existing installation: torch 2.8.0+cu126
Uninstalling torch-2.8.0+cu126:
  Successfully uninstalled torch-2.8.0+cu126
[0mFound existing installation: torchvision 0.23.0+cu126
Uninstalling torchvision-0.23.0+cu126:
  Successfully uninstalled torchvision-0.23.0+cu126
Found existing installation: torchaudio 2.8.0+cu126
Uninstalling torchaudio-2.8.0+cu126:
  Successfully uninstalled torchaudio-2.8.0+cu126
Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m94.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m107.5 MB/s[0m eta [36m0:00:00[0m
[2K   

In [3]:
# Mount Google Drive under /content/drive so files stored there can be read
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Copy the movie CSV out of the mounted Drive folder into /content/.
!cp "/content/drive/MyDrive/Tensorflow/movie.csv" /content/

In [5]:
import pandas as pd
# Read the CSV by relative path.
# NOTE(review): this assumes the working directory is /content (where the file was copied) — confirm.
df = pd.read_csv('movie.csv')


In [6]:
# Show the first rows to check the available columns.
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [42]:
import math, time, random
import numpy as np
import pandas as pd
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from torchtext.data.utils import get_tokenizer

In [43]:
# --- Reproducibility: seed every random number generator used in the notebook ---
r_seed = 42
random.seed(r_seed)
np.random.seed(r_seed)
torch.manual_seed(r_seed)

# --- Data ---
data_path = 'movie.csv'   # CSV file holding the corpus
text_col = 'Plot'         # column whose text is used for language modelling
batch_size = 32           # sequences per batch in the data loaders
block_size = 64           # context length passed to the model (size of its positional table)

# --- Model ---
embed_dim = 128           # embedding width
num_head = 8              # attention heads per block (embed_dim must be divisible by it)
num_layers = 2            # number of decoder blocks
ff_dim = embed_dim * 4    # hidden width of each block's feed-forward part
dropout = 0.1

# --- Optimisation and logging ---
lr = 3e-4
epochs = 10
print_every = 200         # log / validate every this many optimizer steps

# Prefer the GPU when one is visible to PyTorch.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [44]:
# Report which device (cuda or cpu) was selected in the config cell.
print(device)

cuda


In [45]:
# Reload the CSV through the configured path and extract the text column as a
# plain list of Python strings.
df = pd.read_csv(data_path)
# NOTE(review): astype(str) would turn a missing value into the literal text
# 'nan' — confirm the column has no nulls, or drop them first.
texts = df[text_col].astype(str).tolist()


In [46]:
# Print the first five texts as a sanity check of the loaded corpus.
new = texts[: 5]
print(new)

["A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]", "The moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything better.", 'The film, just over a minute long, is composed of two

In [47]:
# torchtext's 'basic_english' tokenizer; called as tokenizer(text) -> list of tokens.
tokenizer = get_tokenizer('basic_english')

In [48]:
def build_vocab(texts, min_freq = 1, max_vocab = None):
  """Build token<->id lookup tables from an iterable of texts.

  Every text is split with the module-level ``tokenizer``. Tokens seen at
  least ``min_freq`` times are kept in descending frequency order and, when
  ``max_vocab`` is truthy, truncated to the ``max_vocab`` most frequent ones.
  The specials ``'<pad>'`` and ``'<unk>'`` always occupy ids 0 and 1.

  Returns:
    (stoi, itos): ``itos`` is the list of tokens indexed by id, ``stoi`` the
    reverse mapping from token to id.
  """
  frequencies = Counter(token for text in texts for token in tokenizer(text))

  # most_common() yields (token, count) pairs, most frequent first.
  kept = [token for token, count in frequencies.most_common() if count >= min_freq]
  if max_vocab:
    kept = kept[:max_vocab]

  itos = ['<pad>', '<unk>', *kept]
  stoi = dict(zip(itos, range(len(itos))))
  return stoi, itos

In [49]:
# Build the vocabulary over the whole corpus, capped at the 30000 most
# frequent tokens (the two special entries come on top of that cap).
stoi, itos = build_vocab(texts, min_freq = 1, max_vocab = 30000)
vocab_size = len(itos)
# Ids of the special entries that build_vocab puts at the front of the vocabulary.
pad_idx = stoi['<pad>']
unk_idx = stoi['<unk>']
print("Vocab size:", vocab_size)

Vocab size: 30002


In [50]:
sep_token = '<sep>'

# Give the text separator its own id at the end of the vocabulary. The guard
# keeps the cell safe to re-run: the token is only registered once.
if sep_token not in stoi:
  itos.append(sep_token)
  stoi[sep_token] = len(itos) - 1
  vocab_size += 1

# Flatten the corpus into one long id stream: the ids of each text (unknown
# tokens mapped to unk_idx) followed by the separator id.
all_tokens = []
for t in texts:
  toks = tokenizer(t)
  ids = [stoi.get(tok, unk_idx) for tok in toks] + [stoi[sep_token]]
  all_tokens += ids

print('Total tokens length : ', len(all_tokens))

Total tokens length :  15115524


In [51]:
class TokenDataset(Dataset):
    """Fixed-length next-token-prediction windows over a flat token sequence.

    Item ``i`` is a pair ``(x, y)`` of ``torch.long`` tensors, both of length
    ``block_size``; ``y`` holds the same window as ``x`` shifted one position
    to the right in the token sequence.

    Args:
        token_list: Sequence of integer token ids (needs ``len`` and slicing).
        block_size: Number of tokens in every input / target window.
        stride: Distance between consecutive window starts. Defaults to
            ``block_size``, i.e. non-overlapping windows.
    """

    def __init__(self, token_list, block_size, stride=None):
        self.tokens = token_list
        self.block_size = block_size
        self.stride = stride if stride is not None else block_size

        # A window starting at `s` reads tokens[s : s + block_size] for x and
        # tokens[s + 1 : s + 1 + block_size] for y, so it needs
        # block_size + 1 tokens. Starts are therefore limited to
        # s < len(tokens) - block_size; allowing s == len(tokens) - block_size
        # would give a target one element shorter than its input.
        self.starts = list(range(0, max(0, len(self.tokens) - self.block_size), self.stride))
        self.num_blocks = len(self.starts)

    def __len__(self):
        """Number of complete (x, y) windows available."""
        return self.num_blocks

    def __getitem__(self, i):
        """Return the ``i``-th input window and its one-step-shifted target."""
        idx = self.starts[i]
        x = torch.tensor(self.tokens[idx : idx + self.block_size], dtype=torch.long)
        y = torch.tensor(self.tokens[idx + 1 : idx + 1 + self.block_size], dtype=torch.long)
        return x, y


In [53]:
# Window length and stride come from the configured block_size (the same value
# the model is built with), so changing it in the config cell keeps the data
# and the model in agreement. stride == block_size gives non-overlapping windows.
dataset = TokenDataset(all_tokens, block_size=block_size, stride=block_size)

# Hold out 1% of the windows for validation; the split is random.
train_size = int(0.99 * len(dataset))
val_size = len(dataset) - train_size
train_ds, val_ds = torch.utils.data.random_split(dataset, [train_size, val_size])

# drop_last=True keeps every batch at exactly batch_size sequences (a trailing
# partial batch is discarded in both loaders).
train_loader = DataLoader(train_ds, batch_size = batch_size, shuffle = True, drop_last = True)
val_loader = DataLoader(val_ds, batch_size = batch_size, shuffle = False, drop_last = True)

print("Train blocks :", len(train_ds), "Val blocks :", len(val_ds))

Train blocks : 233818 Val blocks : 2362


In [54]:
class CasualMultiHeadSelfAttention(nn.Module):
  """Multi-head scaled dot-product self-attention, causal by default.

  Args:
    embed_dim: Width of the input and output features; must be divisible by
      ``num_heads``.
    num_heads: Number of attention heads (each of width ``embed_dim // num_heads``).
    dropout: Dropout probability applied to the attention weights.
  """

  def __init__(self, embed_dim, num_heads, dropout = 0.0):
    super().__init__()
    assert embed_dim % num_heads == 0
    self.embed_dim = embed_dim
    self.num_heads = num_heads
    self.head_dim = embed_dim // num_heads

    self.W_q = nn.Linear(embed_dim, embed_dim)
    self.W_k = nn.Linear(embed_dim, embed_dim)
    self.W_v = nn.Linear(embed_dim, embed_dim)
    self.out = nn.Linear(embed_dim, embed_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, attn_mask = None):
    """Attend over the sequence dimension of ``x``.

    Args:
      x: Tensor of shape (B, L, D) with D == embed_dim.
      attn_mask: Optional boolean mask that is unsqueezed twice and applied to
        the (B, heads, L, L) score tensor; ``True`` marks positions that must
        NOT be attended to. When omitted, a causal mask is used (position i
        cannot see positions j > i). A supplied mask replaces the causal mask,
        it is not combined with it. A row that is masked everywhere produces
        NaN weights after the softmax.

    Returns:
      (out, attn): ``out`` has shape (B, L, D); ``attn`` holds the attention
      weights (after dropout) with shape (B, heads, L, L).
    """
    B, L, D = x.shape
    # Project, then split the feature axis into heads: (B, heads, L, head_dim).
    Q = self.W_q(x).view(B, L, self.num_heads, self.head_dim).transpose(1,2)
    K = self.W_k(x).view(B, L, self.num_heads, self.head_dim).transpose(1,2)
    V = self.W_v(x).view(B, L, self.num_heads, self.head_dim).transpose(1,2)

    scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)

    if attn_mask is None:
      # Strict upper triangle == future positions.
      attn_mask = torch.triu(torch.ones(L, L, device = x.device), diagonal = 1).bool()

    # Add the batch and head axes so the mask broadcasts over both.
    scores = scores.masked_fill(attn_mask.unsqueeze(0).unsqueeze(0), float('-inf'))

    attn = torch.softmax(scores, dim = -1)
    attn = self.dropout(attn)
    out = torch.matmul(attn, V)
    # Merge the heads back into a single feature axis.
    out = out.transpose(1, 2).contiguous().view(B, L, D)
    out = self.out(out)
    return out, attn

In [55]:
class DecoderBlock(nn.Module):
  """One Transformer decoder layer: self-attention, then a feed-forward part.

  Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``, i.e.
  the normalisation is applied after the residual addition. The feed-forward
  stack ends in its own Dropout, so its output passes through two dropout
  layers before being added back.

  Args:
    embed_dim: Feature width of the block's input and output.
    num_heads: Number of attention heads.
    ff_dim: Hidden width of the feed-forward stack.
    dropout: Dropout probability shared by all dropout layers in the block.
  """

  def __init__(self, embed_dim, num_heads, ff_dim, dropout = 0.1):
    super().__init__()
    # Sub-modules are created in a fixed order so that parameter names and
    # seeded initialisation stay stable.
    self.attn = CasualMultiHeadSelfAttention(embed_dim, num_heads, dropout = dropout)
    self.ln1 = nn.LayerNorm(embed_dim)
    feed_forward_layers = [
        nn.Linear(embed_dim, ff_dim),
        nn.GELU(),
        nn.Linear(ff_dim, embed_dim),
        nn.Dropout(dropout),
    ]
    self.ff = nn.Sequential(*feed_forward_layers)
    self.ln2 = nn.LayerNorm(embed_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Return ``(y, attn_weights)`` where ``y`` has the same shape as ``x``."""
    attended, weights = self.attn(x)
    x = self.ln1(x + self.dropout(attended))
    x = self.ln2(x + self.dropout(self.ff(x)))
    return x, weights

In [56]:
class TinyGpt(nn.Module):
  """Small decoder-only Transformer language model.

  Token embeddings plus learned position embeddings are passed through
  ``num_layers`` ``DecoderBlock``s, a final LayerNorm and a bias-free linear
  head that produces one logit per vocabulary entry.

  Args:
    vocab_size: Number of token ids (size of the embedding table and of the
      output logits).
    embed_dim: Embedding / hidden width.
    num_heads: Attention heads per block.
    num_layers: Number of decoder blocks.
    ff_dim: Hidden width of each block's feed-forward part.
    block_size: Maximum sequence length (size of the position table).
    dropout: Dropout probability passed to every block.
    pad_idx: Token id used as ``padding_idx`` of the token embedding.
  """

  def __init__(self, vocab_size, embed_dim, num_heads, num_layers, ff_dim, block_size, dropout = 0.1, pad_idx = 0):
    super().__init__()
    self.token_emb = nn.Embedding(vocab_size, embed_dim, padding_idx = pad_idx)
    self.pos_emb = nn.Embedding(block_size, embed_dim)
    self.blocks = nn.ModuleList([DecoderBlock(embed_dim, num_heads, ff_dim, dropout) for _ in range(num_layers)])
    self.ln_f = nn.LayerNorm(embed_dim)
    self.head = nn.Linear(embed_dim, vocab_size, bias = False)
    self.block_size = block_size

  def forward(self, idx):
    """Compute next-token logits.

    Args:
      idx: Long tensor of token ids with shape (B, L), L <= block_size.

    Returns:
      (logits, attn_maps): ``logits`` has shape (B, L, vocab_size);
      ``attn_maps`` is a list with the attention weights returned by each
      block, in block order.
    """
    B, L = idx.shape
    assert L <= self.block_size, f"sequence length {L} exceeds block_size {self.block_size}"
    token_embeddings = self.token_emb(idx)
    # Positions 0..L-1, shared by every sequence in the batch.
    pos_ids = torch.arange(0, L, dtype = torch.long, device = idx.device).unsqueeze(0)
    pos_embeddings = self.pos_emb(pos_ids)
    x = token_embeddings + pos_embeddings
    attn_maps = []
    for block in self.blocks:
      x, attn = block(x)
      attn_maps.append(attn)
    x = self.ln_f(x)
    logits = self.head(x)
    return logits, attn_maps


  @torch.no_grad()
  def generate(self, idx, max_new_tokens, temperature = 1.0, top_k = None):
    """Autoregressively sample ``max_new_tokens`` tokens after the prompt ``idx``.

    Sampling always runs in eval mode (dropout disabled); the module's
    previous train/eval mode is restored afterwards.

    Args:
      idx: Long tensor of prompt token ids, shape (B, T).
      max_new_tokens: Number of tokens to append.
      temperature: Logits are divided by this value before the softmax; a
        value <= 0 is treated as 1.0.
      top_k: When given and positive, sampling is restricted to the ``top_k``
        highest logits (clamped to the vocabulary size). ``None`` or a
        non-positive value disables the filter.

    Returns:
      Long tensor of shape (B, T + max_new_tokens): the prompt followed by
      the sampled tokens.
    """
    was_training = self.training
    self.eval()
    try:
      for _ in range(max_new_tokens):
        # Only the last block_size tokens fit into the position table.
        idx_cond = idx[:, -self.block_size:]
        logits, _ = self.forward(idx_cond)
        logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
        if top_k is not None and top_k > 0:
          # torch.topk raises when k exceeds the vocabulary size.
          k = min(top_k, logits.size(-1))
          v, _ = torch.topk(logits, k)
          min_v = v[:, -1].unsqueeze(-1)
          logits = torch.where(logits < min_v, torch.full_like(logits, -float('Inf')), logits)
        probs = F.softmax(logits, dim = -1)
        next_token = torch.multinomial(probs, num_samples = 1)
        idx = torch.cat([idx, next_token], dim = 1)
    finally:
      self.train(was_training)
    return idx

In [57]:
# Instantiate the network from the config-cell hyper-parameters, then move it
# to the selected device.
model = TinyGpt(
    vocab_size = vocab_size,
    embed_dim = embed_dim,
    num_heads = num_head,
    num_layers = num_layers,
    ff_dim = ff_dim,
    block_size = block_size,
    dropout = dropout,
    pad_idx = pad_idx,
)
model = model.to(device)

# AdamW with weight decay over all parameters; token-level cross-entropy loss.
optimizer = torch.optim.AdamW(model.parameters(), lr = lr, weight_decay=1e-2)
criterion = nn.CrossEntropyLoss()

# Parameter count, reported in millions.
print("Model params :", sum(p.numel() for p in model.parameters()) / 1e6, "M")

Model params : 8.08576 M


In [58]:
def estimate_loss():
  """Return the mean per-batch cross-entropy of ``model`` over ``val_loader``.

  The model is switched to eval mode for the pass, gradients are disabled,
  and the model is put back into train mode before returning. Uses the
  module-level ``model``, ``val_loader``, ``device``, ``criterion`` and
  ``vocab_size``.
  """
  model.eval()
  batch_losses = []
  with torch.no_grad():
    for inputs, targets in val_loader:
      inputs, targets = inputs.to(device), targets.to(device)
      logits, _ = model(inputs)
      # Flatten (B, L, V) logits and (B, L) targets to per-token rows.
      batch_loss = criterion(logits.view(-1, vocab_size), targets.view(-1))
      batch_losses.append(batch_loss.item())
  model.train()
  return float(np.mean(batch_losses))

# Train for `epochs` passes over train_loader. Every `print_every` optimizer
# steps the running train loss and the validation loss are logged; after each
# epoch the weights are saved when the validation loss improved.
best_val = 1e9
global_step = 0
model.train()  # make sure dropout is active regardless of what ran before this cell
for epoch in range(1, epochs + 1):
  t0 = time.time()
  running_loss = 0.0
  # Number of steps accumulated in running_loss. It is tracked separately
  # because running_loss is reset at every epoch start while global_step keeps
  # counting, so the first log line of a later epoch covers fewer than
  # print_every steps.
  running_steps = 0
  for it, (x,y) in enumerate(train_loader, 1):
    x = x.to(device); y = y.to(device)
    optimizer.zero_grad()
    logits, _ = model(x)
    # Flatten (B, L, V) logits and (B, L) targets to per-token rows.
    loss = criterion(logits.view(-1, vocab_size), y.view(-1))
    loss.backward()
    # Clip the global gradient norm to 1.0 before the update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    running_loss += loss.item()
    running_steps += 1
    global_step += 1

    if global_step % print_every == 0:
      val_loss = estimate_loss()
      print(f'Epoch {epoch} | Step {global_step} | train_loss {running_loss/running_steps:.4f} | val loss {val_loss:.4f}')
      running_loss = 0.0
      running_steps = 0

  val_loss = estimate_loss()
  print(f'Epoch {epoch} finished in {time.time()-t0:.1f}s | val loss = {val_loss:.4f}')
  if val_loss < best_val:
    best_val = val_loss
    torch.save(model.state_dict(), 'tiny_gpt.pt')
    print("Saved best model.")

Epoch 1 | Step 200 | train_loss 7.4280 | val loss 6.5610
Epoch 1 | Step 400 | train_loss 6.4290 | val loss 6.2586
Epoch 1 | Step 600 | train_loss 6.1976 | val loss 6.0737
Epoch 1 | Step 800 | train_loss 6.0410 | val loss 5.9491
Epoch 1 | Step 1000 | train_loss 5.9258 | val loss 5.8595
Epoch 1 | Step 1200 | train_loss 5.8585 | val loss 5.7877
Epoch 1 | Step 1400 | train_loss 5.8000 | val loss 5.7335
Epoch 1 | Step 1600 | train_loss 5.7480 | val loss 5.6863
Epoch 1 | Step 1800 | train_loss 5.7113 | val loss 5.6455
Epoch 1 | Step 2000 | train_loss 5.6748 | val loss 5.6101
Epoch 1 | Step 2200 | train_loss 5.6466 | val loss 5.5783
Epoch 1 | Step 2400 | train_loss 5.6098 | val loss 5.5463
Epoch 1 | Step 2600 | train_loss 5.5804 | val loss 5.5191
Epoch 1 | Step 2800 | train_loss 5.5659 | val loss 5.4946
Epoch 1 | Step 3000 | train_loss 5.5384 | val loss 5.4715
Epoch 1 | Step 3200 | train_loss 5.5217 | val loss 5.4501
Epoch 1 | Step 3400 | train_loss 5.5009 | val loss 5.4301
Epoch 1 | Step 360

In [66]:
user = "David was a farmer who loved his family more than anything"

# Encode the prompt with the same tokenizer / vocabulary used for training;
# tokens outside the vocabulary map to unk_idx.
prompt_token = tokenizer(user)
prompt_id = torch.tensor([[stoi.get(t, unk_idx) for t in prompt_token]], dtype=torch.long).to(device)

print('Prompt:', user)

# Sample with dropout switched off, then restore whatever mode the model was in
# (the training cell leaves it in train mode).
was_training = model.training
model.eval()
out_idx = model.generate(prompt_id, max_new_tokens=50, temperature=1.0, top_k=50)
model.train(was_training)

# Decode token IDs to text. This is done inline through itos so the cell does
# not depend on a `decode` helper that is not defined anywhere in the notebook.
generated_text = " ".join(itos[i] for i in out_idx[0].cpu().tolist())

# ----- CLEANING -----
generated_text = generated_text.replace("<unk>", "")
generated_text = generated_text.replace("<sep>", "")
generated_text = " ".join(generated_text.split())   # remove double spaces

# ----- PRINT SENTENCE BY SENTENCE -----
sentences = generated_text.split('.')
print("\nGenerated Text:\n")
for s in sentences:
    s = s.strip()
    if s:
        print(s + ".\n")


Prompt: David was a farmer who loved his family more than anything

Generated Text:

david was a farmer who loved his family more than anything and not wanting to do anything to pay it.

in the , jim is the one who won ' t win the first time , he was going back.

when the local gangster ' s car arrives , george ' s partner ' s body was left.

