In [1]:
# Download Tiny Shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


--2026-01-04 13:48:21--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2026-01-04 13:48:21 (7.29 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [2]:
# Load the full corpus into memory as a single string
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Report the corpus size and preview its first 500 characters
print("Total characters in dataset:", len(text))
print("\nSample text:\n")
print(text[:500])


Total characters in dataset: 1115394

Sample text:

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [3]:
# Vocabulary: every distinct character in the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

print("Vocabulary size:", vocab_size)
print("Characters:", chars)


Vocabulary size: 65
Characters: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [4]:
# Lookup tables: integer id -> character (itos) and character -> integer id (stoi).
# Ids are the positions of the characters in `chars`.
itos = dict(enumerate(chars))
stoi = {ch: idx for idx, ch in itos.items()}


In [6]:
import torch

# Map the whole corpus to integer ids once, up front.
char_ids = torch.tensor([stoi[ch] for ch in text], dtype=torch.long)

# Encode each adjacent pair (first, second) as the flat index
# first * vocab_size + second, so a single bincount tallies every bigram.
# This avoids a Python-level loop that writes one tensor element per character pair.
flat_pairs = char_ids[:-1] * vocab_size + char_ids[1:]
bigram_counts = (
    torch.bincount(flat_pairs, minlength=vocab_size * vocab_size)
    .view(vocab_size, vocab_size)   # row = first character, column = second character
    .to(torch.int32)                # bincount returns int64; store counts as int32
)

bigram_counts


tensor([[ 7223,     0,     0,  ...,     0,    66,     0],
        [    2,    16,     0,  ...,     0,  5140,    10],
        [ 1229,   879,     0,  ...,     0,     0,     0],
        ...,
        [    4,    36,     0,  ...,     0,     2,     0],
        [  396, 10283,   151,  ...,     0,     0,     0],
        [    0,     2,     1,  ...,     0,     5,    11]], dtype=torch.int32)

In [7]:
# Turn counts into conditional probabilities: divide every row by its total,
# so each row (one per first character) describes the distribution of the next character.
counts_f = bigram_counts.to(torch.float32)
row_totals = counts_f.sum(dim=1, keepdim=True)  # keepdim so the division broadcasts per row
bigram_probs = counts_f / row_totals


In [8]:
# Sanity check: the probabilities in the row for 'a' should add up to 1
row_a = bigram_probs[stoi['a']]
row_a.sum()


tensor(1.)

In [9]:
import random

def generate_text(start_char, length=500, generator=None):
    """Sample text from the bigram model, one character at a time.

    Starting from ``start_char``, repeatedly looks up the current character's
    row in the module-level ``bigram_probs`` table and draws the next
    character from that row with ``torch.multinomial``.

    Args:
        start_char: First character of the output; must be a key of ``stoi``
            whenever ``length`` is greater than 0.
        length: Number of characters to sample after ``start_char``.
        generator: Optional ``torch.Generator`` forwarded to
            ``torch.multinomial``. Pass a seeded generator to get a
            reproducible sample; ``None`` uses torch's default RNG.

    Returns:
        A string made of ``start_char`` followed by ``length`` sampled
        characters.

    Raises:
        KeyError: If a character to look up is not in ``stoi``.
    """
    # Collect characters in a list and join once, instead of growing a string with +=.
    pieces = [start_char]
    current_char = start_char

    for _ in range(length):
        probs = bigram_probs[stoi[current_char]]

        # Draw the index of the next character from the current character's row
        next_index = torch.multinomial(probs, num_samples=1, generator=generator).item()

        current_char = itos[next_index]
        pieces.append(current_char)

    return "".join(pieces)


In [10]:
# Seed both RNGs so Restart & Run All reproduces the same sample:
# `random` picks the start character, torch drives the multinomial sampling.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Start with a random character
start_char = random.choice(chars)

generated_text = generate_text(start_char, length=600)
print(generated_text)


enen ind s, d wndobrelly s ir laror he le

BRI we qu hinthan ok m wheliry!
AUMy cail s p semprker'd menoulotha m, t mave fffrakelllouegurrdwichefo mmat oe cll we te g bler oominene s yofreasuted cthikso:

Tht VISerdonte ine:
D witt Poppe s y pecr fe f manty he-

f kimale,
Peruasse't m, rveld

Malleanoway he wo havo and:
NGavelind d tern bjurd huts n orld bldouthis trast meher betang serr Juces my memeitil;
Buthaletf CI VI s f s.
cirin h


ONIZo merantrik, t
Whole animofe Doullfry. dorm cor 'sangrdowesullof
I sthat lometousts, her.
GLome; w h RI wameangma othes; s'TI chus l eacance Tist, tourot 
