In [30]:
import math
import torch
import torch.nn.functional as F

In [64]:
words = open('Dataset/Test Español/Test.txt', 'r').read().split()
len(words)

1896

In [65]:
uwords = set(words)
print(f'Vocabulary size: {len(uwords)}')

Vocabulary size: 924


In [66]:
if len(uwords) > 1000:
    print("Very large vocabulary detected. Consider limiting vocabulary size.")
    from collections import Counter
    word_counts = Counter(words)
    top_words = [word for word, _ in word_counts.most_common(1000)]
    uwords = set(top_words)
    print(f'Limited vocabulary to top 1000 words')

In [67]:
wtoi = {s:i for i,s in enumerate(uwords)}
itow = {i:s for s,i in wtoi.items()}

In [78]:
# create the dataset
xs, ys = [], []
for w1, w2 in zip(words, words[1:]):
    if w1 in uwords and w2 in uwords:
        ix1 = wtoi[w1]
        ix2 = wtoi[w2]
        xs.append(ix1)
        ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)
len(ys)

number of examples:  1895


1895

In [81]:
# initialize the 'network'
g = torch.Generator().manual_seed(10022004)
W = torch.randn((len(uwords), len(uwords)), generator=g, requires_grad=True, dtype=torch.float16)
W.dtype

RuntimeError: "normal_kernel_cpu" not implemented for 'Bool'

In [77]:
# gradient descent
for k in range(100):

    # forward pass
    xenc = F.one_hot(xs, num_classes=len(uwords)).float() # input to the network: one-hot encoding
    logits = xenc @ W # predict log-counts
    #counts = logits.exp() # counts, equivalent to N
    #probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
    #loss = -probs[torch.arange(num), ys].log().mean()
    loss = torch.nn.functional.cross_entropy(logits, ys)
    if (k % 100 == 0):
        print("step: ", k, "loss: ", loss.item())

    # backward pass
    W.grad = None # set to zero the gradient
    loss.backward()

    # update
    with torch.no_grad():
        W -= 50 * W.grad

step:  0 loss:  7.32421875


KeyboardInterrupt: 

In [71]:
W

tensor([[-0.7056,  0.8052, -0.0105,  ..., -1.1589,  2.0847, -0.7403],
        [-0.0558,  0.1391, -0.5751,  ..., -0.2039,  0.7923,  0.6101],
        [-0.4650, -0.6915,  1.6357,  ..., -0.8117,  0.7727, -0.5670],
        ...,
        [-0.5403,  0.5932,  0.1426,  ..., -0.1567,  1.2153, -1.5757],
        [ 1.5530, -0.1820, -0.9679,  ...,  1.8583,  0.1253, -1.3439],
        [ 0.4030,  0.4487, -1.2789,  ...,  0.0491,  2.2162,  1.2107]],
       requires_grad=True)

In [97]:
# finally, sample from the 'neural net' model
g = torch.Generator().manual_seed(2147483647)

for i in range(5):

    out = ""
    ix = 0
    for j in range(20):

        xenc = F.one_hot(torch.tensor([ix]), num_classes=len(uwords)).float()
        logits = xenc @ W # predict log-counts
        counts = logits.exp() # counts, equivalent to N
        p = counts / counts.sum(1, keepdims=True) # probabilities for next word

        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out += " " + itow[ix]
    print(''.join(out))

 movía más. *Dripp* Algo cayó desde que ganar dinero fueron completamente negra, hubiera roto. Hay muchas personas moverse objeto adornada
 no está funcionando bien porque se hubiera nacido en varios aspectos. Si el juego. Entre los Rol mientras cumplieras es
 brilló en el oponente fuese collar jugadores. Las razas llegaría a sus DMMO-RPGs que DMMO-RPG no podía pedirle perdón por
 no estaba aferrándose a su alma mundos existencia que podía alegrarse porque se congeló en el hogar del jugador. Me
 no se sentaron. Uno llevaba una existencia que no que destacaba Era el último día que se ovación que lo


In [82]:
# COUNTING

In [181]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import glob
import os
from collections import defaultdict

folder_path = 'Dataset/English'  # Change this to your folder path

# Get all text files in the folder
file_paths = glob.glob(os.path.join(folder_path, '*.txt'))
print(f'Found {len(file_paths)} text files in {folder_path}')

# Read and combine all words from all files
all_words = []
for file_path in file_paths:
    try:
        print(f'Processing file: {file_path}')
        with open(file_path, 'r', encoding='utf-8') as file:
            words = file.read().split()
            all_words.extend(words)
    except Exception as e:
        print(f'Error processing {file_path}: {str(e)}')

print(f'Total words read: {len(all_words)}')

words = all_words

Found 4 text files in Dataset/English
Processing file: Dataset/English\Birds.txt
Processing file: Dataset/English\Encyclopedia.txt
Processing file: Dataset/English\House rats and mice.txt
Processing file: Dataset/English\Leviathan.txt
Total words read: 1589302


In [182]:
uwords = set(all_words)
size = len(uwords)

vocab_limit = 2000

if len(uwords) > vocab_limit:
    print(f"Very large vocabulary detected: {size} words. Consider limiting vocabulary size.")
    from collections import Counter
    word_counts = Counter(words)
    top_words = [word for word, _ in word_counts.most_common(vocab_limit)]
    uwords = set(top_words)
    print(f'Limited vocabulary to top {vocab_limit} words')

wtoi = {s:i for i,s in enumerate(uwords)}
itow = {i:s for s,i in wtoi.items()}

size = len(uwords)
size

Very large vocabulary detected: 154827 words. Consider limiting vocabulary size.
Limited vocabulary to top 2000 words


2000

In [216]:
N = torch.zeros(size, size, dtype = torch.uint8)

In [217]:
# getting the Bigrams
bigrams = 0
b = {}
for w1, w2 in zip(words, words[1:]):
    if w1 in uwords and w2 in uwords:
        ix1 = wtoi[w1]
        ix2 = wtoi[w2]

        if (N[ix1, ix2].item() < 255):
            N[ix1, ix2] += 1
            bigrams += 1
            if (bigrams % 10000 == 0):
                print(f"There are {bigrams} bigrams")
            bigram = (w1, w2)
            b[bigram] = b.get(bigram, 0) + 1

There are 10000 bigrams
There are 20000 bigrams
There are 30000 bigrams
There are 40000 bigrams
There are 50000 bigrams
There are 60000 bigrams
There are 70000 bigrams
There are 80000 bigrams
There are 90000 bigrams
There are 100000 bigrams
There are 110000 bigrams
There are 120000 bigrams
There are 130000 bigrams
There are 140000 bigrams
There are 150000 bigrams
There are 160000 bigrams
There are 170000 bigrams
There are 180000 bigrams
There are 190000 bigrams
There are 200000 bigrams
There are 210000 bigrams
There are 220000 bigrams
There are 230000 bigrams
There are 240000 bigrams
There are 250000 bigrams
There are 260000 bigrams
There are 270000 bigrams
There are 280000 bigrams
There are 290000 bigrams
There are 300000 bigrams
There are 310000 bigrams
There are 320000 bigrams
There are 330000 bigrams
There are 340000 bigrams
There are 350000 bigrams
There are 360000 bigrams
There are 370000 bigrams
There are 380000 bigrams
There are 390000 bigrams
There are 400000 bigrams
There are

In [218]:
sorted(b.items(), key = lambda kv: -kv[1])

[(('in', 'the'), 255),
 (('have', 'been'), 255),
 (('for', 'a'), 255),
 (('of', 'the'), 255),
 (('the', 'first'), 255),
 (('as', 'a'), 255),
 (('It', 'is'), 255),
 (('one', 'of'), 255),
 (('the', 'most'), 255),
 (('in', 'this'), 255),
 (('and', 'it'), 255),
 (('on', 'the'), 255),
 (('to', 'the'), 255),
 (('by', 'the'), 255),
 (('the', 'United'), 255),
 (('of', 'a'), 255),
 (('among', 'the'), 255),
 (('a', 'few'), 255),
 (('with', 'the'), 255),
 (('is', 'the'), 255),
 (('and', 'other'), 255),
 (('that', 'is'), 255),
 (('but', 'in'), 255),
 (('with', 'a'), 255),
 (('are', 'the'), 255),
 (('form', 'of'), 255),
 (('of', 'these'), 255),
 (('can', 'be'), 255),
 (('means', 'of'), 255),
 (('in', 'a'), 255),
 (('from', 'the'), 255),
 (('number', 'of'), 255),
 (('that', 'the'), 255),
 (('to', 'be'), 255),
 (('known', 'as'), 255),
 (('it', 'is'), 255),
 (('and', 'in'), 255),
 (('is', 'a'), 255),
 (('of', 'an'), 255),
 (('for', 'the'), 255),
 (('in', 'their'), 255),
 (('is', 'not'), 255),
 (('in',

In [212]:
wtoi['the']

650

In [219]:
N[1843, 650]

tensor(255, dtype=torch.uint8)

In [221]:
P = N / N.sum(dim = 1, keepdim = True)

In [222]:
def count_loss(input_list, verbose = False):
    log_likelihood = 0.0
    n = 0
    for w1, w2 in zip(words, words[1:]):
        if w1 in uwords and w2 in uwords:
            ix1 = wtoi[w1]
            ix2 = wtoi[w2]

            prob = P[ix1, ix2]
            logprob = torch.log(prob)
            log_likelihood += logprob
            n += 1

    # higher the log likelihood (closer to 0) is better
    print(f"log Likelihood: {log_likelihood}")

    # but in loss function lower is better, so we negate it
    nll = -log_likelihood
    print(f"Negative log likelihood: {nll}")

    # normalize it
    print(f"Normalized Negative log Likelihood: {(nll / n)}") # we need to minimize this
    
    
print("Training Loss")
count_loss(words)

g = torch.Generator().manual_seed(2147483647)

# Sampling
sample = ""
for i in range(10):

    ix = 0
    for j in range(20):
        p = P[ix]

        ix = torch.multinomial(p, 1, replacement=True).item()
        sample += " " + itow[ix]


    
print(sample)
print("Sampled words Loss")
count_loss(sample)

Training Loss
log Likelihood: -2868851.75
Negative log likelihood: 2868851.75
Normalized Negative log Likelihood: 4.003024578094482
 have shown that has the name has changed the outer boundary was obliged to carry the coast, where every little railway from nearly every fresh attack on this, with reference to pay what his children were mainly by which had resistance it to take any part of our work in other intellectual than his personal history of September of metal war, afterwards visited the books (see further extended from those places of life by sea was raised from a school and afterwards he returned to represent nothing to which he took it by them seems to Germany, in proportion of of that it was reduced charge a place at his town and all of local authorities are remarkable for her tendency to matters known as being the cell in time at present time has an important is the knowledge of that is, the earl of these can he is generally termed an area of beauty is never have derived from