# Assignment 2
## Section 2

In [18]:
import torch
import torch.nn.functional as F

# Load the corpus, lower-case it, and split it into whitespace-separated tokens.
# NOTE(review): assumes 'anne.txt' is UTF-8 encoded — confirm. Without an explicit
# encoding, open() falls back to a platform-dependent default.
with open('anne.txt', 'r', encoding='utf-8') as file:
  content = file.read().lower()
words = content.split()

# Keep only tokens made purely of ASCII letters. str.isalpha() on its own also
# accepts non-ASCII letters (which is why 'æ' had to be excluded by hand). With
# the ASCII check, the lower-cased vocabulary is at most 26 letters plus '.',
# which matches the hard-coded 27 used for the matrix below.
words = [word for word in words if word.isascii() and word.isalpha()]

# Character <-> index maps: '.' (word boundary) is index 0, letters start at 1.
chars = sorted(set(''.join(words)))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0

itos = {i:s for s, i in stoi.items()}

# 27x27 int32 matrix of zeros (not referenced by the other cells shown here).
N = torch.zeros((27,27), dtype=torch.int32)

In [19]:
# Step-1: Create the dataset
# Each word is wrapped in '.' boundary tokens and split into consecutive
# character pairs: the index of the first character is the input and the index
# of the second character is the label.
inputs, labels = [], []
for w in words:
  # Named `chs` rather than `str` so the builtin `str` is not overwritten for
  # every cell that runs afterwards in the same kernel.
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2 in zip(chs, chs[1:]):
    inputs.append(stoi[ch1])
    labels.append(stoi[ch2])

inputs = torch.tensor(inputs)
labels = torch.tensor(labels)
num = inputs.nelement()
print('Number of examples: ', num)

Number of examples:  435187


In [24]:
# Negative log likelihood
# Trains a single 27x27 weight matrix with full-batch gradient descent, computing
# the softmax and the negative log likelihood by hand.

# Initialize the network
g = torch.Generator().manual_seed(1234567890)
W1 = torch.randn((27,27), generator=g, requires_grad=True)

# The one-hot encoding and the row indices do not depend on W1, so they are
# built once here instead of on every iteration.
inputs_encoded = F.one_hot(inputs, num_classes=27).float()
rows = torch.arange(inputs.nelement())

# Step-2: Gradient descent
for k in range(10):

  # Forward pass
  logits = inputs_encoded @ W1
  counts = logits.exp() # Counts
  probs = counts / counts.sum(1, keepdims=True) # Normalise each row to probabilities
  loss = -probs[rows, labels].log().mean() # Mean NLL of the correct next character
  print(loss.item())

  # Backward pass
  W1.grad = None # Set the gradients to zero
  loss.backward()

  # Update: in place and outside autograd tracking, instead of reaching
  # through `.data`.
  with torch.no_grad():
    W1 += -50 * W1.grad

3.6767637729644775
3.2435972690582275
3.0112698078155518
2.8654487133026123
2.766885280609131
2.695883274078369
2.6416678428649902
2.5989911556243896
2.564650058746338
2.536466121673584


In [23]:
# Cross Entropy Method 1
# Same model and update rule as the negative-log-likelihood cell, but the loss
# is computed directly from the logits with F.cross_entropy.

# Initialize the network
g = torch.Generator().manual_seed(1234567890)
W1 = torch.randn((27,27), generator=g, requires_grad=True)

# The one-hot encoding does not depend on W1, so it is built once here instead
# of on every iteration.
inputs_encoded = F.one_hot(inputs, num_classes=27).float()

# Step-2: Gradient descent
for k in range(10):
  # Forward pass
  logits = inputs_encoded @ W1
  loss = F.cross_entropy(logits, labels)
  print(loss.item())

  # Backward pass
  W1.grad = None # Set the gradients to zero
  loss.backward()

  # Update: in place and outside autograd tracking, instead of reaching
  # through `.data`.
  with torch.no_grad():
    W1 += -50 * W1.grad

3.6767637729644775
3.2435972690582275
3.0112698078155518
2.865447998046875
2.766885280609131
2.6958835124969482
2.6416678428649902
2.5989913940429688
2.564650058746338
2.536465883255005


In [25]:
# Cross Entropy Method 2
# Computes the cross entropy explicitly: softmax over the logits, then the mean
# negative log probability assigned to the correct next character.

# Initialize the network
g = torch.Generator().manual_seed(1234567890)
W1 = torch.randn((27,27), generator=g, requires_grad=True)

# The one-hot encoding and the row indices do not depend on W1, so they are
# built once. The row count is taken from `labels` directly rather than from
# the `num` variable left behind by another cell.
inputs_encoded = F.one_hot(inputs, num_classes=27).float()
rows = torch.arange(labels.nelement())

# Step-2: Gradient descent
for k in range(10):
  # Forward pass
  logits = inputs_encoded @ W1
  yhat  = torch.softmax(logits, dim=-1)
  l2    = -yhat.log()[rows, labels] # Per-example negative log likelihood
  # The loss is the mean of the manual per-example values. Previously `l2` was
  # computed and then ignored in favour of F.cross_entropy, which made this
  # cell a copy of Method 1. The printed values may differ from Method 1 in
  # the last few digits.
  loss = l2.mean()
  print(loss.item())

  # Backward pass
  W1.grad = None # Set the gradients to zero
  loss.backward()

  # Update: in place and outside autograd tracking, instead of reaching
  # through `.data`.
  with torch.no_grad():
    W1 += -50 * W1.grad

3.6767637729644775
3.2435972690582275
3.0112698078155518
2.865447998046875
2.766885280609131
2.6958835124969482
2.6416678428649902
2.5989913940429688
2.564650058746338
2.536465883255005


## Section 3

Running the code below, the final output shows that the generated words look closer to realistic words, or to extensions of realistic words.

In [2]:
import torch
import torch.nn.functional as F

# Open the file in read mode
# NOTE(review): assumes 'anne.txt' is UTF-8 encoded — confirm. Without an explicit
# encoding, open() falls back to a platform-dependent default.
with open('anne.txt', 'r', encoding='utf-8') as file:
    content = file.read().lower()
words = content.split()

# Keep only tokens made purely of ASCII letters. str.isalpha() on its own also
# accepts non-ASCII letters (which is why 'æ' had to be excluded by hand). With
# the ASCII check, the lower-cased vocabulary is at most 26 letters plus '.',
# i.e. the 27 classes used when one-hot encoding.
words = [word for word in words if word.isascii() and word.isalpha()]

# Create a sorted list of unique characters
chars = sorted(set(''.join(words)))
stoi = {s: i+1 for i, s in enumerate(chars)}  # Use 1-based indexing for characters
stoi['.'] = 0  # Use '.' as padding or end-of-sequence token
itos = {i: s for s, i in stoi.items()}

# Step-1: Create the trigram dataset
# Each example is a pair of consecutive character indices (the context) and the
# index of the character that follows them (the label).
inputs, labels = [], []
for w in words:
    # Add two start tokens and one end token to the word
    chs = ['.', '.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        inputs.append([stoi[ch1], stoi[ch2]])
        labels.append(stoi[ch3])
inputs = torch.tensor(inputs)
labels = torch.tensor(labels)


num = len(inputs)
print('Number of examples:', num)

Number of examples: 435187


In [13]:
# Initialize the network
# Two 27x27 weight matrices, one per context position; their contributions are
# summed to form the logits for the next character.
g = torch.Generator().manual_seed(1234567890)
W1 = torch.randn((27,27), generator=g,requires_grad=True)
W2 = torch.randn((27,27), generator=g,requires_grad=True)

# The one-hot encoding and the row indices do not depend on the weights, so
# they are built once here instead of on every epoch.
inputs_encoded = F.one_hot(inputs, num_classes=27).float()
rows = torch.arange(num)

# Step-2: Gradient descent
for epoch in range(15):
    # Forward pass
    logits = torch.add(inputs_encoded[:,0] @ W1, inputs_encoded[:,1]@W2)
    counts = logits.exp() # Counts, equivalent to N
    prob = counts / counts.sum(1, keepdims=True) # Probabilities for next character
    loss = -prob[rows, labels].log().mean()
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

    # Backward pass
    W1.grad = None # Set the gradients to zero
    W2.grad = None
    loss.backward()

    # Update: in place and outside autograd tracking, instead of reaching
    # through `.data`.
    with torch.no_grad():
        W1 += -45 * W1.grad
        W2 += -45 * W2.grad

Epoch 1, Loss: 4.1365885734558105
Epoch 2, Loss: 3.3062171936035156
Epoch 3, Loss: 2.949286937713623
Epoch 4, Loss: 2.769080400466919
Epoch 5, Loss: 2.6688058376312256
Epoch 6, Loss: 2.633512258529663
Epoch 7, Loss: 2.548745632171631
Epoch 8, Loss: 2.5178945064544678
Epoch 9, Loss: 2.4838078022003174
Epoch 10, Loss: 2.4748165607452393
Epoch 11, Loss: 2.4124271869659424
Epoch 12, Loss: 2.394338369369507
Epoch 13, Loss: 2.3933489322662354
Epoch 14, Loss: 2.401322841644287
Epoch 15, Loss: 2.3382577896118164


In [14]:
# Step-3: Generate tokens using the trained trigram model
# Ten words are sampled one character at a time; each draw is conditioned on the
# two previous characters and a word ends when the '.' token (index 0) is drawn.
g = torch.Generator().manual_seed(1234567890)

torch.manual_seed(1234567890)
for _ in range(10):
    index1 = index2 = 0  # Context starts as '..'
    out = []
    while True:
        # One-hot encode the two context characters (row 0: older, row 1: newer)
        inputs_encoded = F.one_hot(torch.tensor([index1, index2]), num_classes=27).float()
        logits = inputs_encoded[0] @ W1 + inputs_encoded[1] @ W2
        counts = logits.exp() # Counts, equivalent to N
        p = counts / counts.sum(0, keepdims=True) # Probabilities for next character
        index = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[index])
        if index == 0:
            break
        # Slide the context window forward by one character
        index1, index2 = index2, index

    print(''.join(out))

iome.
to.
te.
ill.
dnndeat.
zpghemrwhan.
oo.
ati.
dollen.
moor.
