In [38]:
import numpy as np
import matplotlib.pyplot as plt

In [39]:
!wget !wget https://www.cs.toronto.edu/~lczhang/413/raw_sentences.txt

--2024-02-10 04:14:10--  http://!wget/
Resolving !wget (!wget)... failed: Name or service not known.
wget: unable to resolve host address ‘!wget’
--2024-02-10 04:14:10--  https://www.cs.toronto.edu/~lczhang/413/raw_sentences.txt
Resolving www.cs.toronto.edu (www.cs.toronto.edu)... 128.100.3.30
Connecting to www.cs.toronto.edu (www.cs.toronto.edu)|128.100.3.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2955731 (2.8M) [text/plain]
Saving to: ‘raw_sentences.txt.1’


2024-02-10 04:14:13 (1.91 MB/s) - ‘raw_sentences.txt.1’ saved [2955731/2955731]

FINISHED --2024-02-10 04:14:13--
Total wall clock time: 2.9s
Downloaded: 1 files, 2.8M in 1.5s (1.91 MB/s)


In [40]:
sentences = []
for line in open('./raw_sentences.txt', 'r'):

      words = line.split()
      sentence = [word.lower() for word in words]
      sentences.append(sentence)

In [41]:
print("Number of sentences:", len(sentences))
vocab = set([word for sentence in sentences for word in sentence])
print("Size of vocabulary:", len(vocab))

Number of sentences: 97162
Size of vocabulary: 250


In [42]:
sentence_lengths = [len(sentence) for sentence in sentences]
print("Min length", np.min(sentence_lengths))
print("Average length", np.mean(sentence_lengths))
print("Max length", np.max(sentence_lengths))

Min length 4
Average length 7.792881990901793
Max length 43


In [43]:
from random import shuffle

shuffle(sentences)

train_data = sentences[:int(0.8 * len(sentences))]
val_data = sentences[int(0.8 * len(sentences)):int(0.9 * len(sentences))]
test_data = sentences[int(0.9 * len(sentences)):]

print(len(train_data), len(val_data), len(test_data))

77729 9716 9717


In [44]:
from collections import Counter

word_count = Counter([w for sentence in sentences for w in sentence])
print(word_count.most_common(1))

[('.', 80974)]


In [45]:
# create vocab dictionary
vocab = set([w for sentence in sentences for w in sentence])
word_index = {w:i for i, w in enumerate(vocab)}

In [46]:
# convert words to indices

def words_to_indices(sentences):

  return [[word_index[word] for word in sentence] for sentence in sentences]

def generate_4grams(indices):

  four_grams = []
  for sentence in indices:
    for i in range(len(sentence) - 4):
      four_grams.append(sentence[i:i+4])

  return four_grams

def compile_data(sentences):
  indices = words_to_indices(sentences)
  four_grams = generate_4grams(indices)
  return np.array(four_grams)

grams = compile_data(train_data)
print(grams.shape)

(294501, 4)


In [47]:
# one hot encoding the input

def onehot_encoder(four_grams, vocab_size):
  I = np.eye(vocab_size)
  return I[four_grams]

print(onehot_encoder(grams, len(vocab)).shape)

(294501, 4, 250)


In [48]:
def get_batch(data, range_min, range_max, onehot=True):

  X = data[range_min:range_max, :3]
  t = data[range_min:range_max, 3]

  X = onehot_encoder(X, len(vocab))

  X = X.reshape(-1, 3 * len(vocab))

  if onehot:
    t = onehot_encoder(t, len(vocab))
    t = t.reshape(-1, len(vocab))

  return X, t

# test the data out

grams = compile_data(train_data)
X, t = get_batch(grams, 0, 10, onehot=False)
print(X.shape, t.shape)

# check if indices are at the right position
for i in range(3):
  index_pos = grams[0][i]
  print(X[0][index_pos + ((len(vocab) * i) - 1)]) # should be 0
  print(X[0][index_pos + (len(vocab) * i)]) # should 1
  print(X[0][index_pos + ((len(vocab) * i) + 1)]) # should be 0

(10, 750) (10,)
0.0
1.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0


In [49]:
def accuracy(model, data, batch=100, max_N=10000):
  num_correct = 0
  num_pred = 0

  for i in range(0, data.shape[0], batch):
    X, t = get_batch(data, i, i+1, onehot=False)
    z = model.forward(X)
    pred = np.argmax(z, axis=1)

    num_correct += np.sum(pred == t)
    num_pred += t.shape[0]

    if num_pred >= max_N:
      break

  return num_correct / num_pred

In [50]:
def softmax(x):

  x = x.T
  exps = np.exp(x - x.max(axis=0))
  probs = exps / np.sum(exps, axis=0)
  return probs.T

In [51]:
class NNModel:

  def __init__(self, vocab_size, emb_size, num_hidden):

    self.vocab_size = vocab_size
    self.emb_size = emb_size
    self.num_hidden = num_hidden

    self.Ww = np.zeros([vocab_size, emb_size])
    # self.bw = np.zeros([emb_size])

    self.W1 = np.zeros([emb_size * 3, num_hidden])
    self.b1 = np.zeros([num_hidden])

    self.W2 = np.zeros([num_hidden, vocab_size])
    self.b2 = np.zeros([vocab_size])

    self.initialize_weights()

    self.cleanup()

  def initialize_weights(self):

    self.Ww = np.random.normal(0, 2/(self.vocab_size), self.Ww.shape)

    self.W1 = np.random.normal(0, 2/(3*self.emb_size), self.W1.shape)
    self.b1 = np.random.normal(0, 2/(3*self.emb_size), self.b1.shape)

    self.W2 = np.random.normal(0, 2/(self.num_hidden), self.W2.shape)
    self.b2 = np.random.normal(0, 2/(self.num_hidden), self.b2.shape)

  def cleanup(self):

    self.N = None
    self.xa = None
    self.xb = None
    self.xc = None
    self.va = None
    self.vb = None
    self.vc = None
    self.v = None
    self.m = None
    self.h = None
    self.z = None
    self.y = None

    self.z_bar = None
    self.W2_bar = None
    self.b2_bar = None
    self.h_bar = None
    self.m_bar = None
    self.W1_bar = None
    self.b1_bar = None
    self.v_bar = None
    self.va_bar = None
    self.vb_bar = None
    self.vc_bar = None
    self.Ww_bar = None


  def forward(self, X):

    return do_forward_pass(self, X)

  def backward(self, t):

    return do_backward_pass(self, t)

  def loss(self, t):

    # Cross Entropy Loss
    pred = np.sum(-t * np.log(self.y)) / t.shape[0]
    return pred

  def update(self, alpha):

    self.Ww = self.Ww - alpha * self.Ww_bar
    self.W1 = self.W1 - alpha * self.W1_bar
    self.b1 = self.b1 - alpha * self.b1_bar
    self.W2 = self.W2 - alpha * self.W2_bar
    self.b2 = self.b2 - alpha * self.b2_bar

In [52]:
def do_forward_pass(model, X):

  model.N = X.shape[0]
  model.X = X

  model.xa = X[:, 0:model.vocab_size]
  model.xb = X[:, model.vocab_size:2*model.vocab_size]
  model.xc = X[:, 2*model.vocab_size:]

  # compute embeddings
  model.va = np.matmul(model.xa, model.Ww)
  model.vb = np.matmul(model.xb, model.Ww)
  model.vc = np.matmul(model.xc, model.Ww)
  model.v = np.concatenate([model.va, model.vb, model.vc], axis=1)

  # compute pre-hidden layer input
  model.m = np.matmul(model.v, model.W1) + model.b1

  # compute post hidden layer - ReLU activation
  model.h = np.maximum(0, model.m)

  # compute logits
  model.z = np.matmul(model.h, model.W2) + model.b2

  # compute class probabilities
  model.y = softmax(model.z)

  return model.z

In [53]:
x, t = get_batch(grams, 0, 10)
model = NNModel(len(vocab), 150, 100)
y = model.forward(x)

print(model.va.shape, model.vb.shape, model.vc.shape)
print(model.v.shape)
print(model.z.shape)
print(model.z.shape)

(10, 150) (10, 150) (10, 150)
(10, 450)
(10, 250)
(10, 250)


In [54]:
print(accuracy(model, grams))

0.0010183299389002036
