In [5]:
# Create a virtual environment in ./.venv using uv (see the cell output for the interpreter it picked).
# NOTE(review): this does not change the interpreter of the already-running kernel — confirm the
# notebook is launched from .venv if the packages below are meant to live there.
!uv venv

Using CPython [36m3.10.17[39m
Creating virtual environment at: [36m.venv[39m
Activate with: [32msource .venv/bin/activate[39m


In [6]:
!source .venv/bin/activate
!uv pip install numpy

[2mUsing Python 3.12.9 environment at: /Users/ElliotPhua/miniconda3[0m
[2mAudited [1m1 package[0m [2min 5ms[0m[0m


In [18]:
from typing import List
import numpy as np
import re

corpus = ""  # placeholder; the data-loading cell reassigns this to the lower-cased contents of data/text8

def preprocess_corpus(corpus: str) -> tuple[dict[str, int], dict[int, str], np.ndarray]:
    """
    Tokenise a corpus and build a two-way word/index vocabulary.

    Tokens are maximal runs of word characters; punctuation and whitespace are
    dropped. No case folding happens here, so "The" and "the" are distinct
    words unless the caller lower-cases the text first. Indices are assigned in
    order of first appearance, starting at 0.

    Args:
        corpus (str): raw text.
    Returns:
        dict[str, int]: ind_dict[k=word, v=index]
        dict[int, str]: str_dict[k=index, v=word]
        np.ndarray: 1-D int64 array holding the index of every token of the
            corpus, in order (ground truth). It has shape (0,) for a corpus
            without any token.
    """
    tokens = re.findall(r"\b\w+\b", corpus)

    ind_dict: dict[str, int] = {}  # k=word, v=ind
    for word in tokens:
        # setdefault only inserts unseen words; len(ind_dict) is evaluated before
        # the insert, so it is exactly the next free index.
        ind_dict.setdefault(word, len(ind_dict))

    str_dict = {index: word for word, index in ind_dict.items()}  # k=ind, v=word

    # Explicit integer dtype: np.array([]) would otherwise default to float64 for
    # an empty corpus, and float arrays cannot be used as indices downstream.
    data_ind = np.array([ind_dict[word] for word in tokens], dtype=np.int64)

    return ind_dict, str_dict, data_ind

def make_sequences(data_ind: np.ndarray, seq_len:int=5) -> tuple[np.ndarray, np.ndarray]:
    """
    Slice a token-index stream into fixed-length contexts and their next-token targets.

    Window i is data_ind[i : i + seq_len] and its target is data_ind[i + seq_len],
    so consecutive windows overlap by seq_len - 1 tokens.

    Args:
        data_ind (np.ndarray): 1-D array of token indices.
        seq_len (int): number of tokens in each context window.
    Returns:
        np.ndarray: X with shape (n, seq_len), one context window per row.
        np.ndarray: Y with shape (n,), the token that follows each window.
        Here n = max(len(data_ind) - seq_len, 0); both arrays keep the dtype of
        data_ind and are independent copies, also when n is 0.
    Raises:
        ValueError: if seq_len is negative.
    """
    if seq_len < 0:
        raise ValueError(f"seq_len must be non-negative, got {seq_len}")

    data_ind = np.asarray(data_ind)
    n_windows = max(len(data_ind) - seq_len, 0)

    # Fill X one column at a time: column k of X is simply the stream shifted by k.
    # This needs seq_len vectorised copies instead of one Python-level slice per
    # token, and it yields a well-formed (0, seq_len) array for short inputs.
    X = np.empty((n_windows, seq_len), dtype=data_ind.dtype)
    for offset in range(seq_len):
        X[:, offset] = data_ind[offset:offset + n_windows]

    Y = data_ind[seq_len:seq_len + n_windows].copy()
    return X, Y


In [8]:
# Recurrent Neural Network
import numpy as np

def get_word_embedding(x_t:str, ind_dt:dict[str, int], w_e:np.ndarray) -> np.ndarray:
    '''
    Look up the embedding vector of a single word.
    Args:
        x_t (str): The word to embed.
        ind_dt (dict): Vocabulary mapping each word to its row index in w_e.
        w_e (np.ndarray): 2-D embedding matrix of shape |V| x d, one row per vocabulary word.
    Returns:
        np.ndarray: The row of w_e that belongs to x_t, i.e. a 1-D array of d values.
    Raises:
        KeyError: if x_t is not a key of ind_dt.
    '''
    return w_e[ind_dt[x_t], :]

def get_h_t(e_t:np.ndarray, prev_h:np.ndarray, w_hh:np.ndarray, w_xh:np.ndarray, b_h:np.ndarray) -> np.ndarray:
    '''
    h_t <- f(x_t, h_t-1)
    Compute the current hidden state from the current word embedding and the
    previous hidden state: h_t = tanh(w_xh^T e_t + w_hh^T h_{t-1} + b_h).
    Args:
        e_t (np.ndarray): Embedding of the current word with d values. When prev_h is a
            column (2-D), a 1-D array or a 1 x d row is accepted and treated as a d x 1 column.
        prev_h (np.ndarray): Hidden state at time step t-1, shape H x 1.
        w_hh (np.ndarray): Hidden-to-hidden weights, shape H x H (applied transposed).
        w_xh (np.ndarray): Input-to-hidden weights, shape d x H (applied transposed, so
            w_xh.T maps a d-vector to an H-vector).
        b_h (np.ndarray): Bias with shape H x 1.
    Returns:
        np.ndarray: The current hidden state h_t, same layout as prev_h (H x 1).
    '''
    e_vec = np.asarray(e_t)
    # A 1-D embedding (what a plain row lookup returns) would give an (H,) term that
    # broadcasts against the (H, 1) hidden-state term into an (H, H) matrix without
    # any error. Align it with the column layout of prev_h first. Inputs that are
    # already d x 1 (or d x B batches) are left untouched.
    if np.ndim(prev_h) == 2 and (e_vec.ndim == 1 or e_vec.shape == (1, w_xh.shape[0])):
        e_vec = e_vec.reshape(-1, 1)

    pre_activation = w_xh.T @ e_vec + w_hh.T @ prev_h + b_h
    # tanh supplies the non-linearity and keeps every hidden unit bounded in (-1, 1)
    return np.tanh(pre_activation)

def predict_z_t(h_t:np.ndarray, w_hy:np.ndarray, b_y:np.ndarray) -> np.ndarray:
    '''
    Project the hidden state onto the vocabulary to obtain one unnormalised score per word.
    Args:
        h_t (np.ndarray): current hidden state, shape H x 1
        w_hy (np.ndarray): hidden-to-output weights, shape H x |v| (applied transposed)
        b_y (np.ndarray): output bias, shape |v| x 1
    Returns:
        np.ndarray: raw scores for all candidate words, shape |v| x 1
    '''
    projected = np.matmul(w_hy.T, h_t)
    return projected + b_y

def softmax(z_t: np.ndarray) -> np.ndarray:
    """
    softmax(z_i) = exp(z_i - C)/sum[j in T](exp(z_j - C)), with C = max(z) for numerical stability.

    Args:
        z_t (np.ndarray): raw scores. A |v| x 1 column (the layout of the output-score
            vector) or a 1-D vector is normalised over its entries; any other 2-D input
            is normalised along each row (last axis).
    Returns:
        np.ndarray: probabilities with the same shape as z_t; every normalised slice sums to 1.
    """
    z_t = np.asarray(z_t)
    # Summing a |v| x 1 column over axis 1 would divide every entry by itself and
    # return all ones, so a column is normalised over axis 0 instead.
    axis = 0 if (z_t.ndim == 2 and z_t.shape[1] == 1) else -1
    # Shift by the max of each normalised slice (not the global max) so that a slice
    # far below the global max cannot underflow to 0/0.
    exp_z = np.exp(z_t - np.max(z_t, axis=axis, keepdims=True))
    return exp_z / np.sum(exp_z, axis=axis, keepdims=True)
    
def get_most_likely_output_word(p_t:np.ndarray, str_dict: dict[int, str]) -> str:
    """
    Return the vocabulary word with the highest score in p_t.
    Args:
        p_t (np.ndarray): one score/probability per vocabulary index; the position of the
            maximum is taken over the flattened array.
        str_dict (dict[int, str]): k=index, v=word
    Returns:
        str: the word stored under the index of the largest entry of p_t.
    """
    best_index = int(np.argmax(p_t))
    return str_dict[best_index]
    

In [None]:
# preprocessing data


In [None]:
# backpropagation through time (BPTT)


In [9]:
# Install the PyTorch stack from the nightly CPU wheel index, then scikit-learn and pandas.
# NOTE(review): `--pre` plus the nightly index pins no version, so a later re-run may pull
# different builds — consider pinning versions for reproducibility.
!uv pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
!uv pip install scikit-learn pandas

[2mUsing Python 3.12.9 environment at: /Users/ElliotPhua/miniconda3[0m
[2mAudited [1m3 packages[0m [2min 5ms[0m[0m
[2mUsing Python 3.12.9 environment at: /Users/ElliotPhua/miniconda3[0m
[2mAudited [1m2 packages[0m [2min 6ms[0m[0m


In [10]:
# Using pytorch
import torch

# Smoke-test the MPS backend: when it is available, allocate a one-element tensor on it
# and print it; otherwise only report that the device is missing.
if not torch.backends.mps.is_available():
    print("MPS device not found.")
else:
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print(x)

tensor([1.], device='mps:0')


In [16]:
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import pandas as pd

fp = "data/text8"

# Pin the encoding: without it open() decodes with the locale's preferred encoding,
# so the same file could be read differently on another machine.
with open(fp, 'r', encoding="utf-8") as f:
    corpus = f.read().lower()

# Vocabulary lookups plus the corpus as a stream of word indices.
ind_dict, str_dict, data_indices = preprocess_corpus(corpus)

# 5-token context windows (X) and the token that follows each window (Y).
X, Y = make_sequences(data_indices, seq_len=5)
print(X, Y)

[[  0   1   2   3   4]
 [  1   2   3   4   5]
 [  2   3   4   5   6]
 ...
 [275 197 254 196 275]
 [197 254 196 275 197]
 [254 196 275 197 446]] [  5   6   7 ... 197 446 492]


In [17]:

# Hold out 20% of the (context, target) pairs; the fixed seed makes the shuffled split repeatable.
# NOTE(review): the context windows overlap token-by-token, so a shuffled split puts
# near-duplicate contexts in both sets — confirm this is acceptable for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)


In [None]:
# initialise input embedding, w_xh, w_hh, w_hy

[[  285     5 15706   149 58437]
 [  196   275   197   197 31467]
 [   26    15   604   592   118]
 ...
 [   15  9813   361  4519   315]
 [  459  4106   106  2449   218]
 [77935  9473  5816    26 77935]] [  2222    167    457 ...      3 206857  31521]
