# NW AAC
Next-word prediction for AAC board using Gensim and Word2Vec.

Uses `contractions` to remove contractions from data.

Uses Python 3.11.x and the libraries listed in the following imports.

In [1]:
# imports

# dealing with initial data
import numpy as np
import pandas as pd

# data visualization
import matplotlib.pyplot as plt

# word2vec implementation
import gensim
from gensim.models import Word2Vec

# data preprocessing
import re
import contractions

# N-gram implementation dependency
from collections import Counter

# LSTM dependencies
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
# read data, convert to lists
df = pd.read_csv("Conversation.csv")

qlist = df["question"].tolist()
alist = df["answer"].tolist()
print(qlist[:5])
print(len(qlist), len(alist))

df.head()

['hi, how are you doing?', "i'm fine. how about yourself?", "i'm pretty good. thanks for asking.", 'no problem. so how have you been?', "i've been great. what about you?"]
3725 3725


Unnamed: 0,num,question,answer
0,0,"hi, how are you doing?",i'm fine. how about yourself?
1,1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,3,no problem. so how have you been?,i've been great. what about you?
4,4,i've been great. what about you?,i've been good. i'm in school right now.


In [3]:
# remove contractions
qlist = [contractions.fix(l) for l in qlist]
alist = [contractions.fix(l) for l in alist]

print(len(qlist), len(alist))

print(qlist[765:772])
print(alist[765:772])

# does not remove possessives, though
print(qlist[785:792])
print(alist[785:792])

3725 3725
['poodles bark a lot.', 'they sure do.', 'they bark at everything.', 'they never shut up.', 'why did you get a poodle?', "it is my mom's dog.", 'so she likes poodles.']
['they sure do.', 'they bark at everything.', 'they never shut up.', 'why did you get a poodle?', "it is my mom's dog.", 'so she likes poodles.', 'she says they are good watchdogs.']
['we have not been in a while.', 'we have not been in a month.', 'the last time we went, you almost drowned.', 'no, i did not.', 'then why did the lifeguard dive into the water?', 'i think he wanted to cool off.', 'he swam right up to you.']
['we have not been in a month.', 'the last time we went, you almost drowned.', 'no, i did not.', 'then why did the lifeguard dive into the water?', 'i think he wanted to cool off.', 'he swam right up to you.', 'and then he turned right around.']


In [4]:
# remove punctuation, numbers, signs. except for remaining apostrophes
qlist = [re.sub(r"[^a-zA-Z\s']", "", l) for l in qlist]
alist = [re.sub(r"[^a-zA-Z\s']", "", l) for l in alist]

print(len(qlist), len(alist))

print(qlist[765:772])
print(alist[765:772])

print(qlist[785:792])
print(alist[785:792])

3725 3725
['poodles bark a lot', 'they sure do', 'they bark at everything', 'they never shut up', 'why did you get a poodle', "it is my mom's dog", 'so she likes poodles']
['they sure do', 'they bark at everything', 'they never shut up', 'why did you get a poodle', "it is my mom's dog", 'so she likes poodles', 'she says they are good watchdogs']
['we have not been in a while', 'we have not been in a month', 'the last time we went you almost drowned', 'no i did not', 'then why did the lifeguard dive into the water', 'i think he wanted to cool off', 'he swam right up to you']
['we have not been in a month', 'the last time we went you almost drowned', 'no i did not', 'then why did the lifeguard dive into the water', 'i think he wanted to cool off', 'he swam right up to you', 'and then he turned right around']


In [5]:
# combine to compatible sentences nested lists (each word becomes indiv element of parent list)
sentences = list(map(lambda x : x.split(), qlist))
sentences.extend(map(lambda x : x.split(), alist))

print(len(sentences))
sentences[:5]

7450


[['hi', 'how', 'are', 'you', 'doing'],
 ['i', 'am', 'fine', 'how', 'about', 'yourself'],
 ['i', 'am', 'pretty', 'good', 'thanks', 'for', 'asking'],
 ['no', 'problem', 'so', 'how', 'have', 'you', 'been'],
 ['i', 'have', 'been', 'great', 'what', 'about', 'you']]

In [6]:
# put into W2V model & save
model = Word2Vec(sentences, vector_size=300, window=5, workers=4, epochs=10, min_count=5)
model.save("conv.model")

pretrained_weights = model.wv.vectors
vocab_size, embedding_size = pretrained_weights.shape
print(vocab_size, embedding_size)

1003 300


In [7]:
# average context's embed vectors and find most similar vector for next word
class WordPredictor:
    def __init__(self, sentences: list[list[str]], method: str = "word2vec", max_ngram: int = 5):
        """
        Initializes the predictor with training sentences

        Args:
            sentences: List of lists, inevitably loaded to W2V model
            method: Prediction model specification ("word2vec", "ngram", or "lstm")
            max_ngram: Maximum n-gram lookback size
        
        Methods:
            build_vocabulary: creates a word-to-index mapping from training sentences
            build_ngram_models: constructs multiple N-gram models with different context windows
            predict_next_word: predicts next word given current context
            prepare_lstm_data: prepares training data for LSTM model
            build_lstm_model: initializes and trains LSTM model
            predict_lstm: predicts next word using trained LSTM model
        """
        self.method = method
        self.sentences = sentences
        self.vocab = self.build_vocabulary()

        if method == "word2vec":
            self.model = Word2Vec(
                sentences, vector_size=100, window=5, min_count=1, workers=4
            )

        elif method == "lstm": # TODO
            self.prepare_lstm_data()
            self.build_lstm_model()

        else:
            self.max_ngram = max_ngram
            self.build_ngram_models()
    
    def build_vocabulary(self):
        """ Build vocabulary from sentences """

        words = []
        for sentence in sentences:
            words.extend(sentence)

        unique_words = set(words)

        vocab = {word: idx for idx, word in enumerate(unique_words)}
        
        return vocab

    def build_ngram_models(self):
        """
        Build multiple N-gram models from the training sentences
        Implements backoff in case phrase not found in training data
        """
        self.ngram_models = {}

        for n in range(self.max_ngram, 1, -1):
            self.ngram_models[n] = {} # access each model using its context size

            for sentence in self.sentences:
                for i in range(len(sentence) - n + 1): # iterate through all context windows of length `n`
                    prefix = tuple(sentence[i:i+n-1])
                    next_word = sentence[i+n-1]
                    if prefix not in self.ngram_models[n]:
                        self.ngram_models[n][prefix] = Counter()
                    self.ngram_models[n][prefix][next_word] += 1


    def predict_next_word(self, context, top_n=5):
        """
        Predict the next word given a context

        Args:
            context: List of words
            top_n: # of predictions to return
        
        Returns:
            List of (word, score) tuples
        """

        if self.method == "word2vec":
            context_vectors = [self.model.wv[word] for word in context]
            avg_vector = np.mean(context_vectors, axis=0)

            similar_words = self.model.wv.similar_by_vector(avg_vector, topn=top_n)

            return similar_words

        elif self.method == "lstm":
            return self.predict_lstm(context, top_n) # TODO
        
        else: # method = N-gram w/ backoff
            for n in range(self.max_ngram, 1, -1): # find the largest N-gram that works
                if len(context) >= n - 1:
                    prefix = tuple(context[-(n-1):])
                    if prefix in self.ngram_models[n]:
                        predictions = self.ngram_models[n][prefix].most_common(top_n)
                        total = sum(self.ngram_models[n][prefix].values())
                        return [(word, count / total) for word, count in predictions]
            
            # worst case just return most common words (unigram)
            if 2 in self.ngram_models:
                all_words = Counter()
                for ngram_dict in self.ngram_models[2].values():
                    all_words.update(ngram_dict)
                total = sum(all_words.values())
                return [(word, count / total) for word, count in all_words.most_common(top_n)]
            
            return [] # NULLGRAM (empty training dataset)
    
    def prepare_lstm_data(self): # TODO
        """
        Prepare data for LSTM training
        """

        self.word_to_idx = {word: idx for idx, word in enumerate(self.vocab)}
        self.idx_to_word = {idx: word for word, idx in self.word_to_idx.items()}
        self.vocab_size = len(self.vocab)

        self.sequence_length = 5 # context window size
        self.X = []
        self.y = []

        for sentence in sentences:
            indices = [self.word_to_idx[word] for word in sentence]
            for i in range(len(indices) - self.sequence_length):
                self.X.append(indices[i:i+self.sequence_length])
                self.y.append(indices[i+self.sequence_length])


In [8]:
wp = WordPredictor(sentences, method="ngram")
wp.prepare_lstm_data()
wp.vocab_size
max(wp.word_to_idx.values())

2417

In [9]:
response = wp.predict_next_word(["i", "feel", "like", "i", "want", "to"])
print(response)

wl = ["he", "works"]

for i in range(5):
    print(" ".join(wl))
    n = wp.predict_next_word(wl)[0][0]
    wl.append(n)

[('go', 0.22727272727272727), ('get', 0.13636363636363635), ('play', 0.09090909090909091), ('be', 0.06818181818181818), ('do', 0.045454545454545456)]
he works
he works only
he works only in
he works only in canada
he works only in canada and


In [10]:
class LSTM(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int = 100, hidden_dim: int = 128, num_layers: int = 2, dropout: float = 0.2):
        """
        Initialize LSTM model for word prediction

        Args:
            vocab_size: Size of vocabulary
            embedding_dim: Number of word embeddings
            hidden_dim: Dimension of LSTM hidden layer
            num_layers: Number of LSTM layers
            dropout: Dropout probability
        """

        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        """
        embedding layer = vocab_size * embedding_dim
        * Takes word indices as input (integers from 0 to `vocab_size` - 1)
        * Converts words to vector of size `embedding_dim` to be learned during training
        """
        
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=(dropout if num_layers > 1 else 0)
        )
        """
        LSTM layer
        * Input: sequence of embedding vectors
        * Internal structure w/ three gates:
          1. Forget gate: which info should be discarded from cell state
          2. Input gate: which new information to store
          3. Output gate: which cell state parts to output
        """

        self.dropout = nn.Dropout(dropout) # NOTE TO SELF: aardvark abandon 2-20 in word2vec -- figure out its terminology for easy remembrance and comparison

        # output fully-connected layer
        self.fc = nn.Linear(hidden_dim, vocab_size)
        """
        Final hidden state from LSTM
        * Projects to vocab_size dimensions, representing score for each word
        """

        # initialize weights
        self.init_weights()
    
    def init_weights(self):
        """ Initialize model weights for better training """
        for name, param in self.named_parameters(): # iterator over parameters (weights & biases)
            if "weight" in name:
                nn.init.xavier_uniform_(param)
            elif "bias" in name:
                nn.init.zeros_(param)
    
    def init_hidden(self, batch_size, device):
        """ Initialize hidden and cell states """
        return (torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device),
                torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device))
    
    def forward(self, x: torch.Tensor, hidden: tuple = None):
        """
        Forward propagation

        Args:
            x: Input tensor of shape (batch_size, sequence_length)
            hidden: Initial hidden state and cell state
        
        Returns:
            output (torch.Tensor): Output tensor of shape (batch_size, vocab_size)
            hidden: Final hidden state and cell state
        """
        batch_size = x.size(0)

        if hidden is None:
            hidden = self.init_hidden(batch_size, x.device)

        # embed input: (batch_size, sequence_length) -> (batch_size, sequence_length, embedding)dim
        embedded = self.embedding(x)

        # forward pass
        lstm_out, hidden = self.lstm(embedded, hidden)

        # final output
        output = lstm_out[:, -1, :] # take last timestep output
        output = self.dropout(output)
        output = self.fc(output)

        return output, hidden

In [11]:
l = LSTM(5, 10, 20, 2, 0.2)
u = l.init_hidden(50, "cpu")
v = l.init_weights()

u[0].size()
print(list(l.named_parameters()))

[('embedding.weight', Parameter containing:
tensor([[-0.1854,  0.4026, -0.2340, -0.5288,  0.2336, -0.2268,  0.0333, -0.3519,
          0.3073, -0.2234],
        [-0.0628, -0.2259, -0.6137,  0.2654, -0.5434, -0.3132, -0.3817, -0.6215,
          0.2641,  0.1673],
        [ 0.5487,  0.3111, -0.6215, -0.1991,  0.1008,  0.0120,  0.0569,  0.2299,
          0.1011,  0.0444],
        [ 0.2415,  0.0025,  0.2758, -0.2346,  0.3705,  0.2363, -0.3213, -0.3495,
          0.1680,  0.6126],
        [ 0.1619, -0.4820,  0.1393,  0.1305, -0.1933, -0.6121,  0.2245, -0.1609,
         -0.4278,  0.3423]], requires_grad=True)), ('lstm.weight_ih_l0', Parameter containing:
tensor([[ 5.4835e-02,  5.2781e-02,  2.3842e-01, -2.3993e-01, -7.1917e-02,
          1.6688e-01,  1.7426e-02, -1.1794e-03, -9.6998e-02, -2.5311e-01],
        [ 2.0812e-01,  1.2616e-01,  3.0241e-02, -9.9474e-02,  2.3173e-01,
         -2.1247e-01, -2.6304e-02, -3.5790e-02, -1.4537e-01,  1.9415e-01],
        [-1.1740e-01,  1.2091e-01,  1.0645e-01