# NW AAC
Next-word prediction for an AAC (augmentative and alternative communication) board, using Gensim's Word2Vec.

Uses the `contractions` package to expand contractions in the data (e.g. "i'm" becomes "i am").

In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from gensim.models import Word2Vec

import re
import contractions
from collections import Counter

In [2]:
# load the conversation pairs, then pull the two text columns out as plain lists
df = pd.read_csv("Conversation.csv")

qlist, alist = (df[column].tolist() for column in ("question", "answer"))

# sanity check: first few questions, and both lists should be the same length
print(qlist[:5])
print(len(qlist), len(alist))

df.head()

['hi, how are you doing?', "i'm fine. how about yourself?", "i'm pretty good. thanks for asking.", 'no problem. so how have you been?', "i've been great. what about you?"]
3725 3725


Unnamed: 0,num,question,answer
0,0,"hi, how are you doing?",i'm fine. how about yourself?
1,1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,3,no problem. so how have you been?,i've been great. what about you?
4,4,i've been great. what about you?,i've been good. i'm in school right now.


In [3]:
# expand contractions in every question and answer
qlist, alist = (
    [contractions.fix(line) for line in lines]
    for lines in (qlist, alist)
)

print(len(qlist), len(alist))

# spot-check a window of each list; possessives such as "mom's" are not expanded
print(qlist[765:772], alist[765:772], sep="\n")

# second window, for lines that originally contained contractions
print(qlist[785:792], alist[785:792], sep="\n")

3725 3725
['poodles bark a lot.', 'they sure do.', 'they bark at everything.', 'they never shut up.', 'why did you get a poodle?', "it is my mom's dog.", 'so she likes poodles.']
['they sure do.', 'they bark at everything.', 'they never shut up.', 'why did you get a poodle?', "it is my mom's dog.", 'so she likes poodles.', 'she says they are good watchdogs.']
['we have not been in a while.', 'we have not been in a month.', 'the last time we went, you almost drowned.', 'no, i did not.', 'then why did the lifeguard dive into the water?', 'i think he wanted to cool off.', 'he swam right up to you.']
['we have not been in a month.', 'the last time we went, you almost drowned.', 'no, i did not.', 'then why did the lifeguard dive into the water?', 'i think he wanted to cool off.', 'he swam right up to you.', 'and then he turned right around.']


In [4]:
# keep only letters, whitespace and apostrophes (drops punctuation, digits, symbols)
qlist, alist = (
    [re.sub(r"[^a-zA-Z\s']", "", line) for line in lines]
    for lines in (qlist, alist)
)

print(len(qlist), len(alist))

# same two spot-check windows as in the contraction step
print(qlist[765:772], alist[765:772], sep="\n")
print(qlist[785:792], alist[785:792], sep="\n")

3725 3725
['poodles bark a lot', 'they sure do', 'they bark at everything', 'they never shut up', 'why did you get a poodle', "it is my mom's dog", 'so she likes poodles']
['they sure do', 'they bark at everything', 'they never shut up', 'why did you get a poodle', "it is my mom's dog", 'so she likes poodles', 'she says they are good watchdogs']
['we have not been in a while', 'we have not been in a month', 'the last time we went you almost drowned', 'no i did not', 'then why did the lifeguard dive into the water', 'i think he wanted to cool off', 'he swam right up to you']
['we have not been in a month', 'the last time we went you almost drowned', 'no i did not', 'then why did the lifeguard dive into the water', 'i think he wanted to cool off', 'he swam right up to you', 'and then he turned right around']


In [5]:
# tokenise on whitespace: one word list per line, questions first and answers after
# NOTE(review): in the preview rows each answer reappears as the next question, so most
# lines are probably present twice here -- confirm whether de-duplication is wanted
sentences = [line.split() for line in qlist + alist]

print(len(sentences))
sentences[:5]

7450


[['hi', 'how', 'are', 'you', 'doing'],
 ['i', 'am', 'fine', 'how', 'about', 'yourself'],
 ['i', 'am', 'pretty', 'good', 'thanks', 'for', 'asking'],
 ['no', 'problem', 'so', 'how', 'have', 'you', 'been'],
 ['i', 'have', 'been', 'great', 'what', 'about', 'you']]

In [6]:
# train a Word2Vec model on the tokenised sentences and persist it to disk
model = Word2Vec(
    sentences,
    vector_size=300,
    window=5,
    min_count=5,
    epochs=10,
    workers=4,
)
model.save("conv.model")

# the embedding matrix has one row per retained vocabulary word
pretrained_weights = model.wv.vectors
vocab_size, embedding_size = pretrained_weights.shape
print(vocab_size, embedding_size)

1003 300


In [11]:
# average context's embed vectors and find most similar vector for next word
class WordPredictor:
    """
    Next-word predictor trained on tokenised sentences.

    Back ends (chosen by `method`):
        "word2vec": averages the context's embedding vectors and returns the
            entries most similar to that average
        "lstm": unfinished -- only the data scaffolding exists in this class
        anything else (e.g. "ngram"): N-gram counts with backoff to shorter contexts

    Methods:
        build_vocabulary: creates a word-to-index mapping from training sentences
        build_ngram_models: constructs multiple N-gram models with different context windows
        predict_next_word: predicts next word given current context
        prepare_lstm_data: sets up index mappings and empty buffers for LSTM training
    """

    def __init__(self, sentences: list[list[str]], method: str = "word2vec", max_ngram: int = 5):
        """
        Initializes the predictor with training sentences

        Args:
            sentences: List of tokenised sentences (each one a list of words)
            method: Prediction model specification ("word2vec", "ngram", or "lstm");
                any value other than "word2vec" / "lstm" selects the N-gram model
            max_ngram: Maximum n-gram lookback size (N-gram method only)
        """
        self.method = method
        self.sentences = sentences
        self.vocab = self.build_vocabulary()

        if method == "word2vec":
            self.model = Word2Vec(
                sentences, vector_size=100, window=5, min_count=1, workers=4
            )

        elif method == "lstm":
            self.prepare_lstm_data()
            # NOTE(review): build_lstm_model is not defined in this class as written,
            # so this call raises AttributeError until it is implemented.
            self.build_lstm_model()

        else:
            self.max_ngram = max_ngram
            self.build_ngram_models()

    def build_vocabulary(self) -> dict[str, int]:
        """
        Build vocabulary from this instance's sentences

        Returns:
            Mapping of each distinct word to an index, numbered in order of
            first appearance so the mapping is identical on every run
        """
        vocab: dict[str, int] = {}

        # read self.sentences (not a module-level variable) so the class works
        # regardless of what the surrounding notebook happens to have defined
        for sentence in self.sentences:
            for word in sentence:
                vocab.setdefault(word, len(vocab))

        return vocab

    def build_ngram_models(self) -> None:
        """
        Build multiple N-gram models from the training sentences
        Implements backoff in case phrase not found in training data

        Populates `self.ngram_models`, keyed by N (from `max_ngram` down to 2);
        each value maps a prefix tuple of N-1 words to a Counter of next words
        """
        self.ngram_models = {}

        for n in range(self.max_ngram, 1, -1):
            counts: dict[tuple, Counter] = {}

            for sentence in self.sentences:
                # slide a window of length `n`; sentences shorter than `n` yield nothing
                for i in range(len(sentence) - n + 1):
                    prefix = tuple(sentence[i:i + n - 1])
                    next_word = sentence[i + n - 1]
                    counts.setdefault(prefix, Counter())[next_word] += 1

            self.ngram_models[n] = counts  # access each model using its context size

    def predict_next_word(self, context: list[str], top_n: int = 5) -> list[tuple[str, float]]:
        """
        Predict the next word given a context

        Args:
            context: List of words
            top_n: # of predictions to return

        Returns:
            List of (word, score) tuples; empty when nothing can be predicted
            (no known context word for word2vec, or no training data for N-gram)
        """

        if self.method == "word2vec":
            # skip words the embedding model has never seen instead of raising KeyError
            context_vectors = [
                self.model.wv[word] for word in context if word in self.model.wv
            ]
            if not context_vectors:
                return []  # nothing to average (empty or fully unknown context)

            avg_vector = np.mean(context_vectors, axis=0)
            return self.model.wv.similar_by_vector(avg_vector, topn=top_n)

        if self.method == "lstm":
            # NOTE(review): predict_lstm is not defined in this class as written.
            return self.predict_lstm(context, top_n)

        # method = N-gram w/ backoff: use the largest N whose prefix was seen in training
        for n in range(self.max_ngram, 1, -1):
            if len(context) < n - 1:
                continue  # context too short for this order

            prefix = tuple(context[-(n - 1):])
            followers = self.ngram_models[n].get(prefix)
            if followers is not None:
                total = sum(followers.values())
                return [(word, count / total) for word, count in followers.most_common(top_n)]

        # worst case: rank words by how often they follow anything at all
        # (sum of every bigram continuation count)
        if 2 in self.ngram_models:
            all_words = Counter()
            for ngram_dict in self.ngram_models[2].values():
                all_words.update(ngram_dict)
            total = sum(all_words.values())
            return [(word, count / total) for word, count in all_words.most_common(top_n)]

        return []  # NULLGRAM (no bigram model was built)

    def prepare_lstm_data(self) -> None:
        """
        Prepare data for LSTM training

        Sets the word/index lookup tables, the vocabulary size, the context
        window size, and empty `X` / `y` containers
        """

        self.word_to_idx = {word: idx for idx, word in enumerate(self.vocab)}
        self.idx_to_word = {idx: word for word, idx in self.word_to_idx.items()}
        self.vocab_size = len(self.vocab)

        self.sequence_length = 5  # context window size
        # NOTE(review): X and y are only initialised here; no training sequences
        # are generated by this method as written.
        self.X = []
        self.y = []

In [13]:
# build the predictor in N-gram (backoff) mode over the tokenised sentences
wp = WordPredictor(sentences=sentences, method="ngram")


In [19]:
# ranked candidates for a fixed six-word context
response = wp.predict_next_word(["i", "feel", "like", "i", "want", "to"])
print(response)

# greedy generation: print the running phrase, then append the top-ranked prediction
wl = ["i", "feel"]

for step in range(40):
    print(" ".join(wl))
    top_word = wp.predict_next_word(wl)[0][0]
    wl.append(top_word)

[('go', 0.22727272727272727), ('get', 0.13636363636363635), ('play', 0.09090909090909091), ('be', 0.06818181818181818), ('do', 0.045454545454545456)]
i feel
i feel sorry
i feel sorry for
i feel sorry for you
i feel sorry for you and
i feel sorry for you and you
i feel sorry for you and you will
i feel sorry for you and you will see
i feel sorry for you and you will see the
i feel sorry for you and you will see the collision
i feel sorry for you and you will see the collision if
i feel sorry for you and you will see the collision if they
i feel sorry for you and you will see the collision if they crash
i feel sorry for you and you will see the collision if they crash into
i feel sorry for you and you will see the collision if they crash into the
i feel sorry for you and you will see the collision if they crash into the house
i feel sorry for you and you will see the collision if they crash into the house at
i feel sorry for you and you will see the collision if they crash into the house