<a href="https://colab.research.google.com/github/BaileyDalton007/Epidemic-Simulator/blob/main/discord_rnn_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
import numpy as np
import pandas as pd
import string
import csv
from os import linesep

import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

from gensim.models.word2vec import Word2Vec as w2v

**Uploading files**

In [None]:
# Mount Google Drive at /content/drive so the data files referenced by the
# cells below are reachable. The google.colab module is only available when
# the notebook runs inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [3]:
# Change to whatever path you have your files in.
# Keep the trailing slash: the other cells build file paths by appending
# sub-folder names directly to this string.
DATA_PATH = '/content/drive/MyDrive/discord_rnn_data/'

In [4]:
# Use training files from drive
import os

# os.listdir returns entries in arbitrary order and also lists
# sub-directories, so keep regular files only and sort them: every run then
# reads the same files in the same order.
file_names = sorted(
    name for name in os.listdir(DATA_PATH + 'training_data/')
    if os.path.isfile(DATA_PATH + 'training_data/' + name)
)

**Text Preprocessing**

In [5]:
# Maximum number of words kept per message (longer messages are truncated)
# and the width of the model's input rows.
# The best max length seems to be around 15-20, see readme for graph
MAX_MSG_LENGTH = 20

In [6]:
# Create master array for messages from all files
raw_messages = []

for file in file_names:
  file_path = DATA_PATH + 'training_data/' + file
  # newline='' is what the csv module expects: it lets the reader handle line
  # endings itself, including line breaks inside quoted fields
  with open(file_path, 'r', encoding='utf8', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')

    for row in csv_reader:
      # csv.reader yields an empty list for a blank line; indexing row[0] on
      # it would raise IndexError, so such rows are skipped
      if row:
        raw_messages.append(row[0].lower())

# Tokenize each message on whitespace, drop messages that contain no words,
# and cap every message at MAX_MSG_LENGTH words (shorter messages stay short;
# they are padded later when the training arrays are built)
msg_array = [
    tokens[:MAX_MSG_LENGTH]
    for tokens in (message.split() for message in raw_messages)
    if tokens
]

**Training / Loading Word2Vec Model**

In [7]:
# Size of each word2vec word vector; also part of the saved model's file name
VECTOR_SIZE = 100

In [8]:
# Load w2v model from drive.
# Reads the file written by the save cell below; if no model has been saved
# yet, run the training cell instead of this one.
word_model = w2v.load(f'{DATA_PATH}word_models/word2vec_{VECTOR_SIZE}.model')

In [None]:
# Training the w2v model on the tokenized messages.
# min_count=2 leaves words that occur only once out of the vocabulary, so a
# later vocabulary lookup of such a word fails.
# NOTE(review): size= and iter= are the gensim 3.x keyword names (gensim 4
# renamed them to vector_size= and epochs=) — confirm the installed version.
word_model = w2v(msg_array, size=VECTOR_SIZE, min_count=2, window=5, iter=100)

In [10]:
# Saving w2v model to drive, at the same path the load cell reads from
word_model.save(f'{DATA_PATH}word_models/word2vec_{VECTOR_SIZE}.model')

In [9]:
# Defining some variables and functions to interact with the w2v model
# The word vectors of the trained model; the array's two dimensions are
# unpacked below as (number of words, vector size)
pretrained_weights = word_model.wv.vectors
vocab_size, embedding_size = pretrained_weights.shape

def word2idx(word):
  """Return the integer index the word2vec vocabulary assigns to `word`.

  The lookup fails if `word` is not in the model's vocabulary.
  """
  vocab_entry = word_model.wv.vocab[word]
  return vocab_entry.index
def idx2word(idx):
  """Return the word stored at position `idx` in the word2vec model's word list."""
  words_by_index = word_model.wv.index2word
  return words_by_index[idx]

**Preparing Data for LSTM model** 

In [11]:
# Encode every message as a list of word indices. Word2Vec leaves words rarer
# than its min_count out of the vocabulary, so unknown words are skipped here
# rather than letting word2idx fail on them.
known_words = word_model.wv.vocab
encoded_msgs = [
    [word2idx(word) for word in msg if word in known_words]
    for msg in msg_array
]
# A message with no known words has no target word to predict, so drop it
encoded_msgs = [idxs for idxs in encoded_msgs if idxs]

# Creates empty numpy arrays (unused trailing columns of train_x stay 0)
train_x = np.zeros([len(encoded_msgs), MAX_MSG_LENGTH], dtype=np.int32)
train_y = np.zeros([len(encoded_msgs)], dtype=np.int32)
# Fills arrays, each row a message and each column a word
for i, idxs in enumerate(encoded_msgs):
  # Every word except the last is the input; the last word is the label
  train_x[i, :len(idxs) - 1] = idxs[:-1]
  train_y[i] = idxs[-1]

**Defining the model**

In [None]:
# Number of samples per training batch; also used to scale the checkpoint
# callback's save_freq
BATCH_SIZE = 128

In [None]:
# Next-word model: embedding -> LSTM -> softmax over the whole vocabulary
model = tf.keras.models.Sequential([
    # Embedding layer for w2v model, initialised with the word2vec vectors
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[pretrained_weights]),
    tf.keras.layers.LSTM(units=embedding_size),
    # One output per vocabulary word, turned into probabilities by the softmax
    tf.keras.layers.Dense(units=vocab_size),
    tf.keras.layers.Activation('softmax'),
])

# Labels are integer word indices, which is what the sparse loss expects
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [120]:
# Define functions to interact with the model
# From https://gist.github.com/maxim5/c35ef2238ae708ccb0e55624e9e0252b


# The model outputs a vector of probabilities for the next word (preds). This
# function picks the generated word from that vector using the temperature
# sampling described here:
# https://medium.com/machine-learning-at-petiteprogrammer/sampling-strategies-for-recurrent-neural-networks-9aea02a6616f

def sample(preds, temperature=1.0):
  """Choose an index from the probability vector `preds`.

  Args:
    preds: sequence of probabilities, one per candidate index.
    temperature: values <= 0 select the most likely index deterministically;
      positive values rescale the distribution (lower means less random)
      before one index is drawn from it at random.

  Returns:
    The chosen index into `preds`.
  """
  if temperature > 0:
    # float64 keeps the renormalised probabilities accurate enough for
    # np.random.multinomial
    scaled_logs = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_logs)
    draw = np.random.multinomial(1, weights / np.sum(weights), 1)
    return np.argmax(draw)

  return np.argmax(preds)

# Generates next num_generated words from the text
def generate_next(text, num_generated=10, return_probs=False):
  """Extend `text` with words predicted by the model, one word at a time.

  Args:
    text: prompt string; it is lower-cased and split on whitespace. Every
      word must be known to word2idx, otherwise that lookup fails.
    num_generated: number of words to append to the prompt.
    return_probs: if True, the most likely word is always chosen (so the
      output is deterministic) and its probability is recorded at each step.

  Returns:
    The prompt plus the generated words as one space-separated string, or a
    (string, list of probabilities) tuple when return_probs is True.
  """
  word_idxs = [word2idx(word) for word in text.lower().split()]
  word_probs = []

  for _ in range(num_generated):
    # The model input is MAX_MSG_LENGTH columns wide. Only the most recent
    # words are fed in, so a long prompt or a large num_generated can no
    # longer index past the end of the input row.
    context = word_idxs[-MAX_MSG_LENGTH:]
    x = np.zeros([1, MAX_MSG_LENGTH])
    x[0, :len(context)] = context

    # The batch holds a single row, so take that row's distribution
    next_word_probs = model.predict(x)[0]

    if return_probs:
      word_probs.append(next_word_probs.max())
      # Makes output deterministic if graphing
      idx = sample(next_word_probs, temperature=0)
    else:
      # Temperature is how variant the output will be
      idx = sample(next_word_probs, temperature=0.7)
    word_idxs.append(idx)

  generated_text = ' '.join(idx2word(idx) for idx in word_idxs)

  if return_probs:
    # Get word probabilities to return as a list
    return generated_text, word_probs

  return generated_text


# Displays a sample of the output after each epoch
def on_epoch_end(epoch, _):
  """Print a generated continuation for a few fixed prompts.

  Takes two positional arguments so it can be passed as the on_epoch_end
  hook of a Keras LambdaCallback; the second argument is ignored.
  """
  print(f'\nGenerating text after epoch: {epoch}')

  # Sample texts to print each epoch
  prompts = (
    'sir it is time',
    'where are',
    'i am a',
    'my favorite',
  )

  for prompt in prompts:
    print(generate_next(prompt))


In [None]:
# Callback function to save model
# {epoch:04d} is filled in by Keras, so each checkpoint lands in its own
# zero-padded cp-NNNN folder under model_saves/
checkpoint_path = DATA_PATH+'model_saves/cp-{epoch:04d}/model.ckpt'
# Only the weights are stored, so the model architecture must be rebuilt in
# code before a checkpoint can be loaded.
# NOTE(review): depending on the TensorFlow version, an integer save_freq is
# counted in batches or in samples — confirm that 50*BATCH_SIZE gives the
# intended save interval.
checkpoint_save = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    verbose=1, 
    save_weights_only=True,
    save_freq=50*BATCH_SIZE)

**Training the Model**

In [None]:
# Wraps on_epoch_end in a callback so Keras runs it at the end of every epoch
progress_output = keras.callbacks.LambdaCallback(on_epoch_end=on_epoch_end)

# Define what callbacks you want called: checkpoint_save, progress_output
# (only checkpoint_save is enabled here; add progress_output to the list to
# also print sample text after each epoch)
callbacks = [checkpoint_save]

In [None]:
# Train the model on the prepared arrays; the object returned by fit is kept
# in `history`
history = model.fit(train_x, train_y,
                    batch_size=BATCH_SIZE,
                    epochs=1000, 
                    callbacks=callbacks)



**Predicting Text**

In [None]:
# Upload model if needed: rebuild the same architecture as the training
# model, then restore previously saved weights into it
model = tf.keras.models.Sequential([
    # Embedding layer for w2v model
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[pretrained_weights]),
    tf.keras.layers.LSTM(units=embedding_size),
    tf.keras.layers.Dense(units=vocab_size),
    tf.keras.layers.Activation('softmax'),
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Change to path of model that should be loaded
model.load_weights(DATA_PATH+'model_saves/simple_model_2/model.ckpt')

In [None]:
# Generate a continuation of the prompt; words are sampled at random, so the
# result varies between runs
text = generate_next("william is")

# Bare expression so the notebook displays the generated text
text

In [144]:
# To graph confidence: return_probs=True makes generation deterministic and
# also returns the top probability of each generated step
text, probs = generate_next("william is", return_probs=True)

In [None]:
# Plot the model's confidence at every generated step and label each point
# with the word produced at that step
x = range(len(probs))
y = probs

# One probability was recorded per generated word, so the last len(probs)
# words of the text are the generated ones
text_arr = text.split()[-len(probs):]
for lx, ly, word in zip(x, y, text_arr):
  plt.annotate(word, (lx, ly))

plt.xticks(x)
plt.plot(x, y)
#plt.show()