<a href="https://colab.research.google.com/github/BaileyDalton007/discord_rnn/blob/main/discord_rnn_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

from gensim.models.word2vec import Word2Vec as w2v
from google.colab import files

In [None]:
# Upload the saved word2vec model file, if it is not already on the runtime.
# The upload result is kept in `model_save_file`.
model_save_file = files.upload()

# Upload the training CSVs. The keys of `uploaded` are used later as the
# list of CSV file names to read.
uploaded = files.upload()

In [5]:
# Names of the uploaded training CSVs (the keys of the upload result).
file_names = list(uploaded.keys())

# Load w2v model.
# Prefer a '.model' file uploaded in this session: the previously hard-coded
# name carries a " (2)" suffix (it looks like the rename given to a repeated
# upload), so it does not exist after a fresh upload of 'word2vec.model'.
# When no model file was uploaded, fall back to the previous hard-coded name.
uploaded_model_names = [name for name in model_save_file if name.endswith('.model')]
W2V_MODEL_PATH = uploaded_model_names[0] if uploaded_model_names else 'word2vec (2).model'
wv_model = w2v.load(W2V_MODEL_PATH)

In [6]:
# Dimensionality of word vectors.
# Read from the loaded model instead of being hard-coded (previously 20), so
# the zero vectors returned for unknown words and the network's input/output
# sizes always match the embeddings that were actually loaded.
VECTOR_DIM = wv_model.wv.vector_size

In [7]:
# Load all csv's into one master dataframe
frames_to_combine = [pd.read_csv(file_name, delimiter=',') for file_name in file_names]

# Combines all csv files into one dataframe to be processed.
# ignore_index=True renumbers the rows 0..n-1 across files (the per-file
# reset_index call used before discarded its result and had no effect).
master_df = pd.concat(frames_to_combine, ignore_index=True)

# Empty cells are read by pandas as NaN (a float) and purely numeric messages
# as numbers; neither has .lower(), so drop the former and coerce to str.
messages = master_df['Message'].dropna().astype(str)

# Tokenize messages: lowercase, then split on whitespace
tokenized_msgs = [message.lower().split() for message in messages]

# One continuous stream of words, in message order. No terminating symbol is
# inserted between messages.
data = [word for tokens in tokenized_msgs for word in tokens]

In [8]:
def get_word_vector(word):
  """Return the word2vec embedding for `word`, or zeros if it is unknown.

  Args:
    word: token to look up in the global `wv_model`.

  Returns:
    The model's vector for `word` when it is in the vocabulary, otherwise
    `np.zeros(VECTOR_DIM)`.
  """
  # Membership is tested on the KeyedVectors object itself: the `wv.vocab`
  # attribute was removed in gensim 4.0, while `word in wv` is supported by
  # both gensim 3.x and 4.x.
  if word in wv_model.wv:
    return wv_model.wv[word]
  return np.zeros(VECTOR_DIM)

In [9]:
def tokenize_string(string):
  """Split every element of `string` on whitespace.

  `string` is iterated element by element: a list of sentences yields one
  token list per sentence, while a plain str is iterated per character.

  Returns:
    A list containing the `.split()` result of each element, in order.
  """
  token_lists = []
  for piece in string:
    token_lists.append(piece.split())
  return token_lists

In [10]:
def func_word2vec(seq):
  """Map each word in `seq` to its vector via `get_word_vector`.

  Returns:
    A list with one vector per input word, in the same order.
  """
  return [get_word_vector(token) for token in seq]

In [11]:
def vec2word(seq):
  """Convert a sequence of word vectors back to text.

  For each vector the single most similar vocabulary entry of `wv_model` is
  looked up. A match whose similarity score is exactly 0.0 is replaced by
  an empty string.

  Returns:
    A list of words, one per input vector.
  """
  def _nearest_word(vec):
    # most_similar yields (word, similarity) entries; only the top one is requested
    best_match = wv_model.wv.most_similar(positive=[vec], topn=1)[0]
    return '' if best_match[1] == 0.0 else best_match[0]

  return [_nearest_word(vec) for vec in seq]

In [44]:
# Number of consecutive words fed to the model to predict the next word
seq_length = 5
input_length = len(data)

x = []
y = []

# Slide a window over the word stream: every run of `seq_length` words is one
# input sample and the word immediately after the window is its target.
for i in range(input_length - seq_length):
    in_seq = data[i:i + seq_length]
    out_word = data[i + seq_length]

    # Vectorize words before adding to data
    x.append(func_word2vec(in_seq))
    # One vector per target (not a length-1 sequence), so that y lines up with
    # the model's Dense(VECTOR_DIM) output.
    y.append(get_word_vector(out_word))

# model.fit documents NumPy arrays as input; a plain Python list is read as
# one entry per model input. The reshape also gives well-formed (0, ...)
# arrays when the stream is too short to produce any window.
x = np.asarray(x, dtype=np.float32).reshape(-1, seq_length, VECTOR_DIM)
y = np.asarray(y, dtype=np.float32).reshape(-1, VECTOR_DIM)

In [46]:
# Define the model: three stacked LSTM layers (256 -> 256 -> 128 units), each
# followed by 20% dropout, then a softmax Dense layer of size VECTOR_DIM.
# The first two LSTMs return full sequences so the next LSTM receives one
# vector per timestep; the last one returns only its final state.
# NOTE(review): the next cell rebinds `model` to a single-LSTM network, so on
# a top-to-bottom run that one is what gets compiled and trained - confirm
# which architecture is intended.
model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(256, input_shape=(seq_length, VECTOR_DIM), return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(256, return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(128),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(VECTOR_DIM, activation='softmax'),
])

In [40]:
# Define the model: a single 128-unit LSTM followed by a softmax Dense layer
# of size VECTOR_DIM.
# NOTE(review): this rebinds `model`, replacing the stacked network defined
# in the previous cell - confirm which architecture is intended.
model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(128, input_shape=(seq_length, VECTOR_DIM)),
    tf.keras.layers.Dense(VECTOR_DIM, activation="softmax"),
])

In [47]:
# Request accuracy in addition to the loss: the evaluation cell unpacks
# `test_loss, test_acc` from model.evaluate, which returns only a scalar loss
# when the model is compiled without metrics.
# NOTE(review): the targets in `y` come from get_word_vector (word2vec
# vectors), not one-hot labels; categorical cross-entropy over a softmax
# output may not be the intended objective - confirm.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
# Save the model's weights (not the full model) after every epoch.
# NOTE(review): the path ends with '/', i.e. it names a directory rather than
# a file prefix; newer Keras releases may require a specific file suffix for
# weight-only checkpoints - confirm against the installed version.
checkpoint_path = './model_saves/'
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 # `period` is deprecated in favour of `save_freq`;
                                                 # 'epoch' saves after every epoch, as period=1 did.
                                                 save_freq='epoch',
                                                 verbose=1)

In [50]:
# Train for 2 epochs, holding out 10% of the samples for validation and
# running the checkpoint callback during training.
history = model.fit(
    x,
    y,
    epochs=2,
    validation_split=0.10,
    callbacks=[cp_callback],
    verbose=1,
)

KeyboardInterrupt: ignored

In [None]:
# Score the model on the same x / y arrays that were passed to fit, so the
# figure printed below is not measured on separate held-out data.
# The unpacking expects evaluate to return a (loss, accuracy) pair.
evaluation = model.evaluate(x, y, verbose=1)
test_loss, test_acc = evaluation

print('Test accuracy:', test_acc)