<a href="https://colab.research.google.com/github/BaileyDalton007/discord_rnn/blob/main/discord_dnn_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

from gensim.models.word2vec import Word2Vec as w2v
from google.colab import files

In [None]:
# If needed upload w2v model
model_save_file = files.upload()

In [None]:
# Upload training csv's
uploaded = files.upload()
file_names = list(uploaded.keys())

In [130]:
# Load w2v model
wv_model = w2v.load('word2vec.model')

**Data Pre-processing**

In [176]:
# The uniform word count for each message
# Shorter ones will be 0-padded and longer will be truncated
WORD_COUNT = 20

# Dimensionality of word vectors
VECTOR_DIM = 100

In [177]:
# Load all csv's into one master dataframe
frames_to_combine = []

for file in file_names:
  data = pd.read_csv(file, delimiter=',')
  data.reset_index(drop=True)

  frames_to_combine.append(data)

# Combines all csv files into one dataframe to be processed
master_df = pd.concat(frames_to_combine)
master_df = master_df.reset_index(drop=True)

# Tokenize training data
d_set_arr = [master_df['Tmsg0'], master_df['Tmsg1'], master_df['Tmsg2'], master_df['Umsg']]

tokenized_d_set = []

for d_set in d_set_arr:
  tmp = []
  for sub in d_set:

    # Some float values were getting through, this makes everything a string
    if type(sub) != 'str':
      sub = str(sub)
    
    # Max of split words is out max word count
    tmp.append(sub.lower().split(maxsplit=WORD_COUNT))

  tokenized_d_set.append(tmp)

Tmsg0 = tokenized_d_set[0]
Tmsg1 = tokenized_d_set[1]
Tmsg2 = tokenized_d_set[2]
Umsg = tokenized_d_set[3]

**Now for word embedding**

In [178]:
def get_word_vector(word):
  # Checks if word is in model's vocabulary
  if word in wv_model.wv.vocab:
    return wv_model.wv.__getitem__(word)
  else:
    return np.zeros(VECTOR_DIM)


In [179]:
word_vector_sets = []

for d_set in tokenized_d_set:
  output_d_set = []

  for sequence in d_set:
    seq_len = len(sequence)
    output_seq = []

    for word_num in range(WORD_COUNT):
      # Zero padding for messages shorter than WORD_COUNT
      if word_num < seq_len:
        word = sequence[word_num]

        word_vec = get_word_vector(word)
      else:
        word_vec = np.zeros(VECTOR_DIM)
      
      output_seq.append(word_vec)

    output_d_set.append(output_seq)

  word_vector_sets.append(output_d_set)

In [297]:
def func_word2vec(seq):
  seq_len = len(seq)
  output_seq = []

  for word_num in range(WORD_COUNT):
    # Zero padding for messages shorter than WORD_COUNT
    if word_num < seq_len:
      word = seq[word_num]

      word_vec = get_word_vector(word)
    else:
      word_vec = np.zeros(VECTOR_DIM)
    
    output_seq.append(word_vec)

  return output_seq


In [180]:
# Function for converting a sequence of word vectors back to text

def vec2word(seq):
  output_seq = []

  for vec in seq:
    word = wv_model.wv.most_similar(positive=[vec], topn=1)

    # If similarity score is zero, give a blank word
    if word[0][1] == 0.0:
      word = [['']]

    output_seq.append(word[0][0])

  return output_seq

**Splitting Data**

In [None]:
# Amount of items that will be fed to the model for training
TRAINING_PERCENTAGE = 0.80

training_amount = int(TRAINING_PERCENTAGE * len(word_vector_sets[0])) 

training_x = []
for arr in word_vector_sets[:3]:
  training_x.append(arr[:training_amount])
training_x = np.array(training_x)
training_x = np.moveaxis(training_x, 0, 1)

training_y = np.array(word_vector_sets[3][:training_amount])

test_x = []
for arr in word_vector_sets[:3]:
  test_x.append(arr[training_amount:])
test_x = np.array(test_x)
test_x = np.moveaxis(test_x, 0, 1)

test_y = np.array(word_vector_sets[3][training_amount:])

**Defining our model**

In [None]:
### Not expecting this model to perform well at all, but curious what kinds
### Of outputs it will give

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(512, activation='relu', input_shape=(3, WORD_COUNT, VECTOR_DIM)))
model.add(tf.keras.layers.Dense(1024, activation='relu'))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(2000, activation='relu'))

model.add(tf.keras.layers.Reshape((WORD_COUNT, VECTOR_DIM)))

In [311]:
model.summary()

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_29 (Dense)            (None, 3, 20, 512)        51712     
                                                                 
 dense_30 (Dense)            (None, 3, 20, 100)        51300     
                                                                 
 flatten_4 (Flatten)         (None, 6000)              0         
                                                                 
 dense_31 (Dense)            (None, 2000)              12002000  
                                                                 
 reshape_1 (Reshape)         (None, 20, 100)           0         
                                                                 
Total params: 12,105,012
Trainable params: 12,105,012
Non-trainable params: 0
_________________________________________________________________


In [250]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

**Train the model**

In [251]:
model.fit(training_x, training_y, epochs=10)  # we pass the data, labels and epochs and watch the magic!

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd0996c1b90>

In [252]:
test_loss, test_acc = model.evaluate(test_x,  test_y, verbose=1) 

print('Test accuracy:', test_acc)

Test accuracy: 0.004945055115967989


**Testing the model**

In [350]:
input_msgs = [['okay', 'so', 'first', 'iteration', 'of', 'neural', 'network', 'is', 'just', 'mid', 'stroke'],
              ['woah', 'thats', 'really', 'sick'],
              ['huh', 'it', 'came', 'out']]

In [351]:
input_vecs = []
for msg in input_msgs:
  input_vecs.append(func_word2vec(msg))

In [None]:
# Turn back into strings to make sure vectors were converted correctly
for input_msg in input_vecs:
  print(vec2word(input_msg))

In [353]:
predictions = model.predict(np.array([input_vecs]))

In [None]:
print(vec2word(predictions[0]))