In [74]:
import pandas as pd
import numpy as np

import spacy
nlp = spacy.load('en_core_web_sm')

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import utils

from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam


In [3]:
df = pd.read_csv('WhatsApp_chat.txt') #Replace with your path to chat file

## Data Pre-processing

In [None]:
lines = list(df['ColumnName']) # Replace 'ColumnName' with name of column having all the messages

In [47]:
# Removing media files and messages of other people 

cleaned_lines = [line for line in lines if 'YourName' in line and 'Media' not in line] # Replace 'YourName' with your name

In [48]:
# Keeping only relevant words from the data

lines_with_tokens = []
for line in cleaned_lines:
    doc= nlp(line)
    tokens=[(token.text) for token in doc if (token.text).isalpha() and (token.text) != 'pm' and (token.text) != 'am' and (token.text) != 'YourName'] # Replace 'YourName' with your name
    lines_with_tokens.append(tokens)

In [49]:
# Removing empty lines and lines with a single word

only_nonempty_lines = [line for line in lines_with_tokens if len(line) >= 2]

In [52]:
# Joining words to form sentences again

final = [" ".join(line) for line in only_nonempty_lines]

In [58]:
# Tokenizing the words in our data

tokenizer = Tokenizer()
tokenizer.fit_on_texts(final)
total_words = len(tokenizer.word_index) + 1
print('Total words: ', total_words)

Total words:  6921


In [60]:
tokenizer.texts_to_sequences([['a','man','a','kidding',]])[0]

[73, 686, 73, 5907]

In [None]:
# Convert data to sequence of tokens

input_sequences = []
for line in final:
    token_list = tokenizer.texts_to_sequences([line])[0]
    
    # Create a series of sequences for each sentence
    for i in range(1, len(token_list)):
        partial_sequence = token_list[:i+1]
        input_sequences.append(partial_sequence)

In [62]:
input_sequences

[[434, 435],
 [434, 435, 35],
 [185, 16],
 [185, 16, 42],
 [88, 130],
 [22, 555],
 [22, 555, 5],
 [22, 555, 5, 96],
 [100, 201],
 [100, 201, 343],
 [4, 595],
 [4, 595, 66],
 [1, 735],
 [1, 735, 556],
 [87, 99],
 [87, 99, 114],
 [87, 99, 114, 1651],
 [87, 99, 114, 1651, 9],
 [87, 99, 114, 1651, 9, 1652],
 [87, 99, 114, 1651, 9, 1652, 31],
 [8, 30],
 [8, 30, 90],
 [8, 30, 90, 2154],
 [8, 30, 90, 2154, 23],
 [8, 30, 90, 2154, 23, 3],
 [8, 30, 90, 2154, 23, 3, 113],
 [2154, 1006],
 [1, 735],
 [87, 248],
 [87, 248, 1652],
 [99, 43],
 [7, 40],
 [102, 63],
 [102, 63, 520],
 [74, 63],
 [74, 63, 3253],
 [3254, 40],
 [596, 1653],
 [596, 1653, 79],
 [596, 1653, 79, 114],
 [596, 1653, 79, 114, 2155],
 [596, 1653, 79, 114, 2155, 10],
 [636, 381],
 [636, 381, 5],
 [27, 3255],
 [27, 3255, 6],
 [27, 3255, 6, 7],
 [37, 4],
 [37, 4, 148],
 [37, 78],
 [37, 78, 1007],
 [32, 382],
 [32, 382, 202],
 [1150, 78],
 [22, 78],
 [4, 112],
 [4, 112, 1654],
 [112, 71],
 [112, 71, 79],
 [112, 71, 79, 69],
 [112, 71,

In [63]:
# Capping length of sequence to be 50

i = 0
while i < len(input_sequences):
  if len(input_sequences[i]) > 50:
    x = input_sequences[i][0:50]
    y = input_sequences[i][50:len(input_sequences[i])]
    del input_sequences[i]
    input_sequences.append(x)
    input_sequences.append(y)
  else:
    i +=1

In [67]:
# Determine max sequence length

max_sequence_len = max([len(x) for x in input_sequences])

# Pad all sequences with zeros at the beginning to make them all max length

input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

In [68]:
# Predictors are every word except the last

predictors = input_sequences[:,:-1]

# Labels are the last word

labels = input_sequences[:,-1]

In [70]:
labels = utils.to_categorical(labels, num_classes=total_words)

## Creating model and training it

In [76]:
# Input is max sequence length - 1, as we've removed the last word for the label

input_len = max_sequence_len - 1 

model = Sequential()

model.add(Embedding(total_words, 73, input_length=input_len))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(200))
model.add(Dropout(0.1))

model.add(Dense(total_words, activation='softmax'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 49, 73)            505233    
                                                                 
 lstm_2 (LSTM)               (None, 49, 100)           69600     
                                                                 
 lstm_3 (LSTM)               (None, 200)               240800    
                                                                 
 dropout_1 (Dropout)         (None, 200)               0         
                                                                 
 dense_1 (Dense)             (None, 6921)              1391121   
                                                                 
Total params: 2,206,754
Trainable params: 2,206,754
Non-trainable params: 0
_________________________________________________________________


In [75]:
opt = Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(predictors, labels, batch_size=64, epochs = 25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f0e39f168d0>

In [None]:
model.save("prediction_model.h5")

## Making Predictions

In [None]:
from keras.models import load_model

model = load_model("prediction_model.h5")

input = 'text' # Replace 'text' with some sentence that you might type

num_words = 3 # Number of predictons to make ahead

for _ in range(num_words):
  token_list = tokenizer.texts_to_sequences([input])[0]
  input_sequence = np.array(pad_sequences([token_list], maxlen=max_sequence_len- 1, padding='pre'))
  prediction_value = model.predict(input_sequence)
  prediction_text = tokenizer.sequences_to_texts([[np.argmax(prediction_value)]])[0]
  print (prediction_text)
  input += prediction_text