Import libraries

In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Input, GlobalMaxPooling1D
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pickle
import warnings

# Silence every warning category for the rest of the session. This keeps cell
# output compact, but it also hides deprecation notices from the libraries above.
warnings.filterwarnings('ignore')

Load the data

In [2]:
# Read the question/answer pairs and keep at most the first 1750 rows
data = pd.read_csv('updated.csv').iloc[:1750]
data

Unnamed: 0,Questions,Answers
0,What is your first name?,My full name is Emuejevoke Eshemitan.
1,Could you please spell your first name for me?,My full name is Emuejevoke Eshemitan.
2,Do you have any nicknames or alternate names y...,My full name is Emuejevoke Eshemitan.
3,"How do you pronounce your last name, Eshemitan?",My full name is Emuejevoke Eshemitan.
4,Do you have any middle names?,My full name is Emuejevoke Eshemitan.
...,...,...
1745,Is there something I can do to make things eas...,"I am good, thank you"
1746,Do you have any specific needs I can address?,"I am good, thank you"
1747,Can I assist you in any way to make things bet...,"I am good, thank you"
1748,Do you need any advice or guidance on a partic...,"I am good, thank you"


Shuffle the data

In [3]:
# Shuffle the DataFrame rows and renumber the index from 0.
# A fixed seed (the same value the train/test split uses) makes the row order,
# and therefore every downstream output, reproducible across kernel restarts.
data = data.sample(frac=1, ignore_index=True, random_state=42)
data

Unnamed: 0,Questions,Answers
0,In which year did your story begin?,I am 24 years old.
1,Do you have any favorite childhood memories wi...,I have three siblings: a younger brother (Omom...
2,What is the destination that you've always had...,I would love to visit Bali and Greece someday ...
3,Do you have any picturesque destinations on yo...,I would love to visit Bali and Greece someday ...
4,How do you find relief from stress or navigate...,"I relax, breathe, accept and work harder"
...,...,...
1745,Do you have any pet?,I don't have any pets.
1746,Are there any challenges or responsibilities t...,I am 24 years old.
1747,Are there any activities related to pet care ...,I don't have any pets.
1748,Are there any factor into your decision to not...,I don't have any pets.


Check for duplicated rows in the data

In [4]:
# Flag rows that exactly repeat an earlier row, then count the flags
duplicate_mask = data.duplicated()
duplicate_mask.sum()

10

Drop duplicated rows

In [5]:
# Keep the first occurrence of each repeated row, then confirm none are left
data = data.drop_duplicates(keep='first')
data.duplicated().sum()

0

Get number of classes or categories

In [6]:
# Every distinct answer string is one target class
answers = data['Answers']
no_cat = answers.nunique()
no_cat

35

Check if the data is balanced

In [7]:
# Count how many rows map to each answer so class balance can be checked by eye.
value_counts_table = pd.DataFrame(data['Answers'].value_counts())
print(value_counts_table)

                                                    count
Answers                                                  
I love dark, cloudy and rainy weather.                 50
I have three siblings: a younger brother (Omomi...     50
I am good, thank you                                   50
I am an easy-going and free-spirited person.           50
I possess a strong skill set in Python programm...     50
My Contact Information includes: Phone number: ...     50
I feel strongly about reducing carbon emissions...     50
My long-term personal goal outside of my career...     50
I enjoy all genres of music depending on my moo...     50
My birthday is february 26th                           50
"God Abeg" and "It is what it is" are Nigerian ...     50
I detest unclean surroundings, avoid oily food,...     50
I am passionate about using machine learning ap...     50
My ideal weekend involves coding and occasional...     50
I don't have any pets.                                 50
I am an introv

Label encode the target

In [8]:
# Learn the answer -> integer class id mapping and apply it in a single step
label_encoder = LabelEncoder()
encoded_answers = label_encoder.fit_transform(data['Answers'])

Save the label encoder

In [9]:
# Persist the fitted encoder so predicted class ids can be decoded later
with open('label_encoder.pickle', mode='wb') as handle:
    pickle.dump(
        label_encoder,
        handle,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [10]:
# Attach the integer class id of each answer as a new column
data = data.assign(answer_labels=encoded_answers)
data

Unnamed: 0,Questions,Answers,answer_labels
0,In which year did your story begin?,I am 24 years old.,2
1,Do you have any favorite childhood memories wi...,I have three siblings: a younger brother (Omom...,18
2,What is the destination that you've always had...,I would love to visit Bali and Greece someday ...,26
3,Do you have any picturesque destinations on yo...,I would love to visit Bali and Greece someday ...,26
4,How do you find relief from stress or navigate...,"I relax, breathe, accept and work harder",23
...,...,...,...
1745,Do you have any pet?,I don't have any pets.,13
1746,Are there any challenges or responsibilities t...,I am 24 years old.,2
1747,Are there any activities related to pet care ...,I don't have any pets.,13
1748,Are there any factor into your decision to not...,I don't have any pets.,13


One-Hot encode the target

In [11]:
# One row per sample, one column per class; preview the first ten rows
encoded_labels = to_categorical(encoded_answers, num_classes=no_cat)
preview_rows = encoded_labels[:10]
print(preview_rows)

[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0.

Tokenize data

In [12]:
# The questions are the model's input text
question_texts = data['Questions'].values

# Build a lower-cased vocabulary from the questions, then turn each question
# into a list of integer word ids
tokenizer = Tokenizer(lower=True)
tokenizer.fit_on_texts(question_texts)
sequences = tokenizer.texts_to_sequences(question_texts)

# Report the vocabulary size
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 2020 unique tokens.


Save trained tokenizer

In [13]:
# Persist the fitted tokenizer so new questions can be encoded the same way
with open('tokenizer.pickle', mode='wb') as handle:
    pickle.dump(
        tokenizer,
        handle,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

Check the max length of the sequences

In [14]:
# Longest tokenized question, measured in tokens
max_len = max(len(seq) for seq in sequences)
print(f"maximum length of sequence: {max_len}")

maximum length of sequence: 23


Now we pad the sequence to have the same length

In [15]:
# Pad every sequence to the longest question length computed above, instead of
# repeating that number as a literal that would go stale if the data changes.
# This is the data used to train the model.
X = pad_sequences(sequences, maxlen=max_len)

Split the data into training and testing data

In [16]:
# Hold out 20% of the samples; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    encoded_labels,
    test_size=0.20,
    random_state=42,
)

Build The Model

In [17]:
# Embedding dimensionality
emb_dim = 200

# The embedding table needs one row per word id plus one extra row,
# because tokenizer word ids start at 1 and id 0 is left for padding.
emb_size = len(tokenizer.word_index) + 1

# Input shape is the padded sequence length
i = Input(shape=(X.shape[1],))
x = Embedding(emb_size, emb_dim)(i)
# return_sequences=True keeps the per-timestep outputs so they can be max-pooled
x = LSTM(64, dropout=0.6, return_sequences=True)(x)
x = GlobalMaxPooling1D()(x)
# One softmax unit per answer class
x = Dense(no_cat, activation='softmax')(x)

model = Model(i, x)

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 23)]              0         
                                                                 
 embedding (Embedding)       (None, 23, 200)           404200    
                                                                 
 lstm (LSTM)                 (None, 23, 64)            67840     
                                                                 
 global_max_pooling1d (Glob  (None, 64)                0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 35)                2275      
                                                                 
Total params: 474315 (1.81 MB)
Trainable params: 474315 (1.81 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________

In [18]:
# Train for 100 epochs in batches of 128, scoring the held-out split each epoch
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=128,
    validation_data=(X_test, y_test),
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Save the model

In [19]:
# Write the trained network to an HDF5 file
model.save(filepath='model.h5')

## Test the model

Import libraries

In [20]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pickle
import random
import json

max_len = 23  # padded sequence length used to train the model; inference input must match it

Load the model

In [21]:
# Restore the trained network from the HDF5 file
model = load_model(filepath='model.h5')

Load the tokenizer

In [22]:
# Restore the pickled tokenizer so questions are encoded with the same vocabulary
with open('tokenizer.pickle', mode='rb') as t_handle:
    tokenizer = pickle.load(t_handle)

Load the label encoder

In [23]:
# Restore the pickled label encoder used to turn class ids back into answers
with open('label_encoder.pickle', mode='rb') as l_handle:
    label_encoder = pickle.load(l_handle)

Load the rephrase JSON file and convert it to a dictionary

In [24]:
# Read the rephrase table, which the prediction cell indexes by answer text.
# JSON is decoded explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open('dict_phrases.json', 'r', encoding='utf-8') as file:
    dict_rephrase = json.load(file)

Collect user input and process

Generate prediction and rephrase

In [25]:
# Ask for a question
user_input = input("Enter a question: ")

# Encode the question as word ids and pad it to the training length
user_sequence = tokenizer.texts_to_sequences([user_input])
user_X = pad_sequences(user_sequence, maxlen=max_len)

# Score every class for the question
prediction = model.predict(user_X)

# Take the highest-scoring class id and map it back to its answer text
best_class_ids = np.argmax(prediction, axis=1)
decoded_prediction = label_encoder.inverse_transform(best_class_ids)

# Show one randomly chosen rephrasing of the predicted answer
key = decoded_prediction[0]
rephrasings = dict_rephrase[key]
random.choice(rephrasings)

Enter a question:  how old are you




'I have just entered the age of 24 and look forward to the experiences it will bring'

Generate original prediction

In [26]:
# Ask for a question
user_input = input("Enter a question: ")

# Encode the question as word ids and pad it to the training length
user_sequence = tokenizer.texts_to_sequences([user_input])
user_X = pad_sequences(user_sequence, maxlen=max_len)

# Score every class for the question
prediction = model.predict(user_X)

# Take the highest-scoring class id and map it back to its answer text
best_class_ids = np.argmax(prediction, axis=1)
decoded_prediction = label_encoder.inverse_transform(best_class_ids)

# Echo the question followed by the predicted answer
print('Question:', user_input)
print('Prediction:', decoded_prediction[0])

Enter a question:  who are you


Question: who are you
Prediction: I am Nigerian, I was born and raised in Delta State.


Generate all predictions with their probability

In [27]:
# Prepare user input
user_input = input("Enter a question: ")

# Tokenize and pad the user input
user_sequence = tokenizer.texts_to_sequences([user_input])
user_X = pad_sequences(user_sequence, maxlen=max_len)

# Make prediction
prediction = model.predict(user_X)

# Probabilities for each class (single question, so take row 0)
class_probabilities = prediction[0]

# Rank the classes once, from most to least probable
sorted_indices = np.argsort(class_probabilities)[::-1]
sorted_probabilities = class_probabilities[sorted_indices]

# Decode all class ids in one call rather than once per loop iteration
sorted_labels = label_encoder.inverse_transform(sorted_indices)

# Print the ranked class probabilities
for rank, (class_label, probability) in enumerate(zip(sorted_labels, sorted_probabilities), 1):
    print(f'{rank}. Class: {class_label}  Probability: {probability:.4f}')

# The two most probable answers are the head of the ranking computed above,
# so no second argsort is needed
top_indices = sorted_indices[:2]
top_answers = sorted_labels[:2]

# Print the predictions
print('Question:', user_input)
print('Predicted Answers:')
for answer in top_answers:
    print(answer)

Enter a question:  what is your name


1. Class: My full name is Emuejevoke Eshemitan.  Probability: 0.9519
2. Class: I am Nigerian, I was born and raised in Delta State.  Probability: 0.0189
3. Class: I am a Data Scientist / Machine Learning Engineer.  Probability: 0.0048
4. Class: I am single  Probability: 0.0045
5. Class: my phobia is "peniaphobia"  also known as  "poverty phobia".  Probability: 0.0036
6. Class: My Contact Information includes: Phone number: +2349024362357, Email: eshemitanvoke@gmail.com, website: https://github.com/Davidsonity  Probability: 0.0027
7. Class: I am 24 years old.  Probability: 0.0024
8. Class: I am good, thank you  Probability: 0.0022
9. Class: I possess a strong skill set in Python programming, SQL, machine learning, data processing, TensorFlow, scikit-learn, mathematics, effective communication, Git/GitHub, deep learning, recommendation systems and Google Cloud Platform (GCP).  Probability: 0.0014
10. Class: I don't have any pets.  Probability: 0.0012
11. Class: I enjoy all genres of musi