In [67]:
#%run Data_Wrangling_and_EDA.ipynb

In [25]:
#Dividing up the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [26]:
#Writing the script for the embedding layer as part of the pre-processing step. The embedding layer will convert the textual
#data into numeric data and serves as the first layer for Keras' deep learning models.

#Implementing tokenizer class to create a word-to-index dictionary. Key:value --> word:unique index
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [27]:
#Setting max size of each list to 100; longer lists will be truncated and shorter lists will be 'padded' with 0's at the end
#of those lists.
vocab_size = len(tokenizer.word_index) + 1
maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [29]:
#Glove embeddings used to create feature matrix. Loading glove embeddings to create a dictionary where keys:values are
#words:embeddings lists
from numpy import array
from numpy import asarray
from numpy import zeros

embedded_dict = dict()
g_file = open('glove.6B.100d.txt', encoding='utf8')

for line in g_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embedded_dict [word] = vector_dimensions
g_file.close()

In [31]:
#Creating the embedding matrix. Number of rows will match the number of words and each columns (100 in total) will have the
#Glove word embeddings.
embed_mat = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embed_vec = embedded_dict.get(word)
    if embed_vec is not None:
        embed_mat[index] = embed_vec

In [35]:
#Model 1: text classification with SNN (simple neural network)
snn = Sequential()
embedded_layer = Embedding(vocab_size, 100, weights=[embed_mat], input_length=maxlen, trainable=False)
snn.add(embedded_layer)

snn.add(Flatten())
snn.add(Dense(1, activation='sigmoid'))

In [39]:
#Compiling the SNN model and getting a summary of the parameters
snn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(snn.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 100)          1899000   
                                                                 
 flatten_1 (Flatten)         (None, 10000)             0         
                                                                 
 dense_1 (Dense)             (None, 1)                 10001     
                                                                 
Total params: 1,909,001
Trainable params: 10,001
Non-trainable params: 1,899,000
_________________________________________________________________
None


In [49]:
#Importing and instantiating a callback (early stopping) to control for overfitting
from keras.callbacks import EarlyStopping

call_back = EarlyStopping(monitor='val_loss', patience=2, verbose=0, mode='min')

In [50]:
#Training the model on the train set
history = snn.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2, callbacks=call_back)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6


In [51]:
#Evaluating the performance of the model
result = snn.evaluate(X_test, y_test, verbose=1)



In [52]:
#Verifying both the accuracy and loss of the testing set
print('test score:', result[0])
print('test accuracy:', result[1])

test score: 0.8134480118751526
test accuracy: 0.7419566512107849


In [55]:
#Model 2: text classification with CNN (Convolutional Neural Network)
from keras.layers import Conv1D

cnn = Sequential()

embed_layer = Embedding(vocab_size, 100, weights=[embed_mat], input_length=maxlen, trainable=False)
cnn.add(embed_layer)

cnn.add(Conv1D(128, 5, activation='relu'))
cnn.add(GlobalMaxPooling1D())
cnn.add(Dense(1, activation='sigmoid'))

In [56]:
#Compiling the CNN model and getting a summary of the parameters
cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(cnn.summary())

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 100, 100)          1899000   
                                                                 
 conv1d (Conv1D)             (None, 96, 128)           64128     
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,963,257
Trainable params: 64,257
Non-trainable params: 1,899,000
_________________________________________________________________
None


In [57]:
#Training the model on the training set
history_2 = cnn.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [58]:
#Evaluating the performance of the model
result_2 = cnn.evaluate(X_test, y_test, verbose=1)



In [59]:
#Verifying both the accuracy and loss of the testing set
print("test score:", result_2[0])
print("test accuracy", result_2[1])

test score: 0.4513889253139496
test accuracy 0.7977675795555115


In [61]:
#Model 3: text classification with RNN (Recurrent Neural Netowrk)
from keras.layers import LSTM

rnn = Sequential()
embedd_layer = Embedding(vocab_size, 100, weights=[embed_mat], input_length=maxlen, trainable=False)
rnn.add(embedd_layer)
rnn.add(LSTM(128))

rnn.add(Dense(1, activation='sigmoid'))

In [62]:
#Compiling the RNN model and getting a summary of the parameters
rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(rnn.summary())

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 100, 100)          1899000   
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                                 
Total params: 2,016,377
Trainable params: 117,377
Non-trainable params: 1,899,000
_________________________________________________________________
None


In [63]:
#Training the model on the training set
history_3 = rnn.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [64]:
#Evaluating the performance of the model
result_3 = rnn.evaluate(X_test, y_test, verbose=1)



In [66]:
#Verifying both the accuracy and loss of the testing set
print('test score:', result_3[0])
print('test accuracy:', result_3[1])

test score: 0.6825509071350098
test accuracy: 0.5738673806190491
