# Neural Network Classification: Word 2 Vector

----

## Dataset Preprocessing

### Importing the Dataset

In [2]:
import pickle
[lib, con, neutral] = pickle.load(open('ibcData.pkl', 'rb'))

lib = [sentence.get_words() for sentence in lib]
con = [sentence.get_words() for sentence in con]

reviews = lib + con
positions = [0]*len(lib) + [1]*len(con)

### Library Imports

In [3]:
# Default Imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf

# Word Tokenizing Imports
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.stem.porter import PorterStemmer

# Model Evaluation Imports
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

# 'stopwords' Library Import
from re import sub
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Creating the Word2Vec

In [4]:
# Initializing Stopwords list
ignored_words = set(stopwords.words('english'))
ignored_words.remove('not')

# Initializing the tokenizing classes.
stemmer = PorterStemmer()
tokenizer = Tokenizer(oov_token = '<OOV>')
corpus = []

for review in reviews:
    # Standardizing character range to a-z.
    review = review.lower()
    review = sub('[^a-z]', ' ', review)

    # Removing less valuable words: the 'stopwords'
    review = review.split()
    review = [stemmer.stem(word) for word in review if not word in ignored_words]
    review = ' '.join(review)
    
    corpus.append(review)


tokenizer.fit_on_texts(corpus)
X = tokenizer.texts_to_sequences(corpus)
X = pad_sequences(X, padding='post')
y = positions

### Creating Training & Testing Values

In [5]:
X = np.array(X)
y = np.array(y)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 0)

----

## Creating the Model

### Basic Model

In [None]:
model = tf.keras.models.Sequential([
      tf.keras.layers.Embedding((len(tokenizer.word_index) + 1), 16),
      tf.keras.layers.GlobalMaxPooling1D(),
      tf.keras.layers.Dense(16, activation = 'relu'),
      tf.keras.layers.Dense(1, activation = 'sigmoid')
])

model.summary()
  
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs = 10)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          132432    
_________________________________________________________________
global_max_pooling1d (Global (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 132,721
Trainable params: 132,721
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f4af31f7550>

In [None]:
predictions = [round(value[0]) for value in model.predict(X_test)]
print(accuracy_score(predictions, y_test))
print(f1_score(predictions, y_test))
print(confusion_matrix(predictions, y_test))

0.6207513416815742
0.5564853556485356
[[214 103]
 [109 133]]


### Increasing Embedding Dimensions

In [None]:
model = tf.keras.models.Sequential([
      tf.keras.layers.Embedding((len(tokenizer.word_index) + 1), 32),
      tf.keras.layers.GlobalMaxPooling1D(),
      tf.keras.layers.Dense(32, activation = 'relu'),
      tf.keras.layers.Dense(1, activation = 'sigmoid')
])

model.summary()
  
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs = 10)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 32)          264864    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 265,953
Trainable params: 265,953
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f4af34db7b8>

In [None]:
predictions = [round(value[0]) for value in model.predict(X_test)]
print(accuracy_score(predictions, y_test))
print(f1_score(predictions, y_test))
print(confusion_matrix(predictions, y_test))

0.6046511627906976
0.5099778270509978
[[223 121]
 [100 115]]


### Further Increasing Embedding Dimensions

In [None]:
model = tf.keras.models.Sequential([
      tf.keras.layers.Embedding((len(tokenizer.word_index) + 1), 64),
      tf.keras.layers.GlobalMaxPooling1D(),
      tf.keras.layers.Dense(64, activation = 'relu'),
      tf.keras.layers.Dense(1, activation = 'sigmoid')
])

model.summary()
  
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs = 10)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 64)          529728    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 533,953
Trainable params: 533,953
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f4af369dd68>

In [None]:
predictions = [round(value[0]) for value in model.predict(X_test)]
print(accuracy_score(predictions, y_test))
print(f1_score(predictions, y_test))
print(confusion_matrix(predictions, y_test))

0.6225402504472272
0.5667351129363449
[[210  98]
 [113 138]]


### Increasing Dense Layer Count (Using 64 Dimension Embeddings)

In [None]:
model = tf.keras.models.Sequential([
      tf.keras.layers.Embedding((len(tokenizer.word_index) + 1), 64),
      tf.keras.layers.GlobalMaxPooling1D(),
      tf.keras.layers.Dense(64, activation = 'relu'),
      tf.keras.layers.Dense(32, activation = 'relu'),
      tf.keras.layers.Dense(1, activation = 'sigmoid')
])

model.summary()
  
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs = 10)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 64)          529728    
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_7 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 33        
Total params: 536,001
Trainable params: 536,001
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f4af3807dd8>

In [None]:
predictions = [round(value[0]) for value in model.predict(X_test)]
print(accuracy_score(predictions, y_test))
print(f1_score(predictions, y_test))
print(confusion_matrix(predictions, y_test))

0.59391771019678
0.5725047080979284
[[180  84]
 [143 152]]


### Reducing Dense Layer Nodes (Using 64 Dimension Embeddings)

In [None]:
model = tf.keras.models.Sequential([
      tf.keras.layers.Embedding((len(tokenizer.word_index) + 1), 64),
      tf.keras.layers.GlobalMaxPooling1D(),
      tf.keras.layers.Dense(64, activation = 'relu'),
      tf.keras.layers.Dense(2, activation = 'relu'),
      tf.keras.layers.Dense(1, activation = 'sigmoid')
])

model.summary()
  
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs = 10)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 64)          529728    
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 64)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_10 (Dense)             (None, 2)                 130       
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 3         
Total params: 534,021
Trainable params: 534,021
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f4af399bc18>

In [None]:
predictions = [round(value[0]) for value in model.predict(X_test)]
print(accuracy_score(predictions, y_test))
print(f1_score(predictions, y_test))
print(confusion_matrix(predictions, y_test))

0.6135957066189625
0.5423728813559322
[[215 108]
 [108 128]]


### Reducing Dense Layer Nodes (Using 16 Dimension Embeddings)

In [None]:
model = tf.keras.models.Sequential([
      tf.keras.layers.Embedding((len(tokenizer.word_index) + 1), 16),
      tf.keras.layers.GlobalMaxPooling1D(),
      tf.keras.layers.Dense(8, activation = 'relu'),
      tf.keras.layers.Dense(1, activation = 'sigmoid')
])

model.summary()
  
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs = 10)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 16)          132432    
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 16)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 8)                 136       
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 9         
Total params: 132,577
Trainable params: 132,577
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f4af3943e10>

In [None]:
predictions = [round(value[0]) for value in model.predict(X_test)]
print(accuracy_score(predictions, y_test))
print(f1_score(predictions, y_test))
print(confusion_matrix(predictions, y_test))

0.6529516994633273
0.5800865800865802
[[231 102]
 [ 92 134]]
