## ML Model Classification: Word 2 Vector

----

## Dataset Preprocessing

### Importing the Dataset

In [6]:
import pickle
[lib, con, neutral] = pickle.load(open('ibcData.pkl', 'rb'))

lib = [sentence.get_words() for sentence in lib]
con = [sentence.get_words() for sentence in con]

reviews = lib + con
positions = [0]*len(lib) + [1]*len(con)

### Library Imports

In [7]:
# Default Imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf

# Word Tokenizing Imports
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

# Model Evaluation Imports
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score


# 'stopwords' Library Import
from re import sub
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Creating the Bag of Words

In [8]:
# Initializing Stopwords list
ignored_words = set(stopwords.words('english'))
ignored_words.remove('not')

# Initializing the tokenizing classes.
stemmer = PorterStemmer()
vectorizer = CountVectorizer()
corpus = []

for review in reviews:
    # Standardizing character range to a-z.
    review = review.lower()
    review = sub('[^a-z]', ' ', review)

    # Removing less valuable words: the 'stopwords'
    review = review.split()
    review = [stemmer.stem(word) for word in review if not word in ignored_words]
    review = ' '.join(review)
    
    corpus.append(review)

# Creating X and y
X = vectorizer.fit_transform(corpus).toarray()
y = positions

### Creating Training & Testing Values

In [9]:
X = np.array(X)
y = np.array(y)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 0)

----

# Models

## Linear Classification

In [10]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(1, activation = 'relu', input_shape = [8260])
])

model.summary()
 
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model.fit(X_train, y_train, epochs = 10)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1)                 8261      
Total params: 8,261
Trainable params: 8,261
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa5b04aa6a0>

In [11]:
predictions = [round(value[0]) for value in model.predict(X_test)]
print(accuracy_score(predictions, y_test))
print(f1_score(predictions, y_test))
print(confusion_matrix(predictions, y_test))

0.5957066189624329
0.47926267281105994
[[229 132]
 [ 94 104]]


### Additional Dense Layer

In [13]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(8260, activation = 'relu', input_shape = [8260]),
    tf.keras.layers.Dense(1, activation = 'sigmoid')
])

model.summary()

model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model.fit(X_train, y_train, epochs = 10)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 8260)              68235860  
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 8261      
Total params: 68,244,121
Trainable params: 68,244,121
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa54a795f98>

In [14]:
predictions = [round(value[0]) for value in model.predict(X_test)]
print(accuracy_score(predictions, y_test))
print(f1_score(predictions, y_test))
print(confusion_matrix(predictions, y_test))

0.6225402504472272
0.5422993492407809
[[223 111]
 [100 125]]


### 2 Additional Dense Layers

In [21]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(8260, activation = 'relu', input_shape = [8260]),
    tf.keras.layers.Dense(8260, activation = 'relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.summary()

model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model.fit(X_train, y_train, epochs = 10)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 8260)              68235860  
_________________________________________________________________
dense_11 (Dense)             (None, 8260)              68235860  
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 8261      
Total params: 136,479,981
Trainable params: 136,479,981
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa54a2027b8>

In [22]:
predictions = [round(value[0]) for value in model.predict(X_test)]
print(accuracy_score(predictions, y_test))
print(f1_score(predictions, y_test))
print(confusion_matrix(predictions, y_test))

0.631484794275492
0.5339366515837104
[[235 118]
 [ 88 118]]


### Halving Nodes in 2nd Layer

In [23]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(8260, activation = 'relu', input_shape = [8260]),
    tf.keras.layers.Dense(4130, activation = 'relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.summary()

model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model.fit(X_train, y_train, epochs = 10)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 8260)              68235860  
_________________________________________________________________
dense_14 (Dense)             (None, 4130)              34117930  
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 4131      
Total params: 102,357,921
Trainable params: 102,357,921
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa54a16b6a0>

In [24]:
predictions = [round(value[0]) for value in model.predict(X_test)]
print(accuracy_score(predictions, y_test))
print(f1_score(predictions, y_test))
print(confusion_matrix(predictions, y_test))

0.6171735241502684
0.5523012552301255
[[213 104]
 [110 132]]


### Multiple Decreasing Dense Layers: Model 1


In [None]:
model = keras.models.Sequential([
    tf.keras.layers.Dense(8260, activation='relu', input_shape=[8260]),
    tf.keras.layers.Dense(4130, activation='relu'),
    tf.keras.layers.Dense(500, activation='relu'),
    tf.keras.layers.Dense(50, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer = RMSprop(lr = 0.001), loss = 'binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa9b5f05780>

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

predictions = [round(value[0]) for value in model.predict(X_test, batch_size=10)]
print(accuracy_score(predictions, y_test))
print(f1_score(predictions, y_test))
print(confusion_matrix(predictions, y_test))

0.6367292225201072
0.5849923430321592
[[284 135]
 [136 191]]


### Multiple Decreasing Dense Layers: Model 2

In [None]:
model = keras.models.Sequential([
    tf.keras.layers.Dense(8260, activation = 'relu', input_shape=[8260]),
    tf.keras.layers.Dense(4130, activation='relu'),
    tf.keras.layers.Dense(1000, activation='relu'),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer = RMSprop(lr = 0.001), loss = 'binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa9b5b45c18>

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

predictions = [round(value[0]) for value in model.predict(X_test, batch_size=10)]
print(accuracy_score(predictions, y_test))
print(f1_score(predictions, y_test))
print(confusion_matrix(predictions, y_test))

0.6407506702412868
0.6047197640117994
[[273 121]
 [147 205]]


### 1000 Hidden Layer Nodes to Output

In [None]:
model = keras.models.Sequential([
    tf.keras.layers.Dense(8260, activation='relu', input_shape=[8260]),
    tf.keras.layers.Dense(8260, activation='relu'),
    tf.keras.layers.Dense(4130, activation='relu'),
    tf.keras.layers.Dense(1000, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer = RMSprop(lr = 0.001), loss = 'binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa9b78b46a0>

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

predictions = [round(value[0]) for value in model.predict(X_test, batch_size=10)]
print(accuracy_score(predictions, y_test))
print(f1_score(predictions, y_test))
print(confusion_matrix(predictions, y_test))

0.6434316353887399
0.6144927536231884
[[268 114]
 [152 212]]
