# Model Classification: Word 2 Vector

----

## Dataset Preprocessing

### Importing the Dataset

In [1]:
import pickle

# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load 'ibcData.pkl' when it comes from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('ibcData.pkl', 'rb') as dataset_file:
    [lib, con, neutral] = pickle.load(dataset_file)

# Replace every loaded sentence object with the result of its get_words() call.
lib = [sentence.get_words() for sentence in lib]
con = [sentence.get_words() for sentence in con]

# Combined samples with a parallel label list: 0 for entries coming from
# `lib`, 1 for entries coming from `con`. `neutral` is loaded but not used here.
reviews = lib + con
positions = [0]*len(lib) + [1]*len(con)

### Library Imports

In [2]:
# Default Imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf

# Word Tokenizing Imports
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.stem.porter import PorterStemmer

# Model Evaluation Imports
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

# 'stopwords' Library Import
from re import sub
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Creating the Word2Vec

In [3]:
# English stopwords to drop, except 'not', which is kept in the text.
ignored_words = set(stopwords.words('english'))
# discard() instead of remove(): no KeyError if 'not' is missing from the list.
ignored_words.discard('not')

# Stemmer used on every kept word, and the tokenizer that maps words to
# integer ids ('<OOV>' is the placeholder token for words unseen at fit time).
stemmer = PorterStemmer()
tokenizer = Tokenizer(oov_token = '<OOV>')


def clean_review(text):
    """Normalize one review into a space-joined string of stemmed words.

    Steps: lowercase the text, replace every character outside a-z with a
    space, split on whitespace, drop words found in `ignored_words`, and
    stem the remaining words with `stemmer`.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned review, words separated by single spaces.
    """
    letters_only = sub('[^a-z]', ' ', text.lower())
    kept_stems = [
        stemmer.stem(word)
        for word in letters_only.split()
        if word not in ignored_words
    ]
    return ' '.join(kept_stems)


corpus = [clean_review(review) for review in reviews]

# NOTE(review): the vocabulary is fitted on the whole corpus, i.e. before the
# train/test split performed later, so test-set words influence the word index.
tokenizer.fit_on_texts(corpus)

# Integer-encode every review, then pad at the end ('post') so that all
# sequences share the same length.
X = pad_sequences(tokenizer.texts_to_sequences(corpus), padding='post')
y = positions

### Creating Training & Testing Values

In [4]:
from sklearn.model_selection import train_test_split

# Convert the features and labels to NumPy arrays before splitting.
X, y = np.array(X), np.array(y)

# Hold out 15% of the samples for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=0,
)

----

## Running ML Models (Reference)

### Gaussian Naive Bayes

In [5]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes (default settings) on the training split and
# predict labels for the held-out test split.
classifier = GaussianNB().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Report, one per line: confusion matrix, accuracy, F1 score.
print(
    confusion_matrix(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    sep='\n',
)

[[  1 322]
 [  3 233]]
0.4186046511627907
0.5891276864728192


### K-Neighbors

In [6]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier; the Minkowski metric with p = 2 is the
# Euclidean distance.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Report, one per line: confusion matrix, accuracy, F1 score.
print(
    confusion_matrix(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    sep='\n',
)

[[216 107]
 [123 113]]
0.5885509838998211
0.4956140350877193


### Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 10 trees using the entropy split criterion; the fixed
# random_state keeps the fitted model reproducible.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Report, one per line: confusion matrix, accuracy, F1 score.
print(
    confusion_matrix(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    sep='\n',
)

[[279  44]
 [ 97 139]]
0.7477638640429338
0.6634844868735084


### Support Vector Machine

In [8]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel, trained on the training split.
classifier = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Report, one per line: confusion matrix, accuracy, F1 score.
print(
    confusion_matrix(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    sep='\n',
)

[[267  56]
 [142  94]]
0.6457960644007156
0.48704663212435234


### Extreme Gradient Boosting

In [9]:
from xgboost import XGBClassifier

# Gradient-boosted ensemble with 300 estimators and a fixed random_state.
classifier = XGBClassifier(
    n_estimators=300,
    random_state=0,
)
classifier.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = classifier.predict(X_test)

# Report, one per line: confusion matrix, accuracy, F1 score.
print(
    confusion_matrix(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    sep='\n',
)

[[305  18]
 [ 80 156]]
0.8246869409660107
0.7609756097560976


**Model With Best Accuracy:** XGBoost, at roughly 82% accuracy (F1 ≈ 0.76) on the held-out test set.