# Model Classification: Bag of Words

----

## Dataset Preprocessing

### Importing the Dataset

In [1]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# 'ibcData.pkl' must only be loaded when it comes from a trusted source.
# The context manager closes the file handle as soon as loading finishes
# instead of leaving it open until garbage collection.
with open('ibcData.pkl', 'rb') as dataset_file:
    [lib, con, neutral] = pickle.load(dataset_file)

# Replace every loaded entry with the result of its get_words() call
# (presumably the sentence text; later cells call str methods on each item).
lib = [sentence.get_words() for sentence in lib]
con = [sentence.get_words() for sentence in con]

# Texts and their aligned labels: 0 for every `lib` entry, 1 for every `con`
# entry. `neutral` is loaded but not used by the cells visible here.
reviews = lib + con
positions = [0]*len(lib) + [1]*len(con)

### Library Imports

In [2]:
# Default Imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Word Tokenizing Imports
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

# Model Evaluation Imports
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score


# Regex substitution helper and the NLTK 'stopwords' corpus
from re import sub
import nltk
# Fetches the 'stopwords' package into the local nltk_data directory; when the
# package is already present it only logs that it is up-to-date.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Creating the Bag of Words

In [3]:
# English stopwords to filter out; 'not' is taken out of the set so that it
# survives the filtering step below.
ignored_words = set(stopwords.words('english'))
ignored_words.remove('not')

# Tokenizing helpers.
stemmer = PorterStemmer()
vectorizer = CountVectorizer()


def _clean_review(text):
    """Lower-case `text`, keep only a-z words, drop stopwords and stem the rest."""
    # Every character outside a-z becomes a space, so it acts as a separator.
    letters_only = sub('[^a-z]', ' ', text.lower())
    kept_stems = [
        stemmer.stem(token)
        for token in letters_only.split()
        if token not in ignored_words
    ]
    return ' '.join(kept_stems)


# One cleaned, space-joined string per review, in the original order.
corpus = [_clean_review(review) for review in reviews]

# Feature matrix (dense token counts) and the matching labels.
X = vectorizer.fit_transform(corpus).toarray()
y = positions

### Creating Training & Testing Values

In [4]:
from sklearn.model_selection import train_test_split

# Features and labels as NumPy arrays before splitting.
X = np.array(X)
y = np.array(y)

# Hold out 15% of the samples for testing; the fixed seed keeps the split
# identical between runs.
split_parts = train_test_split(X, y, test_size=0.15, random_state=0)
X_train, X_test, y_train, y_test = split_parts

----

## Running ML Models (Reference)

### Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes on the training split, then predict the test split.
classifier = GaussianNB().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Report the confusion matrix, accuracy and F1 score, in that order.
for score_fn in (confusion_matrix, accuracy_score, f1_score):
    print(score_fn(y_test, y_pred))

[[130 193]
 [ 79 157]]
0.5134168157423972
0.5358361774744027


### K-Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5 nearest neighbours with the Minkowski metric at p=2.
knn_params = {'n_neighbors': 5, 'metric': 'minkowski', 'p': 2}
classifier = KNeighborsClassifier(**knn_params).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Report the confusion matrix, accuracy and F1 score, in that order.
for score_fn in (confusion_matrix, accuracy_score, f1_score):
    print(score_fn(y_test, y_pred))

[[274  49]
 [185  51]]
0.5813953488372093
0.3035714285714286


### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 10 entropy-split trees; the fixed seed makes the forest reproducible.
forest_params = {'n_estimators': 10, 'criterion': 'entropy', 'random_state': 0}
classifier = RandomForestClassifier(**forest_params).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Report the confusion matrix, accuracy and F1 score, in that order.
for score_fn in (confusion_matrix, accuracy_score, f1_score):
    print(score_fn(y_test, y_pred))

[[235  88]
 [128 108]]
0.6135957066189625
0.4999999999999999


### Support Vector Machine

In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier with a fixed seed.
classifier = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Report the confusion matrix, accuracy and F1 score, in that order.
for score_fn in (confusion_matrix, accuracy_score, f1_score):
    print(score_fn(y_test, y_pred))

[[247  76]
 [103 133]]
0.6797853309481217
0.597752808988764


### Extreme Gradient Boosting

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted ensemble: 300 boosting rounds with a fixed seed.
xgb_params = {'n_estimators': 300, 'random_state': 0}
classifier = XGBClassifier(**xgb_params)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

# Report the confusion matrix, accuracy and F1 score, in that order.
for score_fn in (confusion_matrix, accuracy_score, f1_score):
    print(score_fn(y_test, y_pred))

[[241  82]
 [117 119]]
0.6440071556350626
0.5446224256292905


**Model With Best Accuracy:** SVM (RBF kernel) at 68% accuracy, with an F1 score of about 0.60