In [1]:
# Importing the libraries
import numpy
import pandas
import nltk
import ssl
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingClassifier
from sklearn import model_selection
from joblib import dump, load
import warnings
warnings.filterwarnings("ignore")

# Importing the dataset

print('Ingesting the data...\n')
train = pandas.read_csv('data/train.txt', delimiter = '\t', header = None, quoting = 3)
train.columns = ['Review', 'Rating']
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('all', quiet = True)

print('Creating the bag of words...\n')

corpus = []
for i in range(0, train.shape[0]):
    review = re.sub('[^a-zA-Z]', ' ', train['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model using CountVectorizer

cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = train.iloc[:, 1].values

clf_01 = KNeighborsClassifier()
clf_02 = RandomForestClassifier()
clf_03 = GaussianNB()
clf_04 = BernoulliNB(alpha=0.8)
clf_05 = MultinomialNB(alpha=0.1)
clf_06 = LogisticRegression(C=1.5)
clf_07 = DecisionTreeClassifier()
clf_08 = SVC(kernel="linear")
lr = LogisticRegression()
sclf = StackingClassifier(classifiers = [clf_01, clf_02, clf_03],
                         meta_classifier = lr,
                         use_probas = True,
                         average_probas = False)

print('Performing 5-fold cross validation modelling...\n')

results = []

for clf, label in zip([clf_01, clf_02, clf_03, clf_04, clf_05, clf_06, clf_07, clf_08, sclf],
                     ['KNN',
                     'Random Forest',
                     'Gaussian Naive Bayes',
                     'Bernoulli Naive Bayes',
                     'Multinomial Naive Bayes',
                     'Logistic Regression',
                     'Decision Tree Classifier',
                     'Support Vector Machine',
                     'Stacked Classifier']):
    scores = model_selection.cross_val_score(clf, X, y, cv = 5, scoring = 'accuracy')
    print('Accuracy: %0.6f (+/- %0.2f) [%s]'
         % (scores.mean(), scores.std(), label))
    results.append([label, scores.mean(), scores.std(), clf])

def maximum_accuracy(sequence):
    if not sequence:
        raise ValueError('empty sequence')

    maximum = sequence[0]

    for item in sequence:
        # Compare elements by their weight stored
        # in their second element.
        if item[1] > maximum[1]:
            maximum = item

    return maximum

best_model = maximum_accuracy(results)
best_model[3].fit(X, y)
print('\nThe optimal model was the ' + best_model[0] + ' with an accuracy of ' + str(round(best_model[1], 2)))
print('\n')
print('Ingesting the test data...\n')
test = pandas.read_csv('data/test.txt', delimiter = '\t', header = None, quoting = 3)
test.columns = ['Review']

print('Applying the bag of words...\n')

corpus = []
for i in range(0, test.shape[0]):
    review = re.sub('[^a-zA-Z]', ' ', train['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model using CountVectorizer

X_test = cv.transform(corpus).toarray()
test['Predictions'] = best_model[3].predict(X_test)
test['Confidence of Positive (1)'] = [i[1] for i in best_model[3].predict_proba(X_test)]
print(test)
test.to_csv('data/test_predictions.txt', index=None, sep='\t')
print('\nResults written out to data/test_predictions.txt')

In [2]:
# Importing the dataset

print('Ingesting the data...\n')
train = pandas.read_csv('data/train.txt', delimiter = '\t', header = None, quoting = 3)
train.columns = ['Review', 'Rating']
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('all', quiet = True)

print('Creating the bag of words...\n')

corpus = []
for i in range(0, train.shape[0]):
    review = re.sub('[^a-zA-Z]', ' ', train['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model using CountVectorizer

cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = train.iloc[:, 1].values

clf_01 = KNeighborsClassifier()
clf_02 = RandomForestClassifier()
clf_03 = GaussianNB()
clf_04 = BernoulliNB(alpha=0.8)
clf_05 = MultinomialNB(alpha=0.1)
clf_06 = LogisticRegression(C=1.5)
clf_07 = DecisionTreeClassifier()
clf_08 = SVC(kernel="linear")
lr = LogisticRegression()
sclf = StackingClassifier(classifiers = [clf_01, clf_02, clf_03],
                         meta_classifier = lr,
                         use_probas = True,
                         average_probas = False)

print('Performing 5-fold cross validation modelling...\n')

results = []

for clf, label in zip([clf_01, clf_02, clf_03, clf_04, clf_05, clf_06, clf_07, clf_08, sclf],
                     ['KNN',
                     'Random Forest',
                     'Gaussian Naive Bayes',
                     'Bernoulli Naive Bayes',
                     'Multinomial Naive Bayes',
                     'Logistic Regression',
                     'Decision Tree Classifier',
                     'Support Vector Machine',
                     'Stacked Classifier']):
    scores = model_selection.cross_val_score(clf, X, y, cv = 5, scoring = 'accuracy')
    print('Accuracy: %0.6f (+/- %0.2f) [%s]'
         % (scores.mean(), scores.std(), label))
    results.append([label, scores.mean(), scores.std(), clf])

def maximum_accuracy(sequence):
    if not sequence:
        raise ValueError('empty sequence')

    maximum = sequence[0]

    for item in sequence:
        # Compare elements by their weight stored
        # in their second element.
        if item[1] > maximum[1]:
            maximum = item

    return maximum

best_model = maximum_accuracy(results)
best_model[3].fit(X, y)
print('\nThe optimal model was the ' + best_model[0] + ' with an accuracy of ' + str(round(best_model[1], 2)))
print('\n')
print('Ingesting the test data...\n')
test = pandas.read_csv('data/test.txt', delimiter = '\t', header = None, quoting = 3)
test.columns = ['Review']

print('Applying the bag of words...\n')

corpus = []
for i in range(0, test.shape[0]):
    review = re.sub('[^a-zA-Z]', ' ', train['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model using CountVectorizer

X_test = cv.transform(corpus).toarray()
test['Predictions'] = best_model[3].predict(X_test)
test['Confidence of Positive (1)'] = [i[1] for i in best_model[3].predict_proba(X_test)]
print(test)
test.to_csv('data/test_predictions.txt', index=None, sep='\t')
print('\nResults written out to data/test_predictions.txt')

Ingesting the data...

Creating the bag of words...

Performing 5-fold cross validation modelling...

Accuracy: 0.671517 (+/- 0.04) [KNN]
Accuracy: 0.745710 (+/- 0.05) [Random Forest]
Accuracy: 0.676704 (+/- 0.02) [Gaussian Naive Bayes]
Accuracy: 0.765448 (+/- 0.02) [Bernoulli Naive Bayes]
Accuracy: 0.757054 (+/- 0.02) [Multinomial Naive Bayes]
Accuracy: 0.771736 (+/- 0.03) [Logistic Regression]
Accuracy: 0.706039 (+/- 0.04) [Decision Tree Classifier]
Accuracy: 0.757201 (+/- 0.04) [Support Vector Machine]
Accuracy: 0.761259 (+/- 0.03) [Stacked Classifier]

The optimal model was the Logistic Regression with an accuracy of 0.77


Ingesting the test data...

Applying the bag of words...

                                               Review  Predictions  \
0   There was a warm feeling with the service and ...            1   
1   An extensive menu provides lots of options for...            1   
2   I always order from the vegetarian menu during...            0   
3   I have watched their p

LogisticRegression(C=1.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [18]:
print('Ingesting the test data...\n')
test = pandas.read_csv('data/test.txt', delimiter = '\t', header = None, quoting = 3)
test.columns = ['Review']

print('Applying the bag of words...\n')

corpus = []
for i in range(0, test.shape[0]):
    review = re.sub('[^a-zA-Z]', ' ', train['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model using CountVectorizer

X_test = cv.transform(corpus).toarray()
test['Predictions'] = best_model[3].predict(X_test)
test['Confidence of Positive (1)'] = [i[1] for i in best_model[3].predict_proba(X_test)]
print(test)

Ingesting the test data...

Applying the bag of words...

                                               Review  Predictions  \
0   There was a warm feeling with the service and ...            0   
1   An extensive menu provides lots of options for...            1   
2   I always order from the vegetarian menu during...            1   
3   I have watched their prices inflate, portions ...            0   
4   Wonderful lil tapas and the ambience made me f...            1   
5   I got to enjoy the seafood salad, with a fabul...            0   
6   The wontons were thin, not thick and chewy, al...            1   
7   Level 5 spicy was perfect, where spice didn't ...            1   
8   We were sat right on time and our server from ...            1   
9   Main thing I didn't enjoy is that the crowd is...            1   
10  When I'm on this side of town, this will defin...            1   
11  I had to wait over 30 minutes to get my drink ...            0   
12                      This is 

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from mlxtend.plotting import plot_decision_regions
import matplotlib.gridspec as gridspec
import itertools

gs = gridspec.GridSpec(3, 3)

fig = plt.figure(figsize = (10, 8))

for clf, lab, grd in zip([clf_01, clf_02, clf_03, clf_04, clf_05, clf_06, clf_07, clf_08, sclf],
                        ['KNN',
                        'Random Forest',
                        'Gaussian Naive Bayes',
                        'Bernoulli Naive Bayes',
                        'Multinomial Naive Bayes',
                        'Logistic Regression',
                        'Decision Tree Classifier',
                        'Support Vector Machine',
                        'Stacked Classifier'],
                        itertools.product([0, 1], repeat = 2)):
    clf.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X = X, y = y, clf = clf)
    plt.title(lab)

In [None]:
# prepare configuration for cross validation test harness
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
seed = 7
# prepare models
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
	kfold = model_selection.KFold(n_splits=10, random_state=seed)
	cv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()