# Hotel Review Rating Prediction

In [None]:
import glob, os
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

from sklearn import preprocessing
import matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt 
import time
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from wordcloud import WordCloud,STOPWORDS
matplotlib.style.use('ggplot')
%matplotlib inline 

import nltk
import string
from nltk import word_tokenize
from nltk.util import bigrams, trigrams
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from collections import Counter
#nltk.download() # download the english stopwords corpus and the punkt package and maybe the porter stemmer if not present

# Let pandas display up to 36 columns, and print the pandas version in use.
pd.options.display.max_columns = 36
print(pd.__version__)

In [None]:
# Select the reviews whose rating is 1 and draw a word cloud of their text.
# NOTE(review): preproc and wordcloud_draw are defined outside this excerpt;
# 'white' is presumably the background colour - confirm against wordcloud_draw.
review_dfy = review_dfx.loc[review_dfx['rati'] == 1]
print("Rating = 1")
wordcloud_draw(
    preproc(review_dfy['body']),
    'white',
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import *
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, roc_auc_score
from sklearn import cross_validation, metrics

In [None]:
# Keep only the rating and the review text. The explicit copy makes review_dfz
# an independent frame, so later writes to its columns cannot trigger pandas'
# SettingWithCopyWarning.
review_dfz = review_dfx[['rati', 'body']].copy()

# Preview the first five raw reviews. head() selects by position, so this works
# even when the index labels are not 0..4 (and when there are fewer than five rows).
for raw_body in review_dfz['body'].head(5):
    print(raw_body)
    print("++++++++++++++++++++++++++++++++++++++++\n")
print("\n**************************************************************\n")

In [None]:
# NLTK's standard English stopword list, minus a handful of words (no, not,
# more, most, ...) that are kept in the text because they can carry sentiment
# in a review.
customised_stopwords = set(stopwords.words('english')).difference({
    'over', 'under', 'below', 'more', 'most', 'no', 'not', 'only',
    'such', 'few', 'so', 'too', 'very', 'just', 'any', 'once',
})
def remove_stopwords(s):
    """Return *s* without the tokens that appear in ``customised_stopwords``.

    The text is split with ``nltk.word_tokenize``; the surviving tokens are
    joined back together with single spaces.
    """
    kept_tokens = [
        token
        for token in nltk.word_tokenize(s)
        if token not in customised_stopwords
    ]
    return ' '.join(kept_tokens)

def stem_token_list(token_list):
    """Return a new list with the Porter stem of every token in *token_list*."""
    stemmer = PorterStemmer()
    return list(map(stemmer.stem, token_list))

def restring_tokens(token_list):
    """Glue the tokens of *token_list* into one string, separated by single spaces."""
    separator = ' '
    return separator.join(token_list)

# Translation table that deletes every ASCII punctuation character and digit.
# Built once at definition time instead of on every call.
_PUNCTUATION_AND_DIGITS = str.maketrans('', '', string.punctuation + string.digits)


def preprocess(s):
    """Normalise one review text before it is vectorised.

    Steps, in order: delete ASCII punctuation and digits, lower-case the text,
    drop the words in ``customised_stopwords``, Porter-stem every remaining
    token, and join the stems with single spaces.

    Args:
        s: The raw review text (a ``str``).

    Returns:
        The cleaned text as one space-separated string.
    """
    # A single pass replaces one str.replace() scan per punctuation character
    # and digit. Deleting single characters is order-independent, so the
    # result is unchanged.
    s = s.translate(_PUNCTUATION_AND_DIGITS)
    s = s.lower()

    s = remove_stopwords(s)
    token_list = nltk.word_tokenize(s)
    token_list = stem_token_list(token_list)
    return restring_tokens(token_list)

In [None]:
# Replace the raw text with its preprocessed form. assign() builds a new frame
# instead of writing into review_dfz in place, which avoids pandas'
# SettingWithCopyWarning when review_dfz was obtained by selecting columns
# from another frame.
review_dfz = review_dfz.assign(body=review_dfz['body'].apply(preprocess))

# Preview the first five cleaned reviews. head() selects by position, so the
# loop does not depend on the index labels being 0..4.
for cleaned_body in review_dfz['body'].head(5):
    print(cleaned_body)
    print("++++++++++++++++++++++++++++++++++++++++++++++++\n")
print("\n**************************************************************\n")

In [None]:
# Print the (rows, columns) shape of the frame, then display its first five rows.
print(review_dfz.shape)
review_dfz.head(5)

In [None]:
# Hold out 30% of the reviews for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    review_dfz['body'],
    review_dfz['rati'],
    test_size=0.3,
    random_state=123,
)

In [None]:
# Options shared by every vectorizer below: word-level analysis, no custom
# tokenizer or preprocessor, and unicode accent stripping.
_shared_vectorizer_options = dict(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    strip_accents='unicode',
)

# Single-word counts (binary=False keeps occurrence counts).
bag_of_words_vectorizer = CountVectorizer(ngram_range=(1, 1), binary=False,
                                          **_shared_vectorizer_options)

# Counts of two-word sequences.
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), **_shared_vectorizer_options)

# Counts of three-word sequences.
trigram_vectorizer = CountVectorizer(ngram_range=(3, 3), **_shared_vectorizer_options)

# Counts of both two- and three-word sequences.
bi_and_trigram_vectorizer = CountVectorizer(ngram_range=(2, 3), **_shared_vectorizer_options)

# Single-word counts with the vocabulary capped at 1000 features.
random_forest_vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=1000,
                                           **_shared_vectorizer_options)

In [None]:
#print(bag_of_words_vectorizer.vocabulary_), print("++++++++++++++++++++++++++++++++++++++++++++++++\n")
def get_range(d, begin, end):
    """Print the entries of *d* whose value lies in ``[begin, end]``.

    Meant for peeking at a slice of a vectorizer vocabulary, i.e. a mapping of
    term -> feature index: the range is applied to the values (the indices),
    because the keys of such a mapping are strings and cannot be compared
    with integer bounds.

    Args:
        d: Mapping to inspect.
        begin: Smallest value to include.
        end: Largest value to include.

    Returns:
        None. The selected entries are printed, followed by a separator line.
    """
    # dict.items() replaces the Python 2-only dict.iteritems().
    result = {key: value for key, value in d.items() if begin <= value <= end}
    print(result)
    print("++++++++++++++++++++++++++++++++++++++++++++++++\n")
    
# Show the bigram vocabulary entries selected by the bounds 0 and 9.
# NOTE(review): bigram_vectorizer is first fitted (fit_transform on X_train) in
# a later cell, and scikit-learn documents vocabulary_ as an attribute set by
# fitting - run that cell before this one.
get_range(bigram_vectorizer.vocabulary_, 0, 9)

#print(trigram_vectorizer.vocabulary_), print("++++++++++++++++++++++++++++++++++++++++++++++++\n")

#print(bi_and_trigram_vectorizer.vocabulary_), print("++++++++++++++++++++++++++++++++++++++++++++++++\n")

#print(random_forest_vectorizer.vocabulary_), print("++++++++++++++++++++++++++++++++++++++++++++++++\n")

In [None]:
def make_confusion_matrix_relative(confusion_matrix):
    """Convert a confusion matrix of counts into per-row relative frequencies.

    Each row is divided by its own total. With rows holding the actual class,
    the row total is the number of samples of that class, so every cell
    becomes the fraction of that class's samples given the column's label.

    Args:
        confusion_matrix: 2-D array-like of counts, one row per actual class.

    Returns:
        A float ``numpy.ndarray`` with the same shape as the input. Rows whose
        total is zero are returned as all zeros.
    """
    counts = np.asarray(confusion_matrix, dtype=float)
    row_totals = counts.sum(axis=1, keepdims=True)

    # Start from zeros (not np.empty) so rows of classes with no samples hold
    # a defined value instead of uninitialised memory.
    relative_confusion_matrix = np.zeros_like(counts)
    np.divide(counts, row_totals, out=relative_confusion_matrix, where=row_totals > 0)
    return relative_confusion_matrix

# http://www.wenda.io/questions/4330313/heatmap-with-text-in-each-cell-with-matplotlibs-pyplot.html
# http://stackoverflow.com/questions/20520246/create-heatmap-using-pandas-timeseries
# http://sebastianraschka.com/Articles/heatmaps_in_r.html
# http://code.activestate.com/recipes/578175-hierarchical-clustering-heatmap-python/
def plot_confusion_matrix(confusion_matrix=[[]], title='CM', savefilename=''):
    """Plot a confusion matrix as an annotated heat map of relative frequencies.

    Args:
        confusion_matrix: Matrix of raw counts; it is converted with
            ``make_confusion_matrix_relative`` before plotting.
        title: Title shown above the plot.
        savefilename: When non-empty, the figure is also saved to this path.

    Returns:
        The return value of ``plt.show()``.
    """
    rcm = make_confusion_matrix_relative(confusion_matrix)
    c = plt.pcolor(rcm, edgecolors='k', linewidths=4, cmap='jet', vmin=0.0, vmax=1.0)
    plt.title(title)
    plt.colorbar()
    plt.ylabel('Actual Label')
    plt.xlabel('Predicted Label')
    # Ticks sit at the cell centres and are labelled 1..5.
    plt.xticks(0.5 + np.arange(5), np.arange(1,6))
    plt.yticks(0.5 + np.arange(5), np.arange(1,6))

    def show_values(pc, fmt="%.2f", **kw):
        """Write each cell's value, formatted with *fmt*, on top of the cell."""
        pc.update_scalarmappable()
        # Artist.get_axes() is no longer available in current matplotlib; the
        # ``axes`` attribute is the supported way to reach the parent Axes.
        ax = pc.axes
        # ravel() keeps one value per cell whether get_array() is flat or 2-D.
        cell_values = pc.get_array().ravel()
        for p, color, value in zip(pc.get_paths(), pc.get_facecolors(), cell_values):
            # Text anchor: mean of the path vertices, leaving out the last two.
            x, y = p.vertices[:-2, :].mean(0)
            # Black text when the first two colour channels (red, green) both
            # exceed 0.3, white text otherwise.
            if sum(color[:2] > 0.3) >= 2:
                color = (0.0, 0.0, 0.0)
            else:
                color = (1.0, 1.0, 1.0)
            ax.text(x, y, fmt % value, ha="center", va="center", color=color, **kw)

    show_values(c)

    if savefilename:
        plt.savefig(savefilename, bbox_inches='tight')

    return plt.show()


def print_classifier_performance_metrics(name, predictions):
    """Print a performance summary of *predictions* against the global ``Y_test``.

    Prints micro-averaged precision, recall and F1, the accuracy, the per-class
    classification report, and the sample variance of the per-class precision
    and recall scores.

    Args:
        name: Model name shown in the header line.
        predictions: Predicted ratings, aligned with ``Y_test``.

    Returns:
        None. Everything is written to standard output.
    """
    target_names = ['1 star', '2 star', '3 star', '4 star', '5 star']

    print("MODEL: %s" % name)
    print()

    print('Precision: ' + str(metrics.precision_score(Y_test, predictions, average='micro')))
    print('Recall: ' + str(metrics.recall_score(Y_test, predictions, average='micro')))
    print('F1: ' + str(metrics.f1_score(Y_test, predictions,  average='micro')))
    print('Accuracy: ' + str(metrics.accuracy_score(Y_test, predictions)))

    print()
    print('Classification Report:')
    print(classification_report(Y_test, predictions, target_names=target_names))

    # np.var divides by (N - ddof). The previous ddof=len(target_names)-1 made
    # the divisor 1 for five classes, so the printed value was the raw sum of
    # squared deviations. ddof=1 gives the usual unbiased sample variance of
    # the per-class scores.
    print()
    print('Precision variance: %f' % np.var(precision_score(Y_test, predictions, average=None), ddof=1))

    print()
    print('Recall variance: %f' % np.var(recall_score(Y_test, predictions, average=None), ddof=1))

In [None]:
# Fit the bag-of-words vocabulary on the training reviews and encode them,
# then encode the test reviews with that same fitted vectorizer.
bow_feature_matrix_train = bag_of_words_vectorizer.fit_transform(X_train)
bow_feature_matrix_test = bag_of_words_vectorizer.transform(X_test)
# Trailing expression: displays both matrices as the cell output.
bow_feature_matrix_train, bow_feature_matrix_test

In [None]:
# Train multinomial Naive Bayes on the bag-of-words counts, then predict the
# ratings of the test reviews.
multinomial_nb_classifier = MultinomialNB().fit(bow_feature_matrix_train, Y_train)
multinomial_nb_prediction = multinomial_nb_classifier.predict(bow_feature_matrix_test)

In [None]:
# Raw confusion matrix of the actual ratings (Y_test) against the predictions.
multinomial_confusion_matrix = confusion_matrix(Y_test, multinomial_nb_prediction)
# Print the relative (per-row) version, then plot it; the figure is also saved
# under the name given in savefilename.
print (make_confusion_matrix_relative(multinomial_confusion_matrix))
plot_confusion_matrix(multinomial_confusion_matrix, 'Multinomial Naive Bayes Confusion Matrix', savefilename='MultinomialCM.png')

In [None]:
# Print the metric summary for the bag-of-words Naive Bayes predictions.
print_classifier_performance_metrics('Multinomial Naive Bayes', multinomial_nb_prediction)

In [None]:
# Count bigram occurrences: fit the vocabulary on the training reviews only,
# then encode the test reviews with that same vocabulary.
bigram_multinomial_feature_matrix_train = bigram_vectorizer.fit_transform(X_train)
bigram_multinomial_feature_matrix_test = bigram_vectorizer.transform(X_test)

# Make predictions with bigram multinomial Naive Bayes.
bigram_multinomial_nb_classifier = MultinomialNB()
bigram_multinomial_nb_classifier.fit(bigram_multinomial_feature_matrix_train, Y_train)
bigram_multinomial_nb_prediction = bigram_multinomial_nb_classifier.predict(
    bigram_multinomial_feature_matrix_test
)

# Visualise the result as a confusion matrix; the figure is also saved to disk.
bigram_multinomial_confusion_matrix = confusion_matrix(Y_test, bigram_multinomial_nb_prediction)
plot_confusion_matrix(
    bigram_multinomial_confusion_matrix,
    'Bigram Multinomial Naive Bayes Confusion Matrix',
    savefilename='BigramMultinomialCM.png',
)


In [None]:
# Print the metric summary for the bigram Naive Bayes predictions.
print_classifier_performance_metrics('Bigram Multinomial Naive Bayes', bigram_multinomial_nb_prediction)

# Trigram Naive Bayes Model

## Transform Yelp reviews into feature vectors by counting trigram occurrences


In [None]:
# Count trigram occurrences: fit the vocabulary on the training reviews only,
# then encode the test reviews with that same vocabulary.
trigram_multinomial_feature_matrix_train = trigram_vectorizer.fit_transform(X_train)
trigram_multinomial_feature_matrix_test = trigram_vectorizer.transform(X_test)

# Make predictions with trigram multinomial Naive Bayes.
tri_gram_multinomial_nb_classifier = MultinomialNB()
tri_gram_multinomial_nb_classifier.fit(trigram_multinomial_feature_matrix_train, Y_train)
tri_gram_multinomial_nb_prediction = tri_gram_multinomial_nb_classifier.predict(
    trigram_multinomial_feature_matrix_test
)

# Visualise the result as a confusion matrix; the figure is also saved to disk.
trigram_multinomial_confusion_matrix = confusion_matrix(Y_test, tri_gram_multinomial_nb_prediction)
plot_confusion_matrix(
    trigram_multinomial_confusion_matrix,
    'Trigram Multinomial Naive Bayes Confusion Matrix',
    savefilename='TrigramMultinomialCM.png',
)


In [None]:
# Print the metric summary for the trigram Naive Bayes predictions.
print_classifier_performance_metrics('Trigram Multinomial Naive Bayes', tri_gram_multinomial_nb_prediction)

# Random Forest 100 Learners Model

In [None]:
# Random forest with 100 trees; the fixed random_state makes runs repeatable.
forest100 = RandomForestClassifier(n_estimators = 100, random_state=42)

# Turn the reviews into word-count feature vectors: the vocabulary is fitted on
# the training reviews and reused unchanged for the test reviews.
random_forest_feature_matrix_train = random_forest_vectorizer.fit_transform(X_train)
random_forest_feature_matrix_test = random_forest_vectorizer.transform(X_test)

# Fit the forest (timed with the IPython %time magic) and predict the test
# ratings. toarray() converts the sparse count matrices to dense arrays first.
%time forest100.fit(random_forest_feature_matrix_train.toarray(), Y_train)
forest100_pred = forest100.predict(random_forest_feature_matrix_test.toarray())
# Persist the predictions to disk with numpy.
np.save('forest100pred', forest100_pred)

# Visualise the result as a confusion matrix; the figure is also saved to disk.
random_forest_confusion_matrix = confusion_matrix(Y_test, forest100_pred)
plot_confusion_matrix(random_forest_confusion_matrix, 'Random Forest (100 Learners) Confusion Matrix', savefilename='RandomForestCM.png')


In [None]:
# Print the metric summary for the random forest predictions.
print_classifier_performance_metrics('Random Forest (100 Learners)', forest100_pred)