## San Francisco Restaurant Reviews Machine Learning Stage
## XGBoost Classifier
### Darren Lyles

<p>
    Here, I will be using the XGBoost model on restaurant reviews and their 
    corresponding ratings for restaurants in San Francisco.
</p>

In [1]:
import nltk
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools
import time
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the review-level dataset; this notebook uses the `review_text`,
# `tokenized_review_text` and `review_rating` columns.
# The CSV is read relative to the notebook's working directory.
df_restaurant_reviews = pd.read_csv('restaurant_reviews_tokenized.csv')
df_restaurant_reviews.head()

Unnamed: 0.1,Unnamed: 0,name,cuisine,address,locality,region,hours,email,tel,fax,...,latitude,longitude,price,rating,review_url,review_title,review_text,tokenized_review_text,review_rating,review_date
0,0,21st Amendment Brewery & Restaurant,"['Cafe', 'Pub Food', 'American', 'Burgers', 'P...",563 2nd St,San Francisco,CA,"{'monday': [['11:30', '23:59']], 'tuesday': [[...",new-pub@21st-amendment.com,(415) 369-0900,(415) 369-0909,...,37.782448,-122.392576,2,4.0,https://www.tripadvisor.com/ShowUserReviews-g6...,Great drinks and food,They have great local craft beers and probably...,"['great', 'local', 'craft', 'beer', 'probably'...",4,"Mar 28, 2016 12:00:00 AM"
1,1,21st Amendment Brewery & Restaurant,"['Cafe', 'Pub Food', 'American', 'Burgers', 'P...",563 2nd St,San Francisco,CA,"{'monday': [['11:30', '23:59']], 'tuesday': [[...",new-pub@21st-amendment.com,(415) 369-0900,(415) 369-0909,...,37.782448,-122.392576,2,4.0,https://www.tripadvisor.com/ShowUserReviews-g6...,Good food &amp; beer,We went to the downtown SF location. The resta...,"['went', 'downtown', 'sf', 'location', 'restau...",4,"Mar 27, 2016 12:00:00 AM"
2,2,21st Amendment Brewery & Restaurant,"['Cafe', 'Pub Food', 'American', 'Burgers', 'P...",563 2nd St,San Francisco,CA,"{'monday': [['11:30', '23:59']], 'tuesday': [[...",new-pub@21st-amendment.com,(415) 369-0900,(415) 369-0909,...,37.782448,-122.392576,2,4.0,https://www.tripadvisor.com/ShowUserReviews-g6...,Pretty good beers,I just came to this place for drinks with an o...,"['came', 'place', 'drink', 'old', 'colleague',...",4,"Mar 16, 2016 12:00:00 AM"
3,3,21st Amendment Brewery & Restaurant,"['Cafe', 'Pub Food', 'American', 'Burgers', 'P...",563 2nd St,San Francisco,CA,"{'monday': [['11:30', '23:59']], 'tuesday': [[...",new-pub@21st-amendment.com,(415) 369-0900,(415) 369-0909,...,37.782448,-122.392576,2,4.0,https://www.tripadvisor.com/ShowUserReviews-g6...,Ridiculously overpriced (yes I live in SF),"Mediocre food (not bad, just mediocre, you can...","['mediocre', 'food', 'bad', 'mediocre', 'find'...",3,"Mar 8, 2016 12:00:00 AM"
4,4,21st Amendment Brewery & Restaurant,"['Cafe', 'Pub Food', 'American', 'Burgers', 'P...",563 2nd St,San Francisco,CA,"{'monday': [['11:30', '23:59']], 'tuesday': [[...",new-pub@21st-amendment.com,(415) 369-0900,(415) 369-0909,...,37.782448,-122.392576,2,4.0,https://www.tripadvisor.com/ShowUserReviews-g6...,Team dinner,We headed out for our team dinner to this esta...,"['headed', 'team', 'dinner', 'establishment', ...",4,"Mar 1, 2016 12:00:00 AM"


In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_predict, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

<p> 
    The following block of code will be used to plot the confusion matrices for the XGBoost 
    Classifier.
</p>

In [4]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current axes.

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix (rows are true labels, columns are
        predicted labels).
    classes : sequence
        Tick labels, one per class, in the same order as the rows of `cm`.
    normalize : bool, default False
        When True, every row is divided by its sum so cells show the
        fraction of each true class. Rows that sum to zero (a class with
        no true samples) are shown as zeros instead of NaN.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None
        The function only draws; it does not call `plt.show()`.
    """
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; the pre-filled zeros
        # stay in place for empty rows, avoiding a division by zero.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so the number stays legible.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

## XGBoost using Bag of Words

<p> The following script here is subject to change. <br/>
    The function definition below executes the following steps:
    <ol>
        <li>Vectorizes the restaurants reviews</li>
        <li>Encodes the restaurant ratings as integer class labels</li>
        <li>Executes cross validation on the model while recording its runtime</li>
        <li>Prints the accuracy scores for 5-fold cross validation</li>
        <li>Gets predicted ratings</li>
        <li>Returns a tuple consisting of
            <ul>
                <li>runtime</li>
                <li>5-fold cross validation scores</li>
                <li>actual restaurant review ratings</li>
                <li>predicted restaurant review ratings</li>
            </ul>   
        </li>
    </ol>
    The values returned will be used to analyze the performance
    of the model.
</p>

In [5]:
import xgboost as xgb

def xgb_classifier(X, y, vect_func, n_gram):
    """
    Run 5-fold cross validation of an XGBoost classifier on vectorized reviews.

    Parameters
    ----------
    X : iterable of str
        Review documents passed to the vectorizer's `fit_transform`.
    y : array-like
        Review ratings; they are label-encoded to consecutive integer
        class ids before training.
    vect_func : callable
        Vectorizer class accepting an `ngram_range` keyword
        (e.g. `CountVectorizer` or `TfidfVectorizer`).
    n_gram : tuple
        Value forwarded as `ngram_range`.

    Returns
    -------
    tuple
        `(cv_runtime, scores, review_y, y_pred)`: the wall-clock seconds
        spent in `cross_val_score` (rounded to 2 decimals), the per-fold
        scores, the encoded true labels, and the cross-validated predictions.
    """
    #1. Vectorize the restaurant reviews.
    #   The vectorizer is fitted on the full corpus before the folds are made.
    vect = vect_func(ngram_range=n_gram)
    review_x = vect.fit_transform(X)

    #2. Encode the ratings as integer class ids (0 .. n_classes - 1).
    #   A one-hot encode followed by argmax would only reproduce this vector,
    #   so the labels are used directly.
    le = LabelEncoder()
    review_y = le.fit_transform(y)

    #3. Executing 5-fold cross validation and recording runtime
    classifier = xgb.XGBClassifier(objective='multi:softmax',
                                   num_class=len(le.classes_))

    start = time.time()
    scores = cross_val_score(classifier, review_x, review_y, cv=5)
    end   = time.time()

    cv_runtime = round(end - start, 2)

    #4. Printing CV scores and corresponding runtime
    print("XGB Classifier Accuracy: {}".format(scores))
    print("Runtime: {}".format(cv_runtime))

    #5. Get restaurant ratings predicted by the model.
    #   This refits the model on every fold a second time; that work is
    #   not included in cv_runtime.
    y_pred = cross_val_predict(classifier, review_x, review_y, cv=5)

    #6. Return runtime, cv scores, actual rating, predicted rating
    return (cv_runtime, scores, review_y, y_pred)

## XGBoost Classifier Using CountVectorizer

### 1-Gram

In [6]:
# Bag-of-words run using unigrams only, on the tokenized review column.
time_1gram, scores_1gram, y_1gram, y_pred_1gram = xgb_classifier(
    df_restaurant_reviews['tokenized_review_text'],
    df_restaurant_reviews['review_rating'],
    CountVectorizer,
    (1, 1),
)

XGB Classifier Accuracy: [0.57493188 0.58333333 0.6        0.59987875 0.5755003 ]
Runtime: 187.38


### 2-Gram

In [None]:
# Bag-of-words run using unigrams and bigrams, on the tokenized review column.
time_2gram, scores_2gram, y_2gram, y_pred_2gram = xgb_classifier(
    df_restaurant_reviews['tokenized_review_text'],
    df_restaurant_reviews['review_rating'],
    CountVectorizer,
    (1, 2),
)

XGB Classifier Accuracy: [0.5849228  0.58030303 0.59424242 0.60351622 0.57913887]
Runtime: 3210.29


### 3-Gram

In [None]:
# Bag-of-words run using unigrams through trigrams, on the tokenized review column.
time_3gram, scores_3gram, y_3gram, y_pred_3gram = xgb_classifier(
    df_restaurant_reviews['tokenized_review_text'],
    df_restaurant_reviews['review_rating'],
    CountVectorizer,
    (1, 3),
)

### Confusion Matrices for Model with respect to n-Gram

In [None]:
class_names = ['1', '2', '3', '4', '5']

# plt.figure() is used instead of plt.subplots(): the latter also creates a
# full-size axes, which the panels added by plt.subplot() would be drawn over.
plt.figure(figsize=(15, 8))
plt.subplot(2, 3, 1)
plot_confusion_matrix(confusion_matrix(y_1gram, y_pred_1gram), classes=class_names, title='XGB 1-gram')
plt.subplot(2, 3, 2)
plot_confusion_matrix(confusion_matrix(y_2gram, y_pred_2gram), classes=class_names, title='XGB 2-gram')
plt.subplot(2, 3, 3)
plot_confusion_matrix(confusion_matrix(y_3gram, y_pred_3gram), classes=class_names, title='XGB 3-gram')

In [None]:
# Summary table: CV runtime and mean fold accuracy for each n-gram setting.
runtimes = [time_1gram, time_2gram, time_3gram]
accuracy = [fold_scores.mean() for fold_scores in (scores_1gram, scores_2gram, scores_3gram)]
index    = ['1-gram', '2-gram', '3-gram']

results = pd.DataFrame(
    {'Runtime (s)': runtimes, 'Accuracy': accuracy},
    index=index,
)
results

In [None]:
# plt.figure() is used instead of plt.subplots() so that no extra full-size
# axes is created underneath the two panels added with plt.subplot().
plt.figure(figsize=(15, 5))

n_gram = [1, 2, 3]

# Left panel: cross-validation runtime per n-gram setting.
plt.subplot(1, 2, 1)
_ = plt.plot(n_gram, runtimes, linestyle='-.', linewidth=3)
_ = plt.xticks(n_gram)
_ = plt.title('XGB Runtime for 5-fold Cross Validation', fontsize=18)
_ = plt.xlabel('n-Gram', fontsize=20)
_ = plt.ylabel('Time (s)', fontsize=20)

# Right panel: mean cross-validation accuracy per n-gram setting.
plt.subplot(1, 2, 2)
_ = plt.plot(n_gram, accuracy, linestyle='-.', linewidth=3)
_ = plt.xticks(n_gram)
_ = plt.title('XGB Mean Accuracy for 5-fold Cross Validation', fontsize=18)
_ = plt.xlabel('n-Gram', fontsize=20)
_ = plt.ylabel('Accuracy', fontsize=20)

plt.show()

## XGBoost Classifier Using TF-IDF Vectorization

### 1-Gram

In [None]:
# TF-IDF run using unigrams only.
# NOTE(review): this reads `review_text`, whereas the CountVectorizer runs read
# `tokenized_review_text` - confirm the difference is intentional.
(time_1gram_tfidf,
 scores_1gram_tfidf,
 y_1gram_tfidf,
 y_pred_1gram_tfidf) = xgb_classifier(
    df_restaurant_reviews['review_text'],
    df_restaurant_reviews['review_rating'],
    TfidfVectorizer,
    (1, 1),
)

### 2-Gram

In [None]:
# TF-IDF run using unigrams and bigrams.
# NOTE(review): this reads `review_text`, whereas the CountVectorizer runs read
# `tokenized_review_text` - confirm the difference is intentional.
(time_2gram_tfidf,
 scores_2gram_tfidf,
 y_2gram_tfidf,
 y_pred_2gram_tfidf) = xgb_classifier(
    df_restaurant_reviews['review_text'],
    df_restaurant_reviews['review_rating'],
    TfidfVectorizer,
    (1, 2),
)

### 3-Gram

In [None]:
# TF-IDF run using unigrams through trigrams.
# NOTE(review): this reads `review_text`, whereas the CountVectorizer runs read
# `tokenized_review_text` - confirm the difference is intentional.
(time_3gram_tfidf,
 scores_3gram_tfidf,
 y_3gram_tfidf,
 y_pred_3gram_tfidf) = xgb_classifier(
    df_restaurant_reviews['review_text'],
    df_restaurant_reviews['review_rating'],
    TfidfVectorizer,
    (1, 3),
)

### Confusion Matrices for Model with respect to n-Gram

In [None]:
# plt.figure() is used instead of plt.subplots(): the latter also creates a
# full-size axes, which the panels added by plt.subplot() would be drawn over.
plt.figure(figsize=(15, 8))
plt.subplot(2, 3, 1)
plot_confusion_matrix(confusion_matrix(y_1gram_tfidf, y_pred_1gram_tfidf), classes=class_names, title='XGB 1-gram')
plt.subplot(2, 3, 2)
plot_confusion_matrix(confusion_matrix(y_2gram_tfidf, y_pred_2gram_tfidf), classes=class_names, title='XGB 2-gram')
plt.subplot(2, 3, 3)
plot_confusion_matrix(confusion_matrix(y_3gram_tfidf, y_pred_3gram_tfidf), classes=class_names, title='XGB 3-gram')

In [None]:
# Summary table for the TF-IDF runs. Only the 1-, 2- and 3-gram runs are
# executed in this notebook, so only those three results are collected.
runtimes_tfidf = [time_1gram_tfidf, time_2gram_tfidf, time_3gram_tfidf]
accuracy_tfidf = [scores_1gram_tfidf.mean(), scores_2gram_tfidf.mean(), scores_3gram_tfidf.mean()]
index_tfidf    = ['1-gram', '2-gram', '3-gram']

results_tfidf  = pd.DataFrame({'Runtime (s)': runtimes_tfidf,
                        'Accuracy': accuracy_tfidf},
                         index=index_tfidf)
results_tfidf

In [None]:
# plt.figure() is used instead of plt.subplots() so that no extra full-size
# axes is created underneath the two panels added with plt.subplot().
plt.figure(figsize=(15, 5))

# x positions (1, 2, ...) are derived from the number of collected results so
# the x and y series always have matching lengths.
n_gram_tfidf = list(range(1, len(runtimes_tfidf) + 1))

# Left panel: cross-validation runtime per n-gram setting.
plt.subplot(1, 2, 1)
_ = plt.plot(n_gram_tfidf, runtimes_tfidf, linestyle='-.', linewidth=3)
_ = plt.xticks(n_gram_tfidf)
_ = plt.title('XGB Runtime for 5-fold Cross Validation', fontsize=18)
_ = plt.xlabel('n-Gram', fontsize=20)
_ = plt.ylabel('Time (s)', fontsize=20)

# Right panel: mean cross-validation accuracy per n-gram setting.
plt.subplot(1, 2, 2)
_ = plt.plot(n_gram_tfidf, accuracy_tfidf, linestyle='-.', linewidth=3)
_ = plt.xticks(n_gram_tfidf)
_ = plt.title('XGB Mean Accuracy for 5-fold Cross Validation', fontsize=18)
_ = plt.xlabel('n-Gram', fontsize=20)
_ = plt.ylabel('Accuracy', fontsize=20)

plt.show()

<p> The best performing XGBoost Classifier model has an accuracy of _____
    This model uses the ___Vectorizer with bigram parameter input.
    We will use this model and apply it to our dataset.
</p>

In [None]:
def model_input_preprocessing(X, y, vect_func, n_gram):
    """
    Vectorize review text and encode ratings as integer class ids.

    Parameters
    ----------
    X : iterable of str
        Review documents passed to the vectorizer's `fit_transform`.
    y : array-like
        Review ratings to label-encode.
    vect_func : callable
        Vectorizer class accepting an `ngram_range` keyword
        (e.g. `CountVectorizer` or `TfidfVectorizer`).
    n_gram : tuple
        Value forwarded as `ngram_range`.

    Returns
    -------
    tuple
        `(review_x, review_y, vect, le)`: the vectorized reviews, the
        encoded ratings, and the fitted vectorizer and label encoder
        (kept so callers can apply `inverse_transform` later).
    """
    #1. Vectorize the restaurant reviews
    vect = vect_func(ngram_range=n_gram)
    review_x = vect.fit_transform(X)

    #2. Encode the ratings as integer class ids (0 .. n_classes - 1).
    #   A one-hot encode followed by argmax would only reproduce this vector,
    #   so the labels are used directly.
    le = LabelEncoder()
    review_y = le.fit_transform(y)

    return review_x, review_y, vect, le

In [None]:
# Build features with CountVectorizer over unigrams and bigrams, then
# evaluate on a single 80/20 hold-out split.
X, y, x_enc, y_enc = model_input_preprocessing(df_restaurant_reviews['tokenized_review_text'], 
                                 df_restaurant_reviews['review_rating'], 
                                 CountVectorizer, (1,2))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# This notebook evaluates XGBoost, so the final model is an XGBClassifier
# configured like the cross-validated runs. y_enc is the fitted label
# encoder, so its classes_ gives the number of rating classes.
model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(y_enc.classes_))
start = time.time()
model.fit(X_train, y_train)
end   = time.time()
y_pred = model.predict(X_test)

print('The accuracy score of this model is {}'.format(accuracy_score(y_test, y_pred)))
print('Runtime {}s'.format(end-start))

In [None]:
# plt.figure() avoids creating an extra axes that plt.subplots() would add.
plt.figure(figsize=(10, 5))
plot_confusion_matrix(confusion_matrix(y_test, y_pred), classes=class_names,
                      title='XGB 2-gram (CountVectorizer), hold-out test set')

## Word Cloud Visualization of Model

<p>Before creating the word cloud, we first reformat the hold-out test <br/>
   reviews so that all words from reviews the model predicted as 5-star and <br/>
   1-star are consolidated into two lists.</p>

### Data Processing Stage

In [None]:
# Recover the vocabulary terms present in each test-set row and turn every
# per-row array into a plain Python list.
X_unvectorized = [terms.tolist() for terms in x_enc.inverse_transform(X_test)]

# Map the predicted class ids back to the original rating values.
y_unvectorized = y_enc.inverse_transform(y_pred).tolist()

# Pair each review's terms with its predicted rating.
Xy_unvectorized = list(zip(X_unvectorized, y_unvectorized))

In [None]:
# Keep only the (terms, rating) pairs whose rating is 5 or 1 respectively.
filter_5star = [pair for pair in Xy_unvectorized if pair[1] == 5]
filter_1star = [pair for pair in Xy_unvectorized if pair[1] == 1]

In [None]:
# Store 5-star and 1-star reviews without the rating.
# A comprehension is used instead of unpacking zip(*pairs) so that an empty
# selection yields an empty tuple rather than raising ValueError.
five_star_reviews = tuple(terms for terms, _rating in filter_5star)
one_star_reviews = tuple(terms for terms, _rating in filter_1star)

In [None]:
# Flatten the per-review term lists into one flat list of terms per rating.
five_star_flattened = list(itertools.chain.from_iterable(five_star_reviews))
one_star_flattened = list(itertools.chain.from_iterable(one_star_reviews))

In [None]:
# Build {word: frequency} dictionaries, ordered from most to least common.
import collections

five_star_counts = collections.Counter(five_star_flattened)
one_star_counts = collections.Counter(one_star_flattened)

five_star_dict = dict(five_star_counts.most_common())
one_star_dict = dict(one_star_counts.most_common())

### Word Cloud Visualization Stage

In [None]:
from wordcloud import WordCloud
from PIL import Image

def transform_format(val):
    """Return 255 when `val` equals 26, otherwise 0.

    Used to turn a mask image's pixel values into a two-level mask.
    """
    # 26 is the pixel value singled out by the original author
    # ("color yellow and blue contrast").
    return 255 if val == 26 else 0

In [None]:
# Read the mask image inside a context manager so the file handle is closed
# as soon as the pixel data has been copied into an array.
with Image.open('waiter.png') as mask_image:
    wc_mask = np.array(mask_image)

# Convert every pixel with transform_format, one image row at a time.
transformed_mask = np.ndarray((wc_mask.shape[0], wc_mask.shape[1]), np.int32)

for i in range(len(wc_mask)):
    transformed_mask[i] = list(map(transform_format, wc_mask[i]))

In [None]:
# Configure the cloud first, then fill it from the 5-star word frequencies.
cloud_5star = WordCloud(background_color='white', max_words=1000, mask=transformed_mask)
wc_5star = cloud_5star.generate_from_frequencies(five_star_dict)

# Render the cloud, save it to disk, then display it.
plt.figure(figsize=(20, 13))
plt.imshow(wc_5star, interpolation='bilinear')
plt.axis('off')
plt.title('Words commonly found in 5-Star Restaurant Reviews', fontsize=30)
plt.savefig('xgboost_5star.png', format='png')
plt.show()

In [None]:
# Configure the cloud first, then fill it from the 1-star word frequencies.
cloud_1star = WordCloud(background_color='white', max_words=1000, mask=transformed_mask)
wc_1star = cloud_1star.generate_from_frequencies(one_star_dict)

# Render the cloud, save it to disk, then display it.
plt.figure(figsize=(20, 13))
plt.imshow(wc_1star, interpolation='bilinear')
plt.axis('off')
plt.title('Words commonly found in 1-Star Restaurant Reviews', fontsize=30)
plt.savefig('xgboost_1star.png', format='png')
plt.show()