## San Francisco Restaurant Reviews Machine Learning Stage
## XGBoost Classifier
### Darren Lyles

<p>
    Here, I will be using the XGBoost model on restaurant reviews and their 
    corresponding ratings for restaurants in San Francisco.
</p>

In [1]:
import nltk
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools
import time
import warnings
warnings.filterwarnings("ignore")

In [2]:
df_restaurant_reviews = pd.read_csv('restaurant_reviews_tokenized.csv')
df_restaurant_reviews.head()

Unnamed: 0.1,Unnamed: 0,name,cuisine,address,locality,region,hours,email,tel,fax,...,latitude,longitude,price,rating,review_url,review_title,review_text,tokenized_review_text,review_rating,review_date
0,0,21st Amendment Brewery & Restaurant,"['Cafe', 'Pub Food', 'American', 'Burgers', 'P...",563 2nd St,San Francisco,CA,"{'monday': [['11:30', '23:59']], 'tuesday': [[...",new-pub@21st-amendment.com,(415) 369-0900,(415) 369-0909,...,37.782448,-122.392576,2,4.0,https://www.tripadvisor.com/ShowUserReviews-g6...,Great drinks and food,They have great local craft beers and probably...,"['great', 'local', 'craft', 'beer', 'probably'...",4,"Mar 28, 2016 12:00:00 AM"
1,1,21st Amendment Brewery & Restaurant,"['Cafe', 'Pub Food', 'American', 'Burgers', 'P...",563 2nd St,San Francisco,CA,"{'monday': [['11:30', '23:59']], 'tuesday': [[...",new-pub@21st-amendment.com,(415) 369-0900,(415) 369-0909,...,37.782448,-122.392576,2,4.0,https://www.tripadvisor.com/ShowUserReviews-g6...,Good food &amp; beer,We went to the downtown SF location. The resta...,"['went', 'downtown', 'sf', 'location', 'restau...",4,"Mar 27, 2016 12:00:00 AM"
2,2,21st Amendment Brewery & Restaurant,"['Cafe', 'Pub Food', 'American', 'Burgers', 'P...",563 2nd St,San Francisco,CA,"{'monday': [['11:30', '23:59']], 'tuesday': [[...",new-pub@21st-amendment.com,(415) 369-0900,(415) 369-0909,...,37.782448,-122.392576,2,4.0,https://www.tripadvisor.com/ShowUserReviews-g6...,Pretty good beers,I just came to this place for drinks with an o...,"['came', 'place', 'drink', 'old', 'colleague',...",4,"Mar 16, 2016 12:00:00 AM"
3,3,21st Amendment Brewery & Restaurant,"['Cafe', 'Pub Food', 'American', 'Burgers', 'P...",563 2nd St,San Francisco,CA,"{'monday': [['11:30', '23:59']], 'tuesday': [[...",new-pub@21st-amendment.com,(415) 369-0900,(415) 369-0909,...,37.782448,-122.392576,2,4.0,https://www.tripadvisor.com/ShowUserReviews-g6...,Ridiculously overpriced (yes I live in SF),"Mediocre food (not bad, just mediocre, you can...","['mediocre', 'food', 'bad', 'mediocre', 'find'...",3,"Mar 8, 2016 12:00:00 AM"
4,4,21st Amendment Brewery & Restaurant,"['Cafe', 'Pub Food', 'American', 'Burgers', 'P...",563 2nd St,San Francisco,CA,"{'monday': [['11:30', '23:59']], 'tuesday': [[...",new-pub@21st-amendment.com,(415) 369-0900,(415) 369-0909,...,37.782448,-122.392576,2,4.0,https://www.tripadvisor.com/ShowUserReviews-g6...,Team dinner,We headed out for our team dinner to this esta...,"['headed', 'team', 'dinner', 'establishment', ...",4,"Mar 1, 2016 12:00:00 AM"


In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix

<p> 
    The following block of code will be used to plot the confusion matrices for the Random Forest 
    Classifier.
</p>

In [5]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        #print("Normalized confusion matrix")
    #else:
        #print('Confusion matrix, without normalization')

    #print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

## XGBoost using Bag of Words

<p> The following script here is subject to change. <br/>
    The function definition below executes the following steps:
    <ol>
        <li>Vectorizes the restaurants reviews</li>
        <li>One hot encodes the restaurant ratings</li>
        <li>Executes cross validation on the model while recording its runtime</li>
        <li>Prints the accuracy scores for 5-fold cross validation</li>
        <li>Gets predicted ratings</li>
        <li>Returns a tuple consisting of
            <ul>
                <li>runtime</li>
                <li>5-fold cross validation scores</li>
                <li>actual restaurant review ratings</li>
                <li>predicted restaurant review ratings</li>
            </ul>   
        </li>
    </ol>
    The values returned will be used to analyze the performance
    of the model.
</p>

In [117]:
import xgboost as xgb

def xgb_classifier(X, y, vect_func, n_gram):
    
    #1. Vectorize the restaurant reviews
    vect = vect_func(ngram_range=n_gram)
    review_x = vect.fit_transform(X)
    
    #2. Label encodes all possible restaurant ratings
    le = LabelEncoder()
    label = le.fit_transform(y)
    
    #3. One hot encodes restaurant ratings
    ohe = OneHotEncoder()
    review_y = ohe.fit_transform(label.reshape(-1,1)).toarray()
    review_y = np.argmax(review_y, axis=1)
    
    #4. Executing 5-fold cross validation and recording runtime
    classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=5)
    
    start = time.time()
    scores = cross_val_score(classifier, review_x, review_y, cv=5)
    end   = time.time()
    
    cv_runtime = round(end - start, 2)
    
    #5. Printing CV scores and corresponding runtime
    print("XGB Classifier Accuracy: {}".format(scores))
    print("Runtime: {}".format(cv_runtime))
    
    #6. Get restaurant ratings predicted by the model
    y_pred = cross_val_predict(classifier, review_x, review_y, cv=5)
    
    #7. Return runtime, cv scores, actual rating, predicted rating
    return (cv_runtime, scores, review_y, y_pred)

## XGBoost Classifier Execution

### 1-Gram

In [118]:
time_1gram, scores_1gram, y_1gram, y_pred_1gram = xgb_classifier(df_restaurant_reviews['tokenized_review_text'], 
                                                                 df_restaurant_reviews['review_rating'], 
                                                                 CountVectorizer, (1,1))

RF Classifier Accuracy: [0.57493188 0.58333333 0.6        0.59987875 0.5755003 ]
Runtime: 190.83


### 2-Gram

In [119]:
time_2gram, scores_2gram, y_2gram, y_pred_2gram = xgb_classifier(df_restaurant_reviews['tokenized_review_text'], 
                                                                 df_restaurant_reviews['review_rating'], 
                                                                 CountVectorizer, (1,2))

RF Classifier Accuracy: [0.5849228  0.58030303 0.59424242 0.60351622 0.57913887]
Runtime: 2721.05


### 3-Gram

In [120]:
time_3gram, scores_3gram, y_3gram, y_pred_3gram = xgb_classifier(df_restaurant_reviews['tokenized_review_text'], 
                                                                 df_restaurant_reviews['review_rating'], 
                                                                 CountVectorizer, (1,3))

RF Classifier Accuracy: [0.58552831 0.58121212 0.59333333 0.59836314 0.57277138]
Runtime: 5240.84


### 4-Gram

In [121]:
time_4gram, scores_4gram, y_4gram, y_pred_4gram = xgb_classifier(df_restaurant_reviews['tokenized_review_text'], 
                                                                 df_restaurant_reviews['review_rating'], 
                                                                 CountVectorizer, (1,4))

RF Classifier Accuracy: [0.58552831 0.58121212 0.59333333 0.59836314 0.57277138]
Runtime: 57307.97


### 5-Gram

In [None]:
time_5gram, scores_5gram, y_5gram, y_pred_5gram = xgb_classifier(df_restaurant_reviews['tokenized_review_text'], 
                                                                 df_restaurant_reviews['review_rating'], 
                                                                 CountVectorizer, (1,5))

### Confusion Matrices for Model with respect to n-Gram

In [None]:
class_names = ['1', '2', '3', '4', '5']

plt.subplots(figsize=(15, 8))
plt.subplot(2, 3, 1)
plot_confusion_matrix(confusion_matrix(y_1gram, y_pred_1gram), classes=class_names, title='XGB 1-gram')
plt.subplot(2, 3, 2)
plot_confusion_matrix(confusion_matrix(y_2gram, y_pred_2gram), classes=class_names, title='XGB 2-gram')
plt.subplot(2, 3, 3)
plot_confusion_matrix(confusion_matrix(y_3gram, y_pred_3gram), classes=class_names, title='XGB 3-gram')
plt.subplot(2, 3, 4)
plot_confusion_matrix(confusion_matrix(y_4gram, y_pred_4gram), classes=class_names, title='XGB 4-gram')
plt.subplot(2, 3, 5)
plot_confusion_matrix(confusion_matrix(y_5gram, y_pred_5gram), classes=class_names, title='XGB 5-gram')

In [None]:
runtimes = [time_1gram, time_2gram, time_3gram, time_4gram, time_5gram]
accuracy = [scores_1gram.mean(), scores_2gram.mean(), scores_3gram.mean(), \
            scores_4gram.mean(), scores_5gram.mean()]
index    = ['1-gram', '2-gram', '3-gram', '4-gram', '5-gram']

results  = pd.DataFrame({'Runtime (s)': runtimes,
                        'Accuracy': accuracy},
                         index=index)
results

In [None]:
plt.subplots(figsize=(15, 5))

n_gram = [1, 2, 3, 4, 5]
plt.subplot(1, 2, 1)
_ = plt.plot(n_gram, runtimes, linestyle='-.', linewidth=3)
_ = plt.xticks(n_gram)
_ = plt.title('RF Runtime for 5-fold Cross Validation', fontsize=18)
_ = plt.xlabel('n-Gram', fontsize=20)
_ = plt.ylabel('Time (s)', fontsize=20)

plt.subplot(1, 2, 2)
_ = plt.plot(n_gram, accuracy, linestyle='-.', linewidth=3)
_ = plt.xticks(n_gram)
_ = plt.title('RF Mean Accuracy for 5-fold Cross Validation', fontsize=18)
_ = plt.xlabel('n-Gram', fontsize=20)
_ = plt.ylabel('Accuracy', fontsize=20)

plt.show()

In [112]:
cv = CountVectorizer(ngram_range=(1, 1))
review_x = cv.fit_transform(df_restaurant_reviews['tokenized_review_text'])

le = LabelEncoder()
label = le.fit_transform(df_restaurant_reviews['review_rating'])

ohe = OneHotEncoder()
review_y = ohe.fit_transform(label.reshape(-1, 1)).toarray()
review_y = np.argmax(review_y, axis=1)

In [113]:
review_y.shape

(16500,)

In [114]:
np.argmax(review_y, axis=1).shape

AxisError: axis 1 is out of bounds for array of dimension 1

In [57]:
print(review_x[0])

  (0, 5967)	1
  (0, 7419)	1
  (0, 13570)	1
  (0, 12754)	1
  (0, 12181)	1
  (0, 13190)	1
  (0, 20010)	1
  (0, 14033)	1
  (0, 4209)	1
  (0, 3083)	1
  (0, 19740)	1
  (0, 8135)	1
  (0, 13556)	1
  (0, 18638)	1
  (0, 1034)	1
  (0, 15192)	1
  (0, 13791)	1
  (0, 2233)	1
  (0, 12540)	1
  (0, 14096)	1
  (0, 2099)	1
  (0, 4715)	1
  (0, 10682)	1
  (0, 8286)	2


In [58]:
cv.inverse_transform(review_x[0])

[array(['drink', 'food', 'place', 'overall', 'night', 'paying', 'work',
        'price', 'comment', 'ca', 'well', 'good', 'pizza', 'tried', 'also',
        'rib', 'pork', 'best', 'one', 'probably', 'beer', 'craft', 'local',
        'great'], dtype='<U25')]

In [59]:
review_x.shape

(16500, 20294)

In [61]:
import xgboost as xgb

In [116]:
X_train, X_test, y_train, y_test = train_test_split(review_x, review_y, test_size=0.2, random_state=42)

xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=5)
#X_train = xgb.DMatrix(X_train)
#X_test  = xgb.DMatrix(X_test)
#y_train = xgb.DMatrix(y_train)
#y_test  = xgb.DMatrix(y_test)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))
#print(X_train.shape)
#print(y_train.shape)
#print(review_x.shape)
#print(review_y.shape)

0.6212121212121212


In [88]:
type(X_train), X_train.shape

(scipy.sparse.csr.csr_matrix, (13200, 20294))

In [89]:
type(y_train), y_train.shape

(numpy.ndarray, (13200, 5))

In [104]:
y_train.shape

(13200, 5)