# Nintendo Tweets Modeling

In [1]:
import statistics
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import datetime
import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Directory holding the CSV files read below. The default is the original
# author's location; set the NINTENDO_TWEETS_DIR environment variable to run the
# notebook on another machine without editing this cell.
path = os.environ.get(
    "NINTENDO_TWEETS_DIR",
    "/Users/jasonzhou/Documents/GitHub/NintendoTweets/Documents/Capstone3",
)
# The working directory is still changed so the relative file names below resolve.
os.chdir(path)

# Per-game training frames (the cells below read their 'cleanedtext' and 'label' columns).
smashtraining = pd.read_csv('smashtraining.csv')
firetraining = pd.read_csv('firetraining.csv')
partytraining = pd.read_csv('partytraining.csv')

# Per-game sample frames; their lengths decide how many trailing rows are held
# out of the training matrices later on.
smashsamples = pd.read_csv('smashsamples.csv')
firesamples = pd.read_csv('firesamples.csv')
partysamples = pd.read_csv('partysamples.csv')

In [3]:
def makeCountVec(df):
    """Fit a bag-of-words vectorizer on ``df['cleanedtext']`` and vectorize it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``cleanedtext`` column of documents.

    Returns
    -------
    array : numpy.ndarray
        Dense document-term count matrix, one row per row of ``df``.
    features : list of str
        Vocabulary terms, in the column order of ``array``.
    vectorizer : CountVectorizer
        The fitted vectorizer, so other text can be mapped onto the same columns.
    """
    # min_df=1 keeps every token, exactly as the previous min_df=0 did (any term
    # in the vocabulary occurs in at least one document), but recent
    # scikit-learn releases reject an integer 0 during parameter validation.
    vectorizer = CountVectorizer(min_df=1)
    array = vectorizer.fit_transform(df['cleanedtext']).toarray()
    # get_feature_names() was removed from scikit-learn; prefer its replacement
    # and only fall back to the old name on installations that lack it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        features = vectorizer.get_feature_names_out().tolist()
    else:
        features = vectorizer.get_feature_names()
    return array, features, vectorizer

In [4]:
# For each game: pull the label column, then build the count matrix, the
# vocabulary list and the fitted vectorizer from the same training frame.

# Smash Bros. Ultimate
smashy = smashtraining['label']
smashX, smashfeatures, smashvectorizer = makeCountVec(smashtraining)

# Fire Emblem: Three Houses
firey = firetraining['label']
fireX, firefeatures, firevectorizer = makeCountVec(firetraining)

# Super Mario Party
partyy = partytraining['label']
partyX, partyfeatures, partyvectorizer = makeCountVec(partytraining)

What we need to do here is remove the rows corresponding to the manually labeled samples, which we will ultimately use to validate the models on the accuracy of TextBlob. Because of our work during the preprocessing phase of the project, we just need to drop the last n rows of each X matrix, with n being the length of the corresponding sample.

We drop these rows after vectorizing rather than before so that each vectorizer is fit on every token in the data, including tokens that appear only in the held-out samples.

In [5]:
# Keep only the rows that precede the held-out samples (the last len(samples)
# rows of each matrix). The cut-off is computed from the training frame length
# instead of slicing with [:-n], so that
#   * an empty sample frame keeps every row (x[:-0] would return nothing), and
#   * re-running this cell does not trim a second batch of rows.
n_smash_keep = max(len(smashtraining) - len(smashsamples), 0)
n_fire_keep = max(len(firetraining) - len(firesamples), 0)
n_party_keep = max(len(partytraining) - len(partysamples), 0)

smashX = smashX[:n_smash_keep, :]
fireX = fireX[:n_fire_keep, :]
partyX = partyX[:n_party_keep, :]

In [6]:
# Drop the same trailing rows from the label vectors so they stay aligned with
# the X matrices. The cut-off is computed from the training frame length instead
# of slicing with [:-n], so an empty sample frame keeps every label and
# re-running this cell does not trim the labels a second time.
n_smash_keep = max(len(smashtraining) - len(smashsamples), 0)
n_fire_keep = max(len(firetraining) - len(firesamples), 0)
n_party_keep = max(len(partytraining) - len(partysamples), 0)

# .iloc makes the positional intent explicit.
smashy = smashy.iloc[:n_smash_keep]
firey = firey.iloc[:n_fire_keep]
partyy = partyy.iloc[:n_party_keep]

For reference, each game will be mapped to a number, in the interest of variable name lengths:

- Smash Bros. Ultimate: 1

- Fire Emblem: Three Houses: 2

- Super Mario Party: 3

In [7]:
from sklearn.model_selection import train_test_split

# 70/30 split per game, stratified on the labels, with a fixed seed so the
# partitions are repeatable.
# Suffix 1 = Smash Bros. Ultimate, 2 = Fire Emblem: Three Houses, 3 = Super Mario Party.
Xtr1, Xte1, ytr1, yte1 = train_test_split(
    smashX, smashy, stratify=smashy, test_size=0.3, random_state=1
)
Xtr2, Xte2, ytr2, yte2 = train_test_split(
    fireX, firey, stratify=firey, test_size=0.3, random_state=1
)
Xtr3, Xte3, ytr3, yte3 = train_test_split(
    partyX, partyy, stratify=partyy, test_size=0.3, random_state=1
)

Now that we have training and testing data for each of our collection of tweets, I'm going to be looking at the baseline performances of four models on each of our data sets. The four models are the following:

- Logistic Regression
- Random Forest Classifier
- XGBClassifier
- SVC

I'm going to determine the performance of each of these models using a baseline version of each. Each different type of model will be tested on each of our sets of data, which means we are going to end up with 12 sets of results in the end, that will be compiled into a table. 

The helper functions defined below not only create the baseline models but also evaluate their performance by printing classification metrics; a helper for plotting confusion matrices is included as well.

In [8]:
from matplotlib.ticker import IndexLocator
import itertools

def plot_cm(y_test,y_pred_class,classes=['NON-default','DEFAULT']):
    """Plot the confusion matrix of ``y_pred_class`` against ``y_test`` as a heatmap.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred_class : array-like
        Predicted labels.
    classes : sequence of str
        Tick labels for the matrix rows/columns, in the row/column order of
        ``confusion_matrix``. If the number of names does not match the matrix
        size, the class indices are used as labels instead.

    Returns
    -------
    None. A new matplotlib figure is created and drawn on.
    """
    cm = confusion_matrix(y_test, y_pred_class)
    n_classes = cm.shape[0]

    # One tick per matrix cell, so the plot works for any number of classes
    # rather than only a 2x2 matrix.
    ticks = list(range(n_classes))
    if len(classes) == n_classes:
        labels = list(classes)
    else:
        labels = [str(idx) for idx in ticks]

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    fig.colorbar(im, ax=ax)
    ax.set(title="Confusion Matrix",
           xticks=ticks,
           yticks=ticks,
           xticklabels=labels,
           yticklabels=labels,
           ylabel='True label',
           xlabel='Predicted label')

    # Print each count in its cell; switch to white text on the darker cells.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        ax.text(j, i, format(cm[i, j], 'd'),
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")

    # Called last so the axis labels are included in the layout computation.
    fig.tight_layout()
    
def scores(y_test, y_pred_class):
    """Print accuracy, precision, recall and F1 for the predictions, to three decimals."""
    # (printed label, metric function) pairs, in output order.
    report = (
        ('Classification Accuracy: ', accuracy_score),
        ('Precision score: ', precision_score),
        ('Recall score: ', recall_score),
        ('F1 score: ', f1_score),
    )
    for heading, metric in report:
        value = metric(y_test, y_pred_class)
        print(heading, format(value, '.3f'))

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.svm import SVC

def logiRegr(X_train, y_train, X_test, y_test,**kwargs):
    """Fit a logistic regression on the training split and print its test metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features and labels; predictions for ``X_test`` are scored
        against ``y_test``.
    **kwargs
        Forwarded to ``LogisticRegression`` (e.g. unpack a grid search's
        ``best_params_``). ``random_state`` defaults to 1 and may be overridden.

    Returns
    -------
    None. The metrics are printed by ``scores``.
    """
    # setdefault instead of a hard-coded keyword: a caller-supplied random_state
    # used to fail with "got multiple values for keyword argument".
    kwargs.setdefault('random_state', 1)
    logreg = LogisticRegression(**kwargs)
    logreg.fit(X_train, y_train)
    # Class predictions (not predicted probabilities).
    y_pred_class = logreg.predict(X_test)
    scores(y_test, y_pred_class)
    
def randomForest(X_train, y_train, X_test, y_test,**kwargs):
    """Fit a random forest on the training split and print its test metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features and labels; predictions for ``X_test`` are scored
        against ``y_test``.
    **kwargs
        Forwarded to ``RandomForestClassifier`` (e.g. unpack a grid search's
        ``best_params_``). ``random_state`` defaults to 1 and may be overridden.

    Returns
    -------
    None. The metrics are printed by ``scores``.
    """
    # setdefault instead of a hard-coded keyword: a caller-supplied random_state
    # used to fail with "got multiple values for keyword argument".
    kwargs.setdefault('random_state', 1)
    rf = RandomForestClassifier(**kwargs)
    rf.fit(X_train, y_train)
    # Class predictions.
    y_pred_class = rf.predict(X_test)
    scores(y_test, y_pred_class)
    
def xgbClass(X_train, y_train, X_test, y_test,**kwargs):
    """Fit an XGBoost classifier on the training split and print its test metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features and labels; predictions for ``X_test`` are scored
        against ``y_test``.
    **kwargs
        Forwarded to ``xgb.XGBClassifier`` (e.g. unpack a grid search's
        ``best_params_``). ``random_state`` defaults to 1 and may be overridden.

    Returns
    -------
    None. The metrics are printed by ``scores``.
    """
    # The scikit-learn wrapper's seeding parameter is `random_state`; the legacy
    # `seed` keyword is no longer used, which also matches the other helpers.
    # setdefault lets a caller override the seed instead of hitting a
    # duplicate-keyword TypeError.
    kwargs.setdefault('random_state', 1)
    xg = xgb.XGBClassifier(**kwargs)
    xg.fit(X_train, y_train)
    # Class predictions.
    y_pred_class = xg.predict(X_test)
    scores(y_test, y_pred_class)
    
def svmClass(X_train, y_train, X_test, y_test, **kwargs):
    """Fit a support vector classifier on the training split and print its test metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features and labels; predictions for ``X_test`` are scored
        against ``y_test``.
    **kwargs
        Forwarded to ``SVC`` (e.g. unpack a grid search's ``best_params_``).
        ``random_state`` defaults to 1 and may be overridden.

    Returns
    -------
    None. The metrics are printed by ``scores``.
    """
    # setdefault instead of a hard-coded keyword: a caller-supplied random_state
    # used to fail with "got multiple values for keyword argument".
    kwargs.setdefault('random_state', 1)
    svm = SVC(**kwargs)
    svm.fit(X_train, y_train)
    # Class predictions.
    y_pred_class = svm.predict(X_test)
    scores(y_test, y_pred_class)

Here we create and evaluate baseline models for each game: 4 types of models for 3 games. Feel free to skip the lengthy output; a table at the end summarizes the results.

In [10]:
# Baseline logistic regression, game 1 (Smash Bros. Ultimate).
logiRegr(X_train=Xtr1, y_train=ytr1, X_test=Xte1, y_test=yte1)

Classification Accuracy:  0.982
Precision score:  0.986
Recall score:  0.996
F1 score:  0.991


In [11]:
# Baseline random forest, game 1 (Smash Bros. Ultimate).
randomForest(X_train=Xtr1, y_train=ytr1, X_test=Xte1, y_test=yte1)

Classification Accuracy:  0.983
Precision score:  0.990
Recall score:  0.992
F1 score:  0.991


In [12]:
# Baseline XGBoost classifier, game 1 (Smash Bros. Ultimate).
xgbClass(X_train=Xtr1, y_train=ytr1, X_test=Xte1, y_test=yte1)

Classification Accuracy:  0.982
Precision score:  0.988
Recall score:  0.993
F1 score:  0.990


In [13]:
# Baseline support vector classifier, game 1 (Smash Bros. Ultimate).
svmClass(X_train=Xtr1, y_train=ytr1, X_test=Xte1, y_test=yte1)

Classification Accuracy:  0.983
Precision score:  0.987
Recall score:  0.996
F1 score:  0.991


In [14]:
# Baseline logistic regression, game 2 (Fire Emblem: Three Houses).
logiRegr(X_train=Xtr2, y_train=ytr2, X_test=Xte2, y_test=yte2)

Classification Accuracy:  0.983
Precision score:  0.983
Recall score:  1.000
F1 score:  0.992


In [15]:
# Baseline random forest, game 2 (Fire Emblem: Three Houses).
randomForest(X_train=Xtr2, y_train=ytr2, X_test=Xte2, y_test=yte2)

Classification Accuracy:  0.986
Precision score:  0.990
Recall score:  0.995
F1 score:  0.993


In [16]:
# Baseline XGBoost classifier, game 2 (Fire Emblem: Three Houses).
xgbClass(X_train=Xtr2, y_train=ytr2, X_test=Xte2, y_test=yte2)

Classification Accuracy:  0.991
Precision score:  0.995
Recall score:  0.995
F1 score:  0.995


In [17]:
# Baseline support vector classifier, game 2 (Fire Emblem: Three Houses).
svmClass(X_train=Xtr2, y_train=ytr2, X_test=Xte2, y_test=yte2)

Classification Accuracy:  0.983
Precision score:  0.983
Recall score:  1.000
F1 score:  0.992


In [18]:
# Baseline logistic regression, game 3 (Super Mario Party).
logiRegr(X_train=Xtr3, y_train=ytr3, X_test=Xte3, y_test=yte3)

Classification Accuracy:  0.981
Precision score:  0.985
Recall score:  0.995
F1 score:  0.990


In [19]:
# Baseline random forest, game 3 (Super Mario Party).
randomForest(X_train=Xtr3, y_train=ytr3, X_test=Xte3, y_test=yte3)

Classification Accuracy:  0.984
Precision score:  0.992
Recall score:  0.991
F1 score:  0.992


In [20]:
# Baseline XGBoost classifier, game 3 (Super Mario Party).
xgbClass(X_train=Xtr3, y_train=ytr3, X_test=Xte3, y_test=yte3)

Classification Accuracy:  0.982
Precision score:  0.988
Recall score:  0.993
F1 score:  0.991


In [21]:
# Baseline support vector classifier, game 3 (Super Mario Party).
svmClass(X_train=Xtr3, y_train=ytr3, X_test=Xte3, y_test=yte3)

Classification Accuracy:  0.982
Precision score:  0.986
Recall score:  0.995
F1 score:  0.991


Summary of Classification Accuracies

Game1: Smash Bros Ultimate

Game2: Fire Emblem: Three Houses

Game3: Super Mario Party


|      |Game1 |Game2 |Game3 |
|------|------|------|------|
|LogReg|0.982 |0.983 |0.981 |
|RanFor|0.983 |0.986 |0.984 |
|XGB   |0.982 |0.991 |0.982 |
|SVM   |0.983 |0.983 |0.982 |

# Hyperparameter Tuning

In [22]:
def printGridResult(grid_result):
    """Print the best score/parameters of a fitted search, then one line per candidate.

    ``grid_result`` must expose ``best_score_``, ``best_params_`` and a
    ``cv_results_`` mapping with 'mean_test_score', 'std_test_score' and
    'params' entries. Each candidate line shows the mean score, the standard
    deviation in parentheses, and the parameter setting.
    """
    headline = "Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_)
    print(headline)

    results = grid_result.cv_results_
    wanted = ('mean_test_score', 'std_test_score', 'params')
    per_candidate = zip(*[results[key] for key in wanted])
    for avg, spread, setting in per_candidate:
        print("%f (%f) with: %r" % (avg, spread, setting))

WARNING: Long walls of outputs incoming. 12 sets of hyperparameter tuning performed, one for each model for each game (4x3). Results summarized at the end.

In [23]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

# Logistic-regression search space: three solvers, L2 penalty only, and five
# regularization strengths.
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
grid = {'solver': solvers, 'penalty': penalty, 'C': c_values}

model = LogisticRegression()

# 5-fold stratified CV repeated 3 times with a fixed seed. error_score=0 records
# a failed fit as a score of 0 instead of raising.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score=0,
)

# Game 1 (Smash Bros. Ultimate) training split.
grid_result = grid_search.fit(Xtr1, ytr1)
printGridResult(grid_result)

Best: 0.986237 using {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.986121 (0.002538) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.986121 (0.002538) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.986237 (0.002527) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.984376 (0.002980) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.984376 (0.002980) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.984415 (0.002944) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.978716 (0.002111) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.978716 (0.002111) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.978832 (0.002079) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.963984 (0.002585) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.963984 (0.002585) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.965031 (0.002775) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.946150 (0.000270) wit

In [24]:
# Re-run the grid search currently held in `grid_search` (logistic regression,
# when the cells are run in order) on the game 2 training split.
grid_result = grid_search.fit(X=Xtr2, y=ytr2)
printGridResult(grid_result)

Best: 0.987132 using {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.987130 (0.004855) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.987130 (0.004855) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.987132 (0.005820) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.985775 (0.003797) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.985775 (0.003797) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.986453 (0.004000) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.983067 (0.004778) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.983067 (0.004778) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.985436 (0.003642) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.981710 (0.002468) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.981710 (0.002468) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.981710 (0.002468) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.981710 (0.002468) wit

In [25]:
# Re-run the grid search currently held in `grid_search` (logistic regression,
# when the cells are run in order) on the game 3 training split.
grid_result = grid_search.fit(X=Xtr3, y=ytr3)
printGridResult(grid_result)

Best: 0.985996 using {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.985996 (0.002284) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.985996 (0.002284) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.985958 (0.002304) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.985002 (0.002136) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.985002 (0.002136) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.985116 (0.002172) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.979607 (0.002522) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.979607 (0.002522) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.979683 (0.002478) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.965526 (0.002802) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.965526 (0.002802) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.966062 (0.002481) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.946319 (0.000279) wit

In [None]:
# Random-forest search. The estimator is seeded (same seed as the baseline
# randomForest helper) so that the reported tuning scores can be reproduced;
# an unseeded forest gives different scores on every run.
model = RandomForestClassifier(random_state=1)
n_estimators = [10, 100, 1000]
max_features = [None, 'sqrt', 'log2']
grid = dict(n_estimators=n_estimators,max_features=max_features)

# 3-fold stratified CV repeated 3 times with a fixed seed. error_score=0 records
# a failed fit as a score of 0 instead of raising.
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)

# Game 1 (Smash Bros. Ultimate) training split.
grid_result = grid_search.fit(Xtr1, ytr1)
printGridResult(grid_result)

In [None]:
# Re-run the grid search currently held in `grid_search` (random forest, when
# the cells are run in order) on the game 2 training split.
grid_result = grid_search.fit(X=Xtr2, y=ytr2)
printGridResult(grid_result)

In [None]:
# Re-run the grid search currently held in `grid_search` (random forest, when
# the cells are run in order) on the game 3 training split.
grid_result = grid_search.fit(X=Xtr3, y=ytr3)
printGridResult(grid_result)

In [None]:
# XGBoost search space: number of boosting rounds crossed with learning rate.
model = xgb.XGBClassifier()
n_estimators = [10, 100, 1000]
learning_rate = [0.001, 0.01, 0.1]
grid = {'learning_rate': learning_rate, 'n_estimators': n_estimators}

# 3-fold stratified CV repeated 3 times with a fixed seed; candidates are
# evaluated one at a time (n_jobs=1). error_score=0 records a failed fit as a
# score of 0 instead of raising.
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=1,
    error_score=0,
)

# Game 1 (Smash Bros. Ultimate) training split.
grid_result = grid_search.fit(Xtr1, ytr1)
printGridResult(grid_result)

In [None]:
# Re-run the grid search currently held in `grid_search` (XGBoost, when the
# cells are run in order) on the game 2 training split.
grid_result = grid_search.fit(X=Xtr2, y=ytr2)
printGridResult(grid_result)

In [None]:
# Re-run the grid search currently held in `grid_search` (XGBoost, when the
# cells are run in order) on the game 3 training split.
grid_result = grid_search.fit(X=Xtr3, y=ytr3)
printGridResult(grid_result)

In [None]:
# SVC search space: three kernels crossed with five values of C; gamma is fixed
# to 'scale'.
model = SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']
grid = {'kernel': kernel, 'C': C, 'gamma': gamma}

# 3-fold stratified CV repeated 3 times with a fixed seed; candidates are
# evaluated one at a time (n_jobs=1). error_score=0 records a failed fit as a
# score of 0 instead of raising.
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=1,
    error_score=0,
)

# Game 1 (Smash Bros. Ultimate) training split.
grid_result = grid_search.fit(Xtr1, ytr1)
printGridResult(grid_result)

In [None]:
# Re-run the grid search currently held in `grid_search` (SVC, when the cells
# are run in order) on the game 2 training split.
grid_result = grid_search.fit(X=Xtr2, y=ytr2)
printGridResult(grid_result)

In [None]:
# Re-run the grid search currently held in `grid_search` (SVC, when the cells
# are run in order) on the game 3 training split.
grid_result = grid_search.fit(X=Xtr3, y=ytr3)
printGridResult(grid_result)

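Taking the best hyperparameters for each model per game, our classification accuracy scores are now the following. Note that these are mean cross-validated accuracies on the training split, so they are not directly comparable to the baseline scores, which were measured on the held-out test split: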

|      |Game1 |Game2 |Game3 |
|------|------|------|------|
|LogReg|0.986 |0.987 |0.986 |
|RanFor|0.985 |0.988 |0.960 |
|XGB   |0.979 |0.982 |0.957 |
|SVM   |0.983 |0.985 |0.960 |


The table of baseline model scores is repeated below for comparison:

|      |Game1 |Game2 |Game3 |
|------|------|------|------|
|LogReg|0.982 |0.983 |0.981 |
|RanFor|0.983 |0.986 |0.984 |
|XGB   |0.982 |0.991 |0.982 |
|SVM   |0.983 |0.983 |0.982 |

Across the models we've built, it's now time to select the model that scores highest per game. 

- Game 1 (Super Smash Bros Ultimate): Logistic Regression with hyperparameters tuned
- Game 2 (Fire Emblem: Three Houses): Random Forest with hyperparameters tuned
- Game 3 (Super Mario Party): Logistic Regression with hyperparameters tuned


Let's build each one with the optimal hyperparameters, starting with Smash Bros.

In [None]:
# Smash Bros. Ultimate: logistic regression with the hyperparameters chosen by
# the grid search (C=100, L2 penalty, liblinear solver).
smashmodel = LogisticRegression(solver='liblinear', penalty='l2', C=100)
smashmodel.fit(Xtr1, ytr1)

# Score the predictions on the held-out test split and plot the confusion matrix.
ypred = smashmodel.predict(Xte1)
scores(yte1, ypred)
plot_cm(yte1, ypred)

In [None]:
# Fire Emblem: Three Houses: random forest with tuned hyperparameters.
# random_state is fixed (same seed as the baseline randomForest helper) so that
# the scores and confusion matrix below are reproducible between runs.
firemodel = RandomForestClassifier(max_features=None, n_estimators=10, random_state=1)
firemodel.fit(Xtr2, ytr2)

# Score the predictions on the held-out test split and plot the confusion matrix.
ypred = firemodel.predict(Xte2)
scores(yte2, ypred)
plot_cm(yte2,ypred)

In [None]:
# Super Mario Party: logistic regression with the hyperparameters chosen by the
# grid search (C=100, L2 penalty, newton-cg solver).
partymodel = LogisticRegression(solver='newton-cg', penalty='l2', C=100)
partymodel.fit(Xtr3, ytr3)

# Score the predictions on the held-out test split and plot the confusion matrix.
ypred = partymodel.predict(Xte3)
scores(yte3, ypred)
plot_cm(yte3, ypred)

In [None]:
def makeCountVecSample(df, vectorizer=None):
    """Vectorize ``df['Cleaned']`` into a dense document-term count matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Cleaned`` column of documents.
    vectorizer : fitted CountVectorizer, optional
        Vectorizer whose vocabulary defines the output columns. Pass the
        vectorizer that produced a model's training matrix (e.g.
        ``smashvectorizer``) so the result can be fed to that model.
        When omitted, a new vectorizer is fit on ``df`` itself, as before; the
        resulting columns then follow the sample's own vocabulary and will not
        line up with a matrix built by a different vectorizer.

    Returns
    -------
    numpy.ndarray
        Dense count matrix with one row per row of ``df``.
    """
    if vectorizer is None:
        # min_df=1 keeps every token, exactly as the previous min_df=0 did, but
        # recent scikit-learn releases reject an integer 0 during parameter
        # validation.
        vectorizer = CountVectorizer(min_df=1)
        vectorizer.fit(df['Cleaned'])
    return vectorizer.transform(df['Cleaned']).toarray()

In [None]:
# Vectorize the samples with the vectorizer fitted on the Smash training data,
# so the columns match the matrix the Smash models were trained on. Fitting a
# fresh vectorizer on the samples alone would give a different vocabulary and
# therefore a different set of columns.
# NOTE(review): assumes 'Cleaned' holds text cleaned the same way as the
# training frame's 'cleanedtext' column — confirm against preprocessing.
smashsampletest = smashvectorizer.transform(smashsamples['Cleaned']).toarray()

In [None]:
# Shape check: (number of sample rows, number of count-vector columns).
smashsampletest.shape