In [9]:
# imports

import statistics
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import datetime
import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

from matplotlib.ticker import IndexLocator
import itertools
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.svm import SVC

In [2]:
# reading in files

# The data directory defaults to the original location, but can be overridden
# with the CAPSTONE_DATA_DIR environment variable so the notebook is not tied
# to a single machine's absolute path.
path = os.environ.get(
    "CAPSTONE_DATA_DIR",
    "/Users/jasonzhou/Documents/GitHub/NintendoTweets/Documents/Capstone3",
)
# The working directory is still changed so the relative file names below
# resolve against the data directory.
os.chdir(path)

# Training frames, one per game.
smashtraining = pd.read_csv('smashtraining.csv')
firetraining = pd.read_csv('firetraining.csv')
partytraining = pd.read_csv('partytraining.csv')

# Sample frames, one per game.
smashsamples = pd.read_csv('smashsamples.csv')
firesamples = pd.read_csv('firesamples.csv')
partysamples = pd.read_csv('partysamples.csv')

In [10]:
# function that creates a count vectorizer for a corpus, and returns the counts array,
# the feature names, and the vectorizer itself

def makeCountVec(df):
    """Fit a ``CountVectorizer`` on ``df['cleanedtext']`` and vectorize that column.

    Parameters
    ----------
    df : object indexable by the key ``'cleanedtext'``, yielding the documents
        to vectorize.

    Returns
    -------
    array : dense array of term counts produced by ``.toarray()``
    features : list of the vectorizer's feature names
    vectorizer : the fitted ``CountVectorizer``
    """
    # min_df=1 keeps every term that occurs in at least one document, which is
    # the same vocabulary the previous min_df=0 produced; recent scikit-learn
    # releases validate integer min_df values and reject 0.
    vectorizer = CountVectorizer(min_df=1)
    array = vectorizer.fit_transform(df['cleanedtext']).toarray()
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back for older installs. The result is
    # converted to a list so the return type matches the old method.
    if hasattr(vectorizer, 'get_feature_names_out'):
        features = list(vectorizer.get_feature_names_out())
    else:
        features = vectorizer.get_feature_names()
    return array, features, vectorizer

# helper that prints a classification report

def scores(y_test, y_pred_class):
    """Print scikit-learn's classification report for true vs. predicted labels.

    The display names 'positive tweets' and 'negative tweets' are passed, in
    that order, as ``target_names``.
    """
    report = classification_report(
        y_test,
        y_pred_class,
        target_names=['positive tweets', 'negative tweets'],
    )
    print(report)

In [4]:
# Vectorize each game's training frame: counts array, feature names, fitted vectorizer.
smashX, smashfeatures, smashvectorizer = makeCountVec(smashtraining)
fireX, firefeatures, firevectorizer = makeCountVec(firetraining)
partyX, partyfeatures, partyvectorizer = makeCountVec(partytraining)

# Target vectors are each frame's 'label' column.
smashy = smashtraining['label']
firey = firetraining['label']
partyy = partytraining['label']

In [5]:
# Drop the trailing rows (as many as there are rows in the matching samples
# frame) from each feature matrix and label vector.
#
# The number of rows to keep is computed from the source frames instead of
# slicing with [:-len(samples)], so that:
#   * re-running this cell does not strip additional rows each time, and
#   * an empty samples frame keeps everything ([:-0] would return nothing).
# max(0, ...) preserves the old "empty result" outcome if a samples frame were
# ever longer than its training frame.
smash_keep = max(0, len(smashtraining) - len(smashsamples))
fire_keep = max(0, len(firetraining) - len(firesamples))
party_keep = max(0, len(partytraining) - len(partysamples))

smashX = smashX[:smash_keep]
fireX = fireX[:fire_keep]
partyX = partyX[:party_keep]

# Need to do the same for the y vectors too (positional slice, as before)

smashy = smashy.iloc[:smash_keep]
firey = firey.iloc[:fire_keep]
partyy = partyy.iloc[:party_keep]

In [7]:
from sklearn.model_selection import train_test_split

# The same split settings are used for all three datasets: 30% held out, fixed
# seed. Each split is stratified on its own label vector.
split_options = dict(test_size=0.3, random_state=1)

Xtr1, Xte1, ytr1, yte1 = train_test_split(
    smashX, smashy, stratify=smashy, **split_options
)
Xtr2, Xte2, ytr2, yte2 = train_test_split(
    fireX, firey, stratify=firey, **split_options
)
Xtr3, Xte3, ytr3, yte3 = train_test_split(
    partyX, partyy, stratify=partyy, **split_options
)

# Feature Importance from Best Tree Models

In [11]:
# XGBClassifier for Smash Bros

smashXGB = xgb.XGBClassifier(
    max_depth=5,
    learning_rate=0.001,
    n_estimators=1000,
    # NOTE(review): 8052/458 looks like a majority/minority class-count ratio
    # for the training split -- confirm against ytr1 before reusing.
    scale_pos_weight=(8052/458),
)
smashXGB.fit(Xtr1, ytr1)

# Predict on the held-out split and print the classification report.
ypred = smashXGB.predict(Xte1)
scores(yte1, ypred)

                 precision    recall  f1-score   support

positive tweets       1.00      0.68      0.81      3452
negative tweets       0.15      0.98      0.26       196

       accuracy                           0.69      3648
      macro avg       0.57      0.83      0.53      3648
   weighted avg       0.95      0.69      0.78      3648



# Conclusions


Unfortunately, our modeling turned out to be a failure. Because negative tweets are so rare in our data, both the training data and the model needed to be rich enough to detect them accurately. In other words, our tolerance for false positives was extremely low for the models to be considered acceptable. The problem is that our bag-of-words representations lack the nuance to truly capture the sentiment of the original tweets. Moreover, TextBlob is most likely not an accurate labeler of our documents (tweets) either. TextBlob's sentiment analysis is based on a model trained on movie reviews. The language used in movie reviews includes many more clearly positive or negative words, whereas the language used in tweets is less focused and includes much more slang. Therefore, even a bag-of-words representation of a movie review would still retain enough information to reasonably evaluate its sentiment, which is not the case for a tweet. Moving forward, the first step toward models that accurately label tweets would be to assemble a sufficiently large training set of accurately labeled tweets.