# Modeling 
---

## Load in our data for modeling and any necessary imports

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import (cross_val_score, train_test_split, 
                                     cross_val_predict, StratifiedKFold, GridSearchCV)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier

import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
from nltk.stem.porter import *
from nltk.stem import PorterStemmer

# Apply matplotlib's 'fivethirtyeight' style sheet to every figure drawn below.
plt.style.use('fivethirtyeight')

# Seed NumPy's global (legacy) random number generator.
# NOTE(review): presumably meant to make steps that fall back on the global
# NumPy RNG repeatable on a fresh Restart & Run All — confirm.
np.random.seed(72)

In [7]:
# Read the modeling table from the project's data directory.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks like
# a saved row index read back as data — confirm it is excluded from the features.
model_reviews = pd.read_csv(
    '../data/final_reviews_modeling.csv'
)

# Report the dimensions, then preview the first five rows.
print(model_reviews.shape)
model_reviews.head(n=5)

(79768, 116)


Unnamed: 0,num_players,critic_score,avg_user_score,user_score,target,console_ps4,console_ps5,console_switch,console_xbox-series-x,console_xboxone,...,souls,story,thing,things,think,time,times,want,way,world
0,30,97,8.3,6,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,30,97,8.3,10,0,1,0,0,0,0,...,0.0,0.0,0.0,0.417952,0.0,0.0,0.0,0.0,0.0,0.0
2,30,97,8.3,7,0,1,0,0,0,0,...,0.0,0.147287,0.0,0.229875,0.0,0.170985,0.0,0.0,0.0,0.0
3,30,97,8.3,10,0,1,0,0,0,0,...,0.0,0.0,0.0,0.637094,0.0,0.0,0.0,0.0,0.0,0.0
4,30,97,8.3,4,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.204101,0.0,0.174666,0.0


## We will be using a Logistic Regression and a Random Forest model for their interpretability: coefficients for the former and feature importances for the latter.

## Let's create a function to print the model score, the minimum and maximum cross-validation scores, and an approximate confidence interval (mean ± 2 standard deviations)

In [8]:
def model_scores(cv_scores, model, X, y, model_name):
    """Print a three-line score summary for a fitted model.

    The report contains, in order:
      1. ``model.score(X, y)`` labelled with ``model_name``;
      2. the smallest and largest values in ``cv_scores``;
      3. the mean of ``cv_scores`` plus/minus two standard deviations,
         labelled as the confidence interval.

    All numbers are formatted to four decimal places.

    Parameters
    ----------
    cv_scores : array-like exposing ``min``, ``max``, ``mean`` and ``std``
        Cross-validation scores to summarise (e.g. a NumPy array).
    model : object with a ``score(X, y)`` method
        The estimator to evaluate.
    X, y :
        Passed straight through to ``model.score``.
    model_name : str
        Label shown on the first line of the report.

    Returns
    -------
    None
        The summary is written to standard output only.
    """
    # Line 1: score of the model on the supplied data.
    fitted_score = model.score(X, y)
    print(f'Score from {model_name}: {fitted_score:.4f}')

    # Line 2: range of the cross-validation scores.
    lowest, highest = cv_scores.min(), cv_scores.max()
    print(f'Min and Max scores are: [{lowest:.4f}, {highest:.4f}]')

    # Line 3: centre and half-width (two standard deviations) of the interval.
    centre = cv_scores.mean()
    half_width = 2 * cv_scores.std()
    print(f'Confidence interval is : {centre:.4f} \u00B1 {half_width:.4f}')