<a href="https://colab.research.google.com/github/CurtCalledBurt/DS_Unit2_Build_Week/blob/master/Project_2_Day_6_Random_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Location of the games CSV. The default is the original author's local
# download; set the GAMES_CSV environment variable to point somewhere else
# (another machine, Colab, ...) instead of editing the notebook.
GAMES_CSV = os.environ.get('GAMES_CSV', '/Users/curtismckendrick/Downloads/games.csv')

df = pd.read_csv(GAMES_CSV)

In [0]:
# Show the column names exactly as they were loaded from the CSV.
df.columns

Index(['id', 'type', 'name', 'yearpublished', 'minplayers', 'maxplayers',
       'playingtime', 'minplaytime', 'maxplaytime', 'minage', 'users_rated',
       'average_rating', 'bayes_average_rating', 'total_owners',
       'total_traders', 'total_wanters', 'total_wishers', 'total_comments',
       'total_weights', 'average_weight'],
      dtype='object')

In [0]:
# from the glossary I have been able to deduce that:

# traders is either:
# 1. the number of people who own the game and are willing to trade it, or
# 2. the number of people who want the game through trade, not purchase

# wanters are those who want the game through trade, typically
# wishers are those who want to buy the game

# a weight on the site is an abstract measure of how complex a game is, 
# this number ranges from 1 to 5
# So,

# total_weights is either: 
# 1. the total number of people who have submitted a "weight" for the game or
# 2. the total number of all the weights submitted and added up,
# and

# average_weight is most likely an average of all the measures of weight
# submitted by users. This seems most likely as this column ranges from
# 1 to 5, like the weights themselves would.


# Map the run-together column names to snake_case equivalents
# (note that 'playingtime' is shortened to 'playtime').
column_names = dict(
    yearpublished='year_published',
    minplayers='min_players',
    maxplayers='max_players',
    playingtime='playtime',
    minplaytime='min_playtime',
    maxplaytime='max_playtime',
    minage='min_age',
)

# Columns that are not keys of the mapping keep their current names.
df = df.rename(columns=column_names)

In [0]:
# feature selection

# We try predicting 'average_rating' and see what happens with that.
target = 'average_rating'

# We could not know bayes_average_rating without already knowing the ratings,
# so keeping it as a feature would leak the target. Remove it.
things_we_cannot_know = ['bayes_average_rating']

# Columns excluded from the model inputs.
things_we_do_not_need = ['name', 'type', 'id']

# turns out max_playtime and playtime are the same column in all
# entries except for three of them, so we'll drop playtime
redundant = ['playtime']

all_in_all = things_we_cannot_know + things_we_do_not_need + redundant

features = df.columns.drop([target] + all_in_all)

# df[features] is a selection taken from df. Assigning a column on such a
# selection is what makes pandas raise SettingWithCopyWarning, because it is
# ambiguous whether df itself should change. Taking ONE explicit copy here
# gives X its own data, so the assignments below modify X only.
X = df[features].copy()
y = df[target]

# Candidate engineered features (currently disabled):
# X['min_age_15_or_higher'] = X['min_age'] >= 15
# X['total_interaction'] = X['total_owners'] + X['users_rated'] + X['total_traders'] + X['total_wanters'] + X['total_wishers'] + X['total_comments']
# X['total_owners_plus_raters'] = X['total_owners'] + X['users_rated']
# X['total_pos_interaction'] = X['total_owners'] + X['users_rated'] + X['total_comments']
# X['total_neg_interaction'] = X['total_traders'] + X['total_comments']
# X['hype'] = X['total_wanters'] + X['total_wishers'] + X['users_rated']
# X['mean_playtime'] = (X['min_playtime'] + X['max_playtime']) / 2
# X['new_game'] = X['year_published'] > 2010

# General pattern for standardizing a whole frame:
# normalized_df=(df-df.mean())/df.std()

# Standardize users_rated: subtract its mean and divide by its standard deviation.
# NOTE: the mean and std are taken over all rows, i.e. before the data is
# split into train / validation / test sets.
X['users_rated'] = (X['users_rated'] - X['users_rated'].mean()) / X['users_rated'].std()

In [0]:
# train/test/val split, step 1: hold out the test set.
# No test_size is passed, so train_test_split's default proportion is used;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2019,
)

In [0]:
# train/test/val split
# Both splits are derived from the untouched X / y instead of overwriting
# X_train with a piece of itself. With the fixed random_state the first call
# reproduces the same train/test partition every time, so this cell gives the
# same result no matter how often it is re-run (previously each re-run carved
# the training set down further).
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, random_state=2019)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, random_state=2019)

# sanity check: sizes of all six pieces
X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_val.shape, y_val.shape

((45738, 14), (45738,), (20328, 14), (20328,), (15246, 14), (15246,))

In [0]:
# quick and dirty baseline regressor
# note it returns a list the same size as the input

def baseline_Regressor(target):
  """Score a constant "always predict the mean" baseline against `target`.

  Parameters
  ----------
  target : object exposing ``.mean()`` and ``len()`` (e.g. a pandas Series)
      The true values; the baseline is built from and scored on these values.

  Returns
  -------
  tuple
      ``(pred_baseline, mae, mse, r2)`` where ``pred_baseline`` is a list with
      one entry per element of ``target``, every entry being the mean of
      ``target``, and the remaining items are the mean absolute error, mean
      squared error and R^2 score of that prediction. Nothing is printed.
  """
  from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

  # the single value the baseline predicts for every row
  constant_guess = target.mean()
  pred_baseline = [constant_guess for _ in range(len(target))]

  # evaluate the three metrics in a fixed order: MAE, MSE, R^2
  scorers = (mean_absolute_error, mean_squared_error, r2_score)
  mae, mse, r2 = (scorer(target, pred_baseline) for scorer in scorers)

  return pred_baseline, mae, mse, r2

In [0]:
# Build the mean-only baseline from the training targets and score it on them.
(y_pred_base, mae_base, mse_base, r2_base) = baseline_Regressor(y_train)

# display the three baseline scores (MAE, MSE, R^2)
(mae_base, mse_base, r2_base)

(2.711469150949111, 9.321468469749549, 0.0)

In [0]:
# Remember that eli5 doesn't play well with pipelines, so you need to make your
# model outside of a pipeline.
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
import category_encoders as ce
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

# Preprocessing gets its own pipeline; right now it holds a single step,
# a median imputer.
processor = make_pipeline(SimpleImputer(strategy='median'))

# Fit the preprocessing on the training split only, then apply the fitted
# steps unchanged to the validation split.
X_train_processed = processor.fit_transform(X_train)
X_val_processed = processor.transform(X_val)

# The estimator is built outside the preprocessing pipeline so that the bare
# model can be handed to eli5 later. (The variable is called `pipeline`, but
# it holds the RandomForestRegressor itself.)
rf_settings = dict(
    n_estimators=150,
    max_depth=15,
    min_samples_leaf=20,
    random_state=2019,
    n_jobs=-1,
)
pipeline = RandomForestRegressor(**rf_settings)

# Train on the processed training data; the trailing ';' hides the repr.
pipeline.fit(X_train_processed, y_train);
# y_pred_proba = pipeline.predict(X_val_processed)

In [0]:
# # Random forest model

# # more robust modelling
# # we'll start with a random forest before making our way to xgboost
# from sklearn.pipeline import make_pipeline
# from sklearn.impute import SimpleImputer
# import category_encoders as ce
# from sklearn.ensemble import RandomForestRegressor


# pipeline = make_pipeline(
#     ce.OneHotEncoder(),
#     SimpleImputer(strategy='median'),
#     RandomForestRegressor(n_estimators=150, random_state=2019, n_jobs=-1)
# )
# # Fit the model
# pipeline.fit(X_train, y_train);

In [0]:
# RandomForest error

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def error(X_train_processed, y_train, pipeline, train_test_or_val):
    """Print MAE, RMSE and R^2 of a fitted model on one data split.

    Despite the parameter names, the function works for whichever split is
    passed in; the names are kept for compatibility with existing calls.

    Parameters
    ----------
    X_train_processed : features passed to ``pipeline.predict``.
    y_train : true target values matching ``X_train_processed``.
    pipeline : fitted object exposing ``predict``.
    train_test_or_val : str
        Label inserted into the printed lines (e.g. 'Train', 'Val').

    Returns
    -------
    None
        The scores are only printed, not returned.
    """
    label = train_test_or_val

    # predictions for the split that was passed in
    predictions = pipeline.predict(X_train_processed)

    # mean absolute error
    print('MAE ' + label + ' Error: ', mean_absolute_error(y_train, predictions))

    # root mean squared error = sqrt of the mean squared error
    print('RMSE ' + label + ' Error: ', np.sqrt(mean_squared_error(y_train, predictions)))

    # coefficient of determination
    print(label + ' R^2 Score: ', r2_score(y_train, predictions))

In [0]:
# Scores of the random forest on the data it was trained on.
error(
    X_train_processed=X_train_processed,
    y_train=y_train,
    pipeline=pipeline,
    train_test_or_val='Train',
)

MAE Train Error:  0.5659301079857182
RMSE Train Error:  0.9546100749471084
Train R^2 Score:  0.9022385369700222


In [0]:
# Scores of the random forest on the held-out validation split.
error(
    X_train_processed=X_val_processed,
    y_train=y_val,
    pipeline=pipeline,
    train_test_or_val='Val',
)

# with just plain users_rated
# MAE Val Error:  0.5942241404812977
# RMSE Val Error:  0.9929736252074519
# Val R^2 Score:  0.894620317941877

# with plain and normalized users_rated
# MAE Val Error:  0.5942258345754675
# RMSE Val Error:  0.9929811116576077
# Val R^2 Score:  0.894618728931472

# with just normalized users_rated
# MAE Val Error:  0.5942250790023598
# RMSE Val Error:  0.9929742554108404
# Val R^2 Score:  0.8946201841807134

# with neither
# MAE Val Error:  1.2616824409958685
# RMSE Val Error:  1.8787108938546002
# Val R^2 Score:  0.6227742202496318

MAE Val Error:  0.5942422137467644
RMSE Val Error:  0.9930003536702487
Val R^2 Score:  0.8946146447301251


In [0]:
import eli5
from eli5.sklearn import PermutationImportance

# See how defining an eli5 permuter takes a model but no data? that's
# why your data encoders could be used in a pipeline, but your model couldn't;
# the model needs to get passed in at definition. The data does not.
#
# random_state is set (same seed as the splits and the forest) so that the
# shuffles, and therefore the reported importances, are repeatable from one
# run to the next.
permuter = PermutationImportance(
    pipeline,
    scoring='neg_mean_absolute_error',
    n_iter=5,
    random_state=2019
)

# Now the data gets passed in.
permuter.fit(X_val_processed, y_val);

In [0]:
# Column names in the same order as the columns of the processed arrays.
feature_names = X_val.columns.tolist()

# Importances as a labelled Series, largest first. Bound to a name so the
# result is kept (as a bare expression in the middle of the cell it was
# computed and then thrown away without being shown).
permutation_importances = pd.Series(
    permuter.feature_importances_, feature_names
).sort_values(ascending=False)

# Rendered importance table; as the last expression it is the cell's output.
eli5.show_weights(
    permuter,
    top=None,
    feature_names=feature_names
)

Weight,Feature
2.1692  ± 0.0394,users_rated
0.1312  ± 0.0056,total_wishers
0.1097  ± 0.0064,year_published
0.0680  ± 0.0009,average_weight
0.0558  ± 0.0019,total_wanters
0.0409  ± 0.0023,total_traders
0.0275  ± 0.0027,total_owners
0.0110  ± 0.0012,min_age
0.0092  ± 0.0008,max_players
0.0087  ± 0.0016,max_playtime


In [0]:
# we make another model using just the most effective feature, users_rated

# A fresh preprocessing pipeline (median imputer only) for the one-column input.
# Note that this rebinds `processor`.
processor = make_pipeline(SimpleImputer(strategy='median'))

# Pull users_rated out of each split as a single-column 2-D array, fit the
# imputer on the training column and apply it to the validation column.
X_train_users = processor.fit_transform(X_train['users_rated'].values.reshape(-1, 1))
X_val_users = processor.transform(X_val['users_rated'].values.reshape(-1, 1))

# Same forest settings as the full-feature model; the estimator again lives
# outside the preprocessing pipeline.
single_feature_settings = dict(
    n_estimators=150,
    max_depth=15,
    min_samples_leaf=20,
    random_state=2019,
    n_jobs=-1,
)
model = RandomForestRegressor(**single_feature_settings)

# Train on the single feature and keep the in-sample predictions.
model.fit(X_train_users, y_train);
y_pred = model.predict(X_train_users)

In [0]:
# Scores of the users_rated-only model on its training data.
error(
    X_train_processed=X_train_users,
    y_train=y_train,
    pipeline=model,
    train_test_or_val='Train',
)

MAE Train Error:  0.8215113717733912
RMSE Train Error:  1.2755288259049393
Train R^2 Score:  0.8254594980399964


In [0]:
# Scores of the users_rated-only model on the validation split.
error(
    X_train_processed=X_val_users,
    y_train=y_val,
    pipeline=model,
    train_test_or_val='Val',
)

MAE Val Error:  0.816878013284002
RMSE Val Error:  1.2657551576021593
Val R^2 Score:  0.8287696008811412
