# Modeling Workbook

### General League-wide Model & Individual Player Model
### Note: Because the models take a long time to run, much of this notebook has been converted to markdown and is not executed.

In [1]:
import pandas as pd
import json
import sklearn.metrics as metrics
from itertools import product

from sklearn.model_selection import train_test_split #from splitter import splitter

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier

from wrangle import wrangle_prep
from wrangle import wrangle_prep_player
from modeling import baseline_model_maker
from modeling import model_maker
from modeling import test_model

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
%load_ext autoreload
%autoreload



# I. General Player Model

### 1. Acquire wrangled (split, encoded and scaled) dataframe

In [2]:
# Load the prepared data: the full frame(s) plus train/validate/test feature and target splits.
df, df_outlier_3pt, X_train_exp, X_train, y_train, X_validate, y_validate, X_test, y_test = wrangle_prep()

Train = 47167 rows (56.0%) | Validate = 20215 rows (24.0%) | Test = 16846 rows (20.0%)


In [3]:
# Display the training feature frame to confirm the encoded/scaled columns.
X_train

Unnamed: 0,abs_time,play_time,since_rest,score_margin,points,games_played,tm_v2,distance,zone_Center,zone_L Above Break,zone_L Below Break/Corner,zone_L Center,zone_R Above Break,zone_R Below Break/Corner,zone_R Center,period_4
77133,0.639743,0.733971,0.086698,0.470588,0.000000,0.0375,0.350000,0.499298,0,0,1,0,0,0,0,1
10088,0.099224,0.114471,0.137397,0.522876,0.000000,0.2750,1.733049,0.732245,0,0,0,0,1,0,0,0
64509,0.204065,0.235421,0.182259,0.431373,0.078947,0.3875,0.430970,0.542206,0,0,0,0,1,0,0,0
28690,0.531426,0.531317,0.420068,0.431373,0.184211,0.2125,1.330409,0.556523,1,0,0,0,0,0,0,0
39117,0.017384,0.020056,0.025787,0.483660,0.000000,0.0500,1.565385,0.124159,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459,0.581706,0.509719,0.065543,0.437908,0.118421,0.0125,2.031250,0.283530,0,0,0,0,0,1,0,1
75741,0.684675,0.527306,0.147974,0.503268,0.131579,0.3750,1.313630,0.521001,0,0,0,0,1,0,0,1
38630,0.364269,0.281086,0.076850,0.320261,0.026316,0.0125,0.600000,0.607010,1,0,0,0,0,0,0,0
22961,0.730142,0.489972,0.040376,0.542484,0.118421,0.1375,1.890625,0.631258,0,0,0,1,0,0,0,1


### 2. Establish leaguewide model baseline

In [4]:
# Element 0 of the return value is the baseline accuracy table displayed below.
baseline_model_maker(y_train, y_validate)[0]

Unnamed: 0,model,train_accuracy,validate_accuracy
0,Baseline Model,0.642907,0.642938


In [5]:
# Element 1 of the return value is kept as the baseline accuracy passed to later modeling calls.
BASELINE_ACCURACY = baseline_model_maker(y_train, y_validate)[1]

### 3. Run the model program on the dataset using 7 model types (including ensemble models) - 125 models total across hyperparameter settings.

In [15]:
# Column names handed to the modeling helpers (presumably excluded from the features - confirm in modeling.py).
drop_list = ['abs_time', 'since_rest']

models = model_maker(
    X_train, y_train,
    X_validate, y_validate,
    drop_list,
    baseline_acc=BASELINE_ACCURACY,
)

# Only the models flagged as beating the baseline, highest validation accuracy first.
models.loc[models.better_than_baseline == True].sort_values('validate_accuracy', ascending=False)

### 4. Use the best model on the test set (note: the model must be entered manually)

# Run the test helper with all three splits; the result is stored so it can be displayed below.
output = test_model(X_train, y_train, X_validate, y_validate, X_test, y_test, drop_list, baseline_acc = BASELINE_ACCURACY)


output

### 5. Create predictions csv [turn to function for final in modeling.py]

# Fit a depth-6 decision tree on the training split and score the test split.
tree = DecisionTreeClassifier(max_depth=6, random_state=123)

tree = tree.fit(X_train, y_train)

y_tree_predict = tree.predict(X_test)

y_tree_proba = tree.predict_proba(X_test)

# One probability column per class label, in the order the tree reports its classes.
proba_df = pd.DataFrame(y_tree_proba, columns=tree.classes_.tolist()).round(4)

# Attach the predicted label now, while rows are still in X_test order.
# Assigning the raw array after the merge below would depend on the inner
# merge keeping every row, in the same order.
proba_df['predicted'] = y_tree_predict

# reset_index() exposes the original row label as an 'index' column and gives
# the frame a 0..n-1 index that lines up with proba_df for the concat.
reset_test = (pd.concat([X_test, y_test], axis = 1).reset_index())

test_proba_df = pd.concat([reset_test, proba_df], axis=1)

# Bring in the columns of df (e.g. player) by matching the original row label.
test_proba_df = test_proba_df.merge(df, how = 'inner', left_on = 'index', right_index = True)

# 'shot_result_x' is the test-split target; the merge suffixed it because df has the same column.
csv_df = test_proba_df[['player','index','Made Shot', 'Missed Shot', 'predicted','shot_result_x']]
csv_df = csv_df.rename(columns = {'shot_result_x':'actual'})

csv_df.predicted.value_counts()

csv_df.info()
csv_df.head()

csv_df.to_csv('predictions.csv')

#### Altogether:

from modeling import predictions_generator

# Single helper call for the prediction export.
# NOTE(review): assumed to wrap the step-by-step cells of section 5 - confirm in modeling.py.
predictions_generator(df, X_train, y_train, X_test, y_test)

# II. By-player model exploration

### 6. Single player test

# Player under test: Trae Young
player_id = 1629027

# Rebuild every frame and split for this one player (overwrites the league-wide names).
(df, df_outlier_3pt, X_train_exp,
 X_train, y_train,
 X_validate, y_validate,
 X_test, y_test) = wrangle_prep_player(player_id)

baseline_model_maker(y_train, y_validate)[0]

BASELINE_ACCURACY = baseline_model_maker(y_train, y_validate)[1]

models = model_maker(
    X_train, y_train,
    X_validate, y_validate,
    ['abs_time','play_time','since_rest'],
    baseline_acc=BASELINE_ACCURACY,
)

# Top row among the models flagged as beating this player's baseline.
models.loc[models.better_than_baseline == True].sort_values('validate_accuracy', ascending=False).head(1)

### 7. Analyzing all elite players for modeling

#Reset dataframes
df, df_outlier_3pt, X_train_exp, X_train, y_train, X_validate, y_validate, X_test, y_test = wrangle_prep()

#Create a Series of mean v2 scores, grouped by player
tm_v2_scores = df.groupby('player').tm_v2.mean()
#Calculate the std and mean
stddev = tm_v2_scores.std()
meanscore = tm_v2_scores.mean()
#Create an elite cutoff score at two standard deviations above the mean
elites = meanscore + 2 * stddev
#Names of the players whose mean score exceeds the cutoff
elite_names = tm_v2_scores[tm_v2_scores > elites].index

#All rows of df belonging to those players
elites_list = df[df.player.isin(elite_names)]

#De-duplicate id and name together so each id stays paired with its own name;
#calling unique() on each column separately and zipping only lines up when
#the id<->name mapping is strictly one-to-one.
elite_pairs = elites_list[['player_id', 'player']].drop_duplicates()
player_id_list = elite_pairs.player_id.values
player_name_list = elite_pairs.player.values
elites_tuple = list(zip(player_id_list, player_name_list))

# Collect one single-row frame per player and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies it every pass.
best_model_frames = []
for player_id, player_name in elites_tuple:
    print(player_name)
    # Rebuild the splits for this player only (overwrites the league-wide names).
    df, df_outlier_3pt, X_train_exp, X_train, y_train, X_validate, y_validate, X_test, y_test = wrangle_prep_player(player_id)
    # Element 1 of the return value is this player's baseline accuracy.
    BASELINE_ACCURACY = baseline_model_maker(y_train, y_validate)[1]
    models = model_maker(X_train, y_train, X_validate, y_validate, ['abs_time','play_time','since_rest'], baseline_acc = BASELINE_ACCURACY)
    # Best validation accuracy among models that beat the baseline. The frame is
    # empty when none do. assign() returns a new frame rather than writing
    # columns onto a filtered slice.
    best_model = (
        models[models.better_than_baseline == True]
        .sort_values('validate_accuracy', ascending = False)
        .head(1)
        .assign(baseline = BASELINE_ACCURACY, player = player_name)
    )
    best_model_frames.append(best_model)
# pd.concat raises on an empty list, so fall back to an empty frame when there were no players.
best_models = pd.concat(best_model_frames) if best_model_frames else pd.DataFrame()
best_models

# Index the summary by player and keep only the reporting columns, in display order.
best_models = (
    best_models
    .set_index('player')
    .loc[:, ['model','attributes','baseline','train_accuracy','validate_accuracy']]
)
best_models

Note: LeBron James had no models that performed better than baseline.

#### Altogether - Elites (Not Used on Test)

In [6]:
# Reset: reload the league-wide frames and train/validate/test splits.
df, df_outlier_3pt, X_train_exp, X_train, y_train, X_validate, y_validate, X_test, y_test = wrangle_prep()

Train = 47167 rows (56.0%) | Validate = 20215 rows (24.0%) | Test = 16846 rows (20.0%)


In [7]:
from modeling import best_model_elites

In [8]:
# Produce the per-player best-model summary for the elite players (train/validate only).
best_model_elites(df, X_train, y_train, X_validate, y_validate)

> Jaylen Brown
Train = 251 rows (56.0%) | Validate = 109 rows (24.0%) | Test = 91 rows (20.0%)
> CJ McCollum
Train = 259 rows (56.0%) | Validate = 112 rows (24.0%) | Test = 93 rows (20.0%)
> Devonte' Graham
Train = 308 rows (56.0%) | Validate = 132 rows (24.0%) | Test = 111 rows (20.0%)
> Lonzo Ball
Train = 143 rows (56.0%) | Validate = 62 rows (24.0%) | Test = 52 rows (20.0%)
> Zach LaVine
Train = 259 rows (56.0%) | Validate = 111 rows (24.0%) | Test = 93 rows (20.0%)
> Klay Thompson
Train = 162 rows (56.0%) | Validate = 70 rows (24.0%) | Test = 58 rows (20.0%)
> Stephen Curry
Train = 369 rows (56.0%) | Validate = 159 rows (24.0%) | Test = 133 rows (20.0%)
> Luke Kennard
Train = 232 rows (56.0%) | Validate = 100 rows (24.0%) | Test = 83 rows (20.0%)
> Paul George
Train = 142 rows (56.0%) | Validate = 62 rows (24.0%) | Test = 52 rows (20.0%)
> Carmelo Anthony
Train = 219 rows (56.0%) | Validate = 94 rows (24.0%) | Test = 79 rows (20.0%)
> LeBron James
Train = 235 rows (56.0%) | Validat

Unnamed: 0_level_0,model,attributes,baseline,train_accuracy,validate_accuracy,validate_improvement_over_baseline
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Jaylen Brown,KNNeighbors,n_neighbors = 12,0.63745,0.649402,0.706422,0.068972
CJ McCollum,Random Forest Classifier,leafs = 2 : depth = 3 : trees = 250,0.610039,0.660232,0.633929,0.02389
Devonte' Graham,Random Forest Classifier,leafs = 2 : depth = 4 : trees = 250,0.655844,0.704545,0.689394,0.03355
Lonzo Ball,Decision Tree Classifier,max_depth = 2,0.573427,0.671329,0.66129,0.087864
Zach LaVine,Decision Tree Classifier,max_depth = 3,0.610039,0.656371,0.63964,0.029601
Klay Thompson,KNNeighbors,n_neighbors = 13,0.604938,0.648148,0.628571,0.023633
Stephen Curry,Random Forest Classifier,leafs = 1 : depth = 3 : trees = 150,0.601626,0.626016,0.616352,0.014726
Luke Kennard,BaggingClassifier,estimator = KNeighborsClassifier,0.547414,0.706897,0.6,0.052586
Paul George,LogisticRegression,,0.647887,0.676056,0.709677,0.06179
Carmelo Anthony,Random Forest Classifier,leafs = 1 : depth = 4 : trees = 150,0.621005,0.744292,0.680851,0.059846


#### Altogether - Keldon Johnson

In [17]:
from modeling import player_model

In [18]:
# Produce the same best-model summary for a single named player.
player_model(df, X_train, y_train, X_validate, y_validate, 'Keldon Johnson')

> Keldon Johnson
Train = 221 rows (56.0%) | Validate = 95 rows (24.0%) | Test = 80 rows (20.0%)


Unnamed: 0_level_0,model,attributes,baseline,train_accuracy,validate_accuracy,validate_improvement_over_baseline
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Keldon Johnson,Random Forest Classifier,leafs = 2 : depth = 2 : trees = 300,0.597285,0.692308,0.663158,0.065873


# Join every split back onto df so each row carries the df columns (including
# 'player'). Columns present on both sides come back suffixed '_x' (split) and '_y' (df).
holder = []
for f in [X_train, pd.DataFrame(y_train), X_validate, pd.DataFrame(y_validate), X_test, pd.DataFrame(y_test)]:
    holder.append(f.merge(df, how = 'inner', left_index = True, right_index = True))

# Cut each merged frame down to one player's rows and to the columns of the original split.
# The list alternates X, y, X, y, X, y, so even positions are feature frames.
holder2 = []
for position, merged in enumerate(holder):
    x = merged[merged.player == 'Keldon Johnson']
    is_feature_frame = position % 2 == 0
    # 16 is the number of feature columns in X_train; the target frames have one column.
    x = x.iloc[:, :16] if is_feature_frame else x.iloc[:, :1]
    # Remove only a trailing '_x' merge suffix. str.strip('_x') would instead strip
    # any run of '_' or 'x' characters from BOTH ends of a column name.
    x.columns = x.columns.str.replace(r'_x$', '', regex=True)
    if not is_feature_frame:
        # Collapse the one-column target frame back to a Series.
        x = x.squeeze()
    holder2.append(x)

In [13]:
from modeling import test_player

In [16]:
# Evaluate the chosen model for one player on the test split.
# NOTE(review): BASELINE_ACCURACY here is the league-wide value (0.642907) set near the top of
# the notebook, not this player's own baseline (0.597285 in the player_model output); the
# reported beats_baseline_by of -0.017907 equals 0.625 - 0.642907. Confirm which baseline is intended.
test_player('Keldon Johnson',df, X_train, y_train, X_validate, y_validate, X_test, y_test, drop_list, baseline_acc = BASELINE_ACCURACY)

Unnamed: 0,model,attributes,train_accuracy,validate_accuracy,test_accuracy,better_than_baseline,beats_baseline_by
0,Random Forest,leafs = 1 ; depth = 3 ; trees = 150,0.773333,0.6,0.625,False,-0.017907
