In [12]:
import pandas as pd
import json
import sklearn.metrics as metrics
from itertools import product

from sklearn.model_selection import train_test_split #from splitter import splitter

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier

from wrangle import wrangle_prep
from wrangle import wrangle_prep_player
from modeling import baseline_model_maker
from modeling import model_maker
from modeling import test_model

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# I. General Player Model

### 1. Acquire wrangled (split, encoded and scaled) dataframe

In [4]:
# Load the league-wide data, already split, encoded and scaled by wrangle.py.
(
    df,
    df_outlier_3pt,
    X_train_exp,
    X_train,
    y_train,
    X_validate,
    y_validate,
    X_test,
    y_test,
) = wrangle_prep()

Train = 47167 rows (56.0%) | Validate = 20215 rows (24.0%) | Test = 16846 rows (20.0%)


In [17]:
# Inspect the training features returned by wrangle_prep().
X_train

Unnamed: 0,abs_time,play_time,since_rest,score_margin,points,games_played,tm_v2,distance,zone_Center,zone_L Above Break,zone_L Below Break/Corner,zone_L Center,zone_R Above Break,zone_R Below Break/Corner,zone_R Center,period_4
77133,0.639743,0.733971,0.086698,0.470588,0.000000,3,0.350000,0.499298,0,0,1,0,0,0,0,1
10088,0.099224,0.114471,0.137397,0.522876,0.000000,22,1.733049,0.732245,0,0,0,0,1,0,0,0
64509,0.204065,0.235421,0.182259,0.431373,0.078947,31,0.430970,0.542206,0,0,0,0,1,0,0,0
28690,0.531426,0.531317,0.420068,0.431373,0.184211,17,1.330409,0.556523,1,0,0,0,0,0,0,0
39117,0.017384,0.020056,0.025787,0.483660,0.000000,4,1.565385,0.124159,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459,0.581706,0.509719,0.065543,0.437908,0.118421,1,2.031250,0.283530,0,0,0,0,0,1,0,1
75741,0.684675,0.527306,0.147974,0.503268,0.131579,30,1.313630,0.521001,0,0,0,0,1,0,0,1
38630,0.364269,0.281086,0.076850,0.320261,0.026316,1,0.600000,0.607010,1,0,0,0,0,0,0,0
22961,0.730142,0.489972,0.040376,0.542484,0.118421,11,1.890625,0.631258,0,0,0,1,0,0,0,1


### 2. Establish leaguewide model baseline

In [6]:
# First element of the baseline result: the one-row summary table.
baseline_model_maker(y_train, y_validate)[0]

Unnamed: 0,model,train_accuracy,validate_accuracy
0,Baseline Model,0.642907,0.642938


In [7]:
# Second element of the baseline result; passed to the modeling helpers as baseline_acc.
BASELINE_ACCURACY = baseline_model_maker(y_train, y_validate)[1]

### 3. Run the model program on the dataset using 7 model types (including ensemble models) - 125 models in total across hyperparameter settings.

In [2]:
# Columns handed to model_maker / test_model as their drop_list argument.
# NOTE(review): presumably these features are excluded before fitting — confirm in modeling.py.
drop_list = ['abs_time','since_rest']

In [None]:
# Run the model program on the train and validate splits.
models = model_maker(
    X_train,
    y_train,
    X_validate,
    y_validate,
    drop_list,
    baseline_acc=BASELINE_ACCURACY,
)

In [None]:
# Show only the models that beat the baseline, best validate accuracy first.
(
    models
    .loc[models.better_than_baseline == True]
    .sort_values(by='validate_accuracy', ascending=False)
)

### 4. Use the best model on the test set (note: the best model must be entered manually)

In [8]:
# Score the chosen model on the held-out test split.
output = test_model(
    X_train,
    y_train,
    X_validate,
    y_validate,
    X_test,
    y_test,
    drop_list,
    baseline_acc=BASELINE_ACCURACY,
)


In [9]:
# Display the train / validate / test accuracy table returned by test_model.
output

Unnamed: 0,model,attributes,train_accuracy,validate_accuracy,test_accuracy,better_than_baseline,beats_baseline_by
0,Decision Tree,max_depth = 6,0.65726,0.654563,0.649056,True,0.006149


### 5. Create predictions csv [to be turned into a function in modeling.py for the final version]

In [None]:
# Same model type and depth as the best model reported by test_model
# (Decision Tree, max_depth = 6); random_state is fixed for repeatable fits.
tree = DecisionTreeClassifier(max_depth=6, random_state=123)

In [None]:
# Fit on every column of X_train.
# NOTE(review): test_model was given drop_list, so the model scored there may
# have been trained without those columns — confirm the two are consistent.
tree = tree.fit(X_train, y_train)

In [None]:
# Predicted class label for every row of the test split.
y_tree_predict = tree.predict(X_test)

In [None]:
# Per-class probabilities for every row of the test split (one column per class in tree.classes_).
y_tree_proba = tree.predict_proba(X_test)

In [None]:
# Probability table: one column per class label, rounded to four decimals.
proba_df = pd.DataFrame(
    data=y_tree_proba,
    columns=tree.classes_.tolist(),
)
proba_df = proba_df.round(4)

In [None]:
# Put test features and target side by side, then move the original row
# labels into an 'index' column so the frame has a plain 0..n-1 index.
reset_test = pd.concat([X_test, y_test], axis='columns')
reset_test = reset_test.reset_index()

In [None]:
# Append the class-probability columns to the test rows.
test_proba_df = pd.concat(
    [reset_test, proba_df],
    axis='columns',
)

In [None]:
# Bring back the original (unencoded) columns of df by matching the saved
# row label in 'index' against df's own index.
test_proba_df = pd.merge(
    test_proba_df,
    df,
    how='inner',
    left_on='index',
    right_index=True,
)

In [None]:
# Attach predictions by original row label rather than by position. The frame
# has been through an inner merge, which can drop or repeat rows; assigning the
# raw array positionally would then fail or misalign. The 'index' column holds
# the original X_test row labels (assumes those labels are unique).
test_proba_df['predicted'] = test_proba_df['index'].map(
    pd.Series(y_tree_predict, index=X_test.index)
)

In [None]:
# Keep the player, row label, class probabilities, prediction and the
# 'shot_result_x' column, which is exported under the name 'actual'.
csv_df = (
    test_proba_df
    .loc[:, ['player','index','Made Shot', 'Missed Shot', 'predicted','shot_result_x']]
    .rename(columns={'shot_result_x': 'actual'})
)

In [None]:
# How many rows were predicted as each class.
csv_df['predicted'].value_counts()

In [None]:
# Check dtypes and non-null counts, then preview the first rows before export.
csv_df.info()
csv_df.head()

In [None]:
# Write the predictions to predictions.csv in the current working directory
# (the DataFrame index is written as the first, unnamed column).
csv_df.to_csv('predictions.csv')

#### Altogether:

In [None]:
from modeling import predictions_generator

In [None]:
# Call the modeling.py helper with the full frame and the train/test splits.
# NOTE(review): presumably this wraps the step-by-step cells of section 5 — confirm in modeling.py.
predictions_generator(df, X_train, y_train, X_test, y_test)

# II. By-player model exploration

### 6. Single player test

In [None]:
# player_id for Trae Young; passed to wrangle_prep_player to build his per-player splits.
player_id = 1629027

In [None]:
# Rebuild every frame and split for the single selected player.
# This overwrites the league-wide variables of the same names.
(
    df,
    df_outlier_3pt,
    X_train_exp,
    X_train,
    y_train,
    X_validate,
    y_validate,
    X_test,
    y_test,
) = wrangle_prep_player(player_id)

In [None]:
# Baseline summary table for the player-specific splits.
baseline_model_maker(y_train, y_validate)[0]

In [None]:
# Replace the league-wide baseline value with the one computed from this player's splits.
BASELINE_ACCURACY = baseline_model_maker(y_train, y_validate)[1]

In [None]:
# Run the model program on the player's splits; play_time is added to the
# drop list used for the league-wide run.
models = model_maker(
    X_train,
    y_train,
    X_validate,
    y_validate,
    ['abs_time','play_time','since_rest'],
    baseline_acc=BASELINE_ACCURACY,
)

In [None]:
# The single best model for this player among those beating the baseline.
(
    models
    .loc[models.better_than_baseline == True]
    .sort_values(by='validate_accuracy', ascending=False)
    .head(1)
)

### 7. Analyzing all elite players for modeling

In [None]:
# Restore the league-wide frames and splits after the single-player run
# overwrote them.
(
    df,
    df_outlier_3pt,
    X_train_exp,
    X_train,
    y_train,
    X_validate,
    y_validate,
    X_test,
    y_test,
) = wrangle_prep()

In [None]:
# Mean tm_v2 score for each player
tm_v2_scores = df.groupby('player')['tm_v2'].mean()
# Spread and centre of the per-player means
stddev = tm_v2_scores.std()
meanscore = tm_v2_scores.mean()
# Elite cutoff: two standard deviations above the mean
elites = meanscore + 2 * stddev
# Names of the players whose mean score is above the cutoff
elites_list = tm_v2_scores.loc[tm_v2_scores > elites].index

In [None]:
# Keep only the rows of df that belong to elite players. The elite names are
# derived from tm_v2_scores and the cutoff instead of being read back from
# `elites_list`: this cell rebinds `elites_list` to a DataFrame, so reading it
# here would filter on column names if the cell were run a second time.
elites_list = df[df.player.isin(tm_v2_scores[tm_v2_scores > elites].index)]

In [None]:
# Build (player_id, player) pairs from rows that actually occur together.
# Calling .unique() on each column separately and zipping the results only
# lines up when ids and names are strictly one-to-one.
elite_pairs = elites_list[['player_id', 'player']].drop_duplicates()
player_id_list = elite_pairs['player_id'].to_numpy()
player_name_list = elite_pairs['player'].to_numpy()
elites_tuple = list(zip(player_id_list, player_name_list))

In [None]:
# Find, for every elite player, the model with the best validate accuracy
# among those that beat that player's own baseline.
best_model_frames = []
for elite_id, elite_name in elites_tuple:
    print(elite_name)
    # Keep the player's splits in loop-local names so the league-wide
    # df / X_train / ... variables are not overwritten on each iteration.
    # Positions 3..6 of the result are X_train, y_train, X_validate, y_validate.
    player_splits = tuple(wrangle_prep_player(elite_id))
    X_train_p, y_train_p, X_validate_p, y_validate_p = player_splits[3:7]
    # Only the baseline accuracy (second element) is needed here.
    player_baseline = baseline_model_maker(y_train_p, y_validate_p)[1]
    player_models = model_maker(
        X_train_p,
        y_train_p,
        X_validate_p,
        y_validate_p,
        ['abs_time','play_time','since_rest'],
        baseline_acc=player_baseline,
    )
    # .assign returns a new frame, so no column is written onto a slice.
    # The frame is empty when no model beats the baseline.
    top_model = (
        player_models[player_models.better_than_baseline == True]
        .sort_values('validate_accuracy', ascending=False)
        .head(1)
        .assign(baseline=player_baseline, player=elite_name)
    )
    best_model_frames.append(top_model)
# Concatenate once at the end instead of growing a DataFrame inside the loop.
best_models = pd.concat(best_model_frames) if best_model_frames else pd.DataFrame()
best_models

In [None]:
# Index the summary by player name.
# NOTE(review): not re-runnable — after the first run 'player' is the index, not a column.
best_models = best_models.set_index('player')

In [None]:
# Narrow the summary to the reporting columns, in display order.
best_models = best_models.loc[
    :,
    ['model','attributes','baseline','train_accuracy','validate_accuracy'],
]
best_models

Note: LeBron James had no models that performed better than baseline.

#### Altogether - Elites

In [None]:
# Reload the league-wide frames and splits before calling the packaged helpers.
(
    df,
    df_outlier_3pt,
    X_train_exp,
    X_train,
    y_train,
    X_validate,
    y_validate,
    X_test,
    y_test,
) = wrangle_prep()

In [None]:
from modeling import best_model_elites

In [None]:
# Run the elite-player analysis through the modeling.py helper.
best_models = best_model_elites(
    df,
    df_outlier_3pt,
    X_train_exp,
    X_train,
    y_train,
    X_validate,
    y_validate,
    X_test,
    y_test,
)

#### Altogether - Keldon

In [13]:
from modeling import player_model

In [14]:
# Player to model; passed to player_model as its last argument.
player_name = 'Keldon Johnson'

In [15]:
# Best model for the named player, via the modeling.py helper.
best_model = player_model(
    df,
    X_train,
    y_train,
    X_validate,
    y_validate,
    player_name,
)

> Keldon Johnson
Train = 221 rows (56.0%) | Validate = 95 rows (24.0%) | Test = 80 rows (20.0%)


In [16]:
# Display the one-row summary returned by player_model.
best_model

Unnamed: 0_level_0,model,attributes,baseline,train_accuracy,validate_accuracy,validate_improvement_over_baseline
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Keldon Johnson,Random Forest Classifier,leafs = 2 : depth = 2 : trees = 300,0.597285,0.692308,0.663158,0.065873
