In [1]:
# Importing Modules and CSV

import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("shot_logs.csv")

In [3]:
# View df and remove uneccssary columns

df.head()

Unnamed: 0,GAME_ID,MATCHUP,LOCATION,W,FINAL_MARGIN,SHOT_NUMBER,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,...,SHOT_DIST,PTS_TYPE,SHOT_RESULT,CLOSEST_DEFENDER,CLOSEST_DEFENDER_PLAYER_ID,CLOSE_DEF_DIST,FGM,PTS,player_name,player_id
0,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,1,1,1:09,10.8,2,...,7.7,2,made,"Anderson, Alan",101187,1.3,1,2,brian roberts,203148
1,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,2,1,0:14,3.4,0,...,28.2,3,missed,"Bogdanovic, Bojan",202711,6.1,0,0,brian roberts,203148
2,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,3,1,0:00,,3,...,10.1,2,missed,"Bogdanovic, Bojan",202711,0.9,0,0,brian roberts,203148
3,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,4,2,11:47,10.3,2,...,17.2,2,missed,"Brown, Markel",203900,3.4,0,0,brian roberts,203148
4,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,5,2,10:34,10.9,2,...,3.7,2,missed,"Young, Thaddeus",201152,1.1,0,0,brian roberts,203148


In [4]:
#"Shot_Clock" is initially a object and it is converted to a float for use in predictions

df['SHOT_CLOCK'].astype(str).astype(float)

0         10.8
1          3.4
2          NaN
3         10.3
4         10.9
5          9.1
6         14.5
7          3.4
8         12.4
9         17.4
10        16.0
11        12.1
12         4.3
13         4.4
14         6.8
15         6.4
16        17.6
17         8.7
18        20.8
19        17.5
20        19.5
21         6.0
22        15.7
23        11.2
24         NaN
25        17.1
26        15.4
27        12.3
28        18.2
29        11.9
          ... 
128039     7.9
128040     8.8
128041    10.3
128042     4.0
128043    12.1
128044     8.4
128045     9.0
128046    19.3
128047     7.3
128048     9.0
128049    11.0
128050    22.7
128051     NaN
128052    19.3
128053     3.2
128054     4.1
128055    13.8
128056     7.1
128057     7.3
128058    14.3
128059    19.8
128060    11.4
128061    19.0
128062     7.0
128063    15.3
128064    18.3
128065    19.8
128066    23.0
128067     9.1
128068     NaN
Name: SHOT_CLOCK, Length: 128069, dtype: float64

In [5]:
df = df.drop(['MATCHUP', 'CLOSEST_DEFENDER', 'player_name', 'SHOT_RESULT', "GAME_CLOCK", "PTS"], axis=1)

In [6]:
#from pandas.plotting import scatter_matrix

#attributes = ["CLOSE_DEF_DIST", "SHOT_DIST", "FGM",
             # "SHOT_NUMBER", "DRIBBLES", "TOUCH_TIME"]
#scatter_matrix(df[attributes], figsize=(12, 8))

In [7]:
corr_matrix = df.corr()
corr_matrix["CLOSE_DEF_DIST"].sort_values(ascending=False)

CLOSE_DEF_DIST                1.000000
SHOT_DIST                     0.523192
PTS_TYPE                      0.414198
FINAL_MARGIN                  0.033064
SHOT_CLOCK                    0.026558
player_id                     0.008483
GAME_ID                       0.006739
FGM                          -0.001074
PERIOD                       -0.010204
CLOSEST_DEFENDER_PLAYER_ID   -0.015057
SHOT_NUMBER                  -0.037769
DRIBBLES                     -0.153674
TOUCH_TIME                   -0.164737
Name: CLOSE_DEF_DIST, dtype: float64

In [8]:
# Create category to split for Stratified Split

df["close_deg_dist_cat"] = np.ceil(df["CLOSE_DEF_DIST"] / 1.5)
df0=df["close_deg_dist_cat"].where(df["CLOSE_DEF_DIST"] < 5, 5, inplace=True)

In [9]:
df["close_deg_dist_cat"].value_counts()

5.0    37681
2.0    35542
3.0    32533
1.0    13659
4.0     7598
0.0     1056
Name: close_deg_dist_cat, dtype: int64

In [10]:
#Split Training and Testing on 'close_deg_dist_cat'

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=11)
for train_index, test_index in split.split(df, df["close_deg_dist_cat"]):
    strat_train_set = df.loc[train_index]
    strat_test_set = df.loc[test_index]

In [11]:
strat_train_set["close_deg_dist_cat"].value_counts()

5.0    30145
2.0    28434
3.0    26026
1.0    10927
4.0     6078
0.0      845
Name: close_deg_dist_cat, dtype: int64

In [12]:
strat_test_set["close_deg_dist_cat"].value_counts()

5.0    7536
2.0    7108
3.0    6507
1.0    2732
4.0    1520
0.0     211
Name: close_deg_dist_cat, dtype: int64

In [13]:
#Drop added cat

for set_ in (strat_train_set, strat_test_set):
    set_.drop("close_deg_dist_cat", axis=1, inplace=True)

In [14]:
#Drop Predicted column

df = strat_train_set.drop("FGM", axis=1)
df_labels = strat_train_set["FGM"].copy()

In [15]:
#Create Pipleine for preprossesing

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

In [16]:
try:
    from sklearn.compose import ColumnTransformer
except ImportError:
    from future_encoders import ColumnTransformer

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102455 entries, 574 to 66500
Data columns (total 14 columns):
GAME_ID                       102455 non-null int64
LOCATION                      102455 non-null object
W                             102455 non-null object
FINAL_MARGIN                  102455 non-null int64
SHOT_NUMBER                   102455 non-null int64
PERIOD                        102455 non-null int64
SHOT_CLOCK                    98006 non-null float64
DRIBBLES                      102455 non-null int64
TOUCH_TIME                    102455 non-null float64
SHOT_DIST                     102455 non-null float64
PTS_TYPE                      102455 non-null int64
CLOSEST_DEFENDER_PLAYER_ID    102455 non-null int64
CLOSE_DEF_DIST                102455 non-null float64
player_id                     102455 non-null int64
dtypes: float64(4), int64(8), object(2)
memory usage: 11.7+ MB


In [18]:
#Convert Catagorical data using One Hot Encoder and prepare numerical data

from sklearn.preprocessing import OneHotEncoder

attribs = list(df)
cat_attribs = ["LOCATION", "W" ]
num_attribs = [x for x in attribs if x not in cat_attribs]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

df_prepared = full_pipeline.fit_transform(df)

In [19]:
df_prepared

array([[ 1.45438958, -0.84983223,  0.31563725, ...,  0.        ,
         1.        ,  0.        ],
       [-1.51135133, -0.69866914,  0.52778556, ...,  1.        ,
         1.        ,  0.        ],
       [-0.05367998, -1.75681075,  0.10348895, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.87168172, -0.32076142,  0.31563725, ...,  1.        ,
         1.        ,  0.        ],
       [ 1.55906279, -0.84983223, -0.10865936, ...,  0.        ,
         1.        ,  0.        ],
       [-0.44523532, -0.16959833,  0.31563725, ...,  1.        ,
         1.        ,  0.        ]])

In [20]:
#Test data on different models

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(df_prepared, df_labels)

df_predictions = lin_reg.predict(df_prepared)
lin_mse = mean_squared_error(df_labels, df_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.4830680826293779

In [21]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=11)
tree_reg.fit(df_prepared, df_labels)

df_predictions = tree_reg.predict(df_prepared)
tree_mse = mean_squared_error(df_labels, df_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

In [22]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=10, random_state=11)
forest_reg.fit(df_prepared, df_labels)

df_predictions = forest_reg.predict(df_prepared)
forest_mse = mean_squared_error(df_labels, df_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

0.21257454261293324

In [23]:
#create scoring method print for models

from sklearn.model_selection import cross_val_score
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [24]:
tree_scores = cross_val_score(tree_reg, df_prepared, df_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-tree_scores)
display_scores(tree_scores)

Scores: [-0.45998438 -0.45891079 -0.45325005 -0.4582276  -0.45949639 -0.46520254
 -0.46090776 -0.46900927 -0.45954124 -0.46188385]
Mean: -0.4606413873185236
Standard deviation: 0.003985554662780052


In [25]:
lin_scores = cross_val_score(lin_reg, df_prepared, df_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [0.4813493  0.48177983 0.48340235 0.48647565 0.48082738 0.48266153
 0.48274531 0.48655399 0.48346631 0.48245695]
Mean: 0.48317186031925397
Standard deviation: 0.001849411076293442


In [26]:
forest_scores = cross_val_score(forest_reg, df_prepared, df_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [0.50103348 0.50076654 0.50745371 0.50509796 0.50187527 0.50665941
 0.50683662 0.50678943 0.50498346 0.50209809]
Mean: 0.5043593974616274
Standard deviation: 0.002511896733632229


In [27]:
#Perform Grid Search on Random Forest

from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=11)
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(df_prepared, df_labels)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=11, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [28]:
#Perform Random Search on Random Forest

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=1, high=100),
        'max_features': randint(low=1, high=6),
    }

forest_reg = RandomForestRegressor(random_state=11)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=11)
rnd_search.fit(df_prepared, df_labels)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=11, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1185d2898>, 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1185d2da0>},
          pre_dispatch='2*n_jobs', random_state=11, refit=True,
          return_train_score='warn', scoring='neg_mean_squared_error',
          verbose=0)

In [29]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

0.48425146916415207 {'max_features': 2, 'n_estimators': 64}
0.4843194248245921 {'max_features': 1, 'n_estimators': 92}
0.4846810666829771 {'max_features': 2, 'n_estimators': 56}
0.48735155667641045 {'max_features': 5, 'n_estimators': 34}
0.48979234369716906 {'max_features': 3, 'n_estimators': 25}
0.4852221120671114 {'max_features': 5, 'n_estimators': 49}
0.48670412618948417 {'max_features': 1, 'n_estimators': 46}
0.5262808646385508 {'max_features': 5, 'n_estimators': 5}
0.49828473155938824 {'max_features': 3, 'n_estimators': 13}
0.4837046014576701 {'max_features': 2, 'n_estimators': 75}


In [30]:
#Determine final model and best_estimator

final_model1 = grid_search.best_estimator_
final_model2 = rnd_search.best_estimator_

X_test = strat_test_set.drop("FGM", axis=1)
y_test = strat_test_set["FGM"].copy()

X_test_prepared = full_pipeline.transform(X_test)
final_predictions1 = final_model1.predict(X_test_prepared)
final_predictions2 = final_model2.predict(X_test_prepared)

final_mse1 = mean_squared_error(y_test, final_predictions1)
final_rmse1 = np.sqrt(final_mse1)
print(final_rmse1)

final_mse2 = mean_squared_error(y_test, final_predictions2)
final_rmse2 = np.sqrt(final_mse2)
print(final_rmse2)

0.4897115213868788
0.4840253910665643


In [31]:
final_model1 = rnd_search.best_estimator_

X_test = strat_test_set.drop("FGM", axis=1)
y_test = strat_test_set["FGM"].copy()

X_test_prepared = full_pipeline.transform(X_test)
final_predictions1 = final_model1.predict(X_test_prepared)

final_mse1 = mean_squared_error(y_test, final_predictions1)
final_rmse1 = np.sqrt(final_mse1)
print(final_rmse1)

0.4840253910665643


In [32]:
from sklearn.externals import joblib
joblib.dump(final_model1, "Shot_Made_v2.pkl")

['Shot_Made_v2.pkl']