## Using resampling methods

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

## Read in CSV and perform basic cleaning

In [3]:
file_dir = '/Users/devingreenzang/desktop/challenges/Final/NBA_Final-Project/'
shot_log = pd.read_csv(f'{file_dir}shot_logs.csv')

In [4]:
file_dir = '/Users/devingreenzang/desktop/challenges/Final/NBA_Final-Project/'
players_log = pd.read_csv(f'{file_dir}players_stats.csv')

In [5]:
# Extracting date from MATCHUP column
shot_log['DATE'] = shot_log['MATCHUP'].str.split('-', expand = True)[0]

# Converting to datetime object
shot_log['DATE'] = pd.to_datetime(shot_log['DATE'])

# Turning gametime into datetime object
shot_log['GAME_CLOCK'] = pd.to_datetime(shot_log['GAME_CLOCK'], format = '%M:%S').dt.time 

# Adding date and time columns together
shot_log['TIMESTAMP'] = shot_log['DATE'].astype(str) + ' ' + shot_log['GAME_CLOCK'].astype(str)

# Converting to datetime object
shot_log['TIMESTAMP'] = pd.to_datetime(shot_log['TIMESTAMP'])

In [6]:
#converting the first letter in each string to uppercase 
shot_log['player_name'] = shot_log['player_name'].str.title()

In [7]:
shot_log.dtypes
#shot_log.drop(['DATE','TIMESTAMP'],axis =1)
shot_log = shot_log.dropna()

In [8]:
shot_log.dtypes
shot_log = shot_log.drop(['DATE','TIMESTAMP'],axis =1)

In [9]:
players_log.dtypes

Name             object
Games Played      int64
MIN               int64
PTS               int64
FGM               int64
FGA               int64
FG%             float64
3PM               int64
3PA               int64
3P%             float64
FTM               int64
FTA               int64
FT%             float64
OREB              int64
DREB              int64
REB               int64
AST               int64
STL               int64
BLK               int64
TOV               int64
PF                int64
EFF               int64
AST/TOV         float64
STL/TOV         float64
Age             float64
Birth_Place      object
Birthdate        object
Collage          object
Experience       object
Height          float64
Pos              object
Team             object
Weight          float64
BMI             float64
dtype: object

In [10]:
merged_df=pd.merge(shot_log, players_log, how='left', left_on='player_name', right_on='Name')
display(merged_df)

Unnamed: 0,GAME_ID,MATCHUP,LOCATION,W,FINAL_MARGIN,SHOT_NUMBER,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,...,Age,Birth_Place,Birthdate,Collage,Experience,Height,Pos,Team,Weight,BMI
0,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,1,1,00:01:09,10.8,2,...,30.0,us,"December 3, 1985",University of Dayton,2,182.5,PG,CHA,77.85,23.373991
1,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,2,1,00:00:14,3.4,0,...,30.0,us,"December 3, 1985",University of Dayton,2,182.5,PG,CHA,77.85,23.373991
2,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,4,2,00:11:47,10.3,2,...,30.0,us,"December 3, 1985",University of Dayton,2,182.5,PG,CHA,77.85,23.373991
3,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,5,2,00:10:34,10.9,2,...,30.0,us,"December 3, 1985",University of Dayton,2,182.5,PG,CHA,77.85,23.373991
4,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,6,2,00:08:15,9.1,2,...,30.0,us,"December 3, 1985",University of Dayton,2,182.5,PG,CHA,77.85,23.373991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122497,21400006,"OCT 29, 2014 - BKN @ BOS",A,L,-16,4,2,00:05:05,15.3,2,...,32.0,us,"October 28, 1983",Georgia Institute of Technology,9,187.5,PG,NJN,90.00,25.600000
122498,21400006,"OCT 29, 2014 - BKN @ BOS",A,L,-16,5,3,00:01:52,18.3,5,...,32.0,us,"October 28, 1983",Georgia Institute of Technology,9,187.5,PG,NJN,90.00,25.600000
122499,21400006,"OCT 29, 2014 - BKN @ BOS",A,L,-16,6,4,00:11:28,19.8,4,...,32.0,us,"October 28, 1983",Georgia Institute of Technology,9,187.5,PG,NJN,90.00,25.600000
122500,21400006,"OCT 29, 2014 - BKN @ BOS",A,L,-16,7,4,00:11:10,23.0,2,...,32.0,us,"October 28, 1983",Georgia Institute of Technology,9,187.5,PG,NJN,90.00,25.600000


In [17]:
merged_df = merged_df.dropna()

In [27]:
merged_df.dtypes

GAME_ID                         int64
MATCHUP                        object
LOCATION                       object
W                              object
FINAL_MARGIN                    int64
SHOT_NUMBER                     int64
PERIOD                          int64
GAME_CLOCK                     object
SHOT_CLOCK                    float64
DRIBBLES                        int64
TOUCH_TIME                    float64
SHOT_DIST                     float64
PTS_TYPE                        int64
SHOT_RESULT                    object
CLOSEST_DEFENDER               object
CLOSEST_DEFENDER_PLAYER_ID      int64
CLOSE_DEF_DIST                float64
FGM_x                           int64
PTS_x                           int64
player_name                    object
player_id                       int64
Name                           object
Games Played                  float64
MIN                           float64
PTS_y                         float64
FGM_y                         float64
FGA         

In [29]:
merged_df = merged_df.drop(['GAME_ID','AST/TOV','STL/TOV','Birth_Place','Birthdate'],axis =1)
merged_df.dtypes

MATCHUP                        object
LOCATION                       object
W                              object
FINAL_MARGIN                    int64
SHOT_NUMBER                     int64
PERIOD                          int64
GAME_CLOCK                     object
SHOT_CLOCK                    float64
DRIBBLES                        int64
TOUCH_TIME                    float64
SHOT_DIST                     float64
PTS_TYPE                        int64
SHOT_RESULT                    object
CLOSEST_DEFENDER               object
CLOSEST_DEFENDER_PLAYER_ID      int64
CLOSE_DEF_DIST                float64
FGM_x                           int64
PTS_x                           int64
player_name                    object
player_id                       int64
Name                           object
Games Played                  float64
MIN                           float64
PTS_y                         float64
FGM_y                         float64
FGA                           float64
FG%         

## Split into test, training group -- looking to see if shot is missed or made

In [30]:
# Create our features
X = merged_df.drop("SHOT_RESULT", axis=1)

X = pd.get_dummies(X)

# Create our target
y = merged_df["SHOT_RESULT"]

In [31]:
X.describe()

Unnamed: 0,FINAL_MARGIN,SHOT_NUMBER,PERIOD,SHOT_CLOCK,DRIBBLES,TOUCH_TIME,SHOT_DIST,PTS_TYPE,CLOSEST_DEFENDER_PLAYER_ID,CLOSE_DEF_DIST,...,Team_OKC,Team_ORL,Team_PHI,Team_PHO,Team_POR,Team_SAC,Team_SAS,Team_TOR,Team_UTA,Team_WAS
count,77636.0,77636.0,77636.0,77636.0,77636.0,77636.0,77636.0,77636.0,77636.0,77636.0,...,77636.0,77636.0,77636.0,77636.0,77636.0,77636.0,77636.0,77636.0,77636.0,77636.0
mean,0.432531,6.588709,2.477176,12.446296,2.110786,2.827128,13.754718,2.274718,159058.705858,4.143936,...,0.025658,0.044786,0.017299,0.040703,0.039325,0.023739,0.030321,0.029677,0.033013,0.030373
std,13.128287,4.650583,1.137874,5.745501,3.514879,3.030222,8.799755,0.446375,78551.119303,2.762429,...,0.158114,0.206835,0.130383,0.197602,0.194367,0.152236,0.17147,0.169696,0.178672,0.171611
min,-53.0,1.0,1.0,0.0,0.0,-100.5,0.0,2.0,708.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-8.0,3.0,1.0,8.2,0.0,0.9,4.9,2.0,101249.0,2.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,6.0,2.0,12.4,1.0,1.6,14.3,2.0,201949.0,3.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,9.0,9.0,3.0,16.7,3.0,3.8,22.7,3.0,203077.0,5.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,53.0,36.0,7.0,24.0,32.0,24.9,43.5,3.0,530027.0,52.9,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [32]:
# Check the balance of our target values
y.value_counts()

missed    42479
made      35157
Name: SHOT_RESULT, dtype: int64

In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train
#y_train

Unnamed: 0,FINAL_MARGIN,SHOT_NUMBER,PERIOD,SHOT_CLOCK,DRIBBLES,TOUCH_TIME,SHOT_DIST,PTS_TYPE,CLOSEST_DEFENDER_PLAYER_ID,CLOSE_DEF_DIST,...,Team_OKC,Team_ORL,Team_PHI,Team_PHO,Team_POR,Team_SAC,Team_SAS,Team_TOR,Team_UTA,Team_WAS
11118,-4,11,3,19.3,1,1.2,2.6,2,2734,2.0,...,0,0,0,0,0,0,0,0,0,0
31552,17,8,3,15.6,0,0.9,22.5,3,203463,7.3,...,0,0,0,0,0,0,0,1,0,0
65257,8,4,2,11.9,3,3.3,3.1,2,708,2.4,...,0,0,0,0,0,0,0,0,0,0
40464,-32,13,4,18.4,2,2.1,2.9,2,202684,1.8,...,0,1,0,0,0,0,0,0,0,0
81501,3,2,1,22.1,2,2.9,3.0,2,203077,2.1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28968,17,20,4,12.7,0,2.5,17.9,2,202696,4.8,...,0,0,0,0,0,0,0,0,0,0
30150,5,9,2,11.4,13,14.4,16.4,2,203112,3.3,...,0,0,0,0,0,0,0,1,0,0
115741,-26,3,1,17.2,0,1.1,1.4,2,2440,2.8,...,0,0,0,0,0,0,0,0,0,0
76510,12,7,3,18.6,0,-7.2,2.3,2,202693,2.5,...,0,0,0,0,0,0,0,0,0,0


## Oversampling methods

In [34]:
# Resample the training data with the RandomOversampler
#from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'missed': 31891, 'made': 31891})

In [35]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [36]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5399996475252008

In [37]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[4513, 4308],
       [4570, 6018]])

In [38]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       made       0.50      0.51      0.57      0.50      0.54      0.29      8821
     missed       0.58      0.57      0.51      0.58      0.54      0.29     10588

avg / total       0.54      0.54      0.54      0.54      0.54      0.29     19409



In [None]:
# How to quantify the teams that the player is on and who they are playing
# Stats of each team and ranking them
# If the player is on a good or bad offensive team
# adding 4 columns to our current data set
# adding the team ranking into a bin, team offensive rating, team defensive rating, team pace
# adding player effienciency

Add the other data set containing the sports csv that had all team stats, data needs to be parsed.