## Using resampling methods

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

## Read in CSV and perform basic cleaning

In [None]:
file_dir = '/Users/devingreenzang/desktop/challenges/Final/NBA_Final-Project/'
shot_log = pd.read_csv(f'{file_dir}shot_logs.csv')

In [None]:
file_dir = '/Users/devingreenzang/desktop/challenges/Final/NBA_Final-Project/'
players_log = pd.read_csv(f'{file_dir}players_stats.csv')

In [None]:
# Extracting date from MATCHUP column
shot_log['DATE'] = shot_log['MATCHUP'].str.split('-', expand = True)[0]

# Converting to datetime object
shot_log['DATE'] = pd.to_datetime(shot_log['DATE'])

# Turning gametime into datetime object
shot_log['GAME_CLOCK'] = pd.to_datetime(shot_log['GAME_CLOCK'], format = '%M:%S').dt.time 

# Adding date and time columns together
shot_log['TIMESTAMP'] = shot_log['DATE'].astype(str) + ' ' + shot_log['GAME_CLOCK'].astype(str)

# Converting to datetime object
shot_log['TIMESTAMP'] = pd.to_datetime(shot_log['TIMESTAMP'])

In [None]:
#converting the first letter in each string to uppercase 
shot_log['player_name'] = shot_log['player_name'].str.title()

In [None]:
shot_log.dtypes
#shot_log.drop(['DATE','TIMESTAMP'],axis =1)
shot_log = shot_log.dropna()

In [None]:
shot_log.dtypes
shot_log = shot_log.drop(['DATE','TIMESTAMP'],axis =1)

In [None]:
players_log.dtypes

In [None]:
merged_df=pd.merge(shot_log, players_log, how='left', left_on='player_name', right_on='Name')
display(merged_df)

In [None]:
merged_df = merged_df.dropna()

In [None]:
merged_df.dtypes

In [None]:
merged_df = merged_df.drop(['GAME_ID','AST/TOV','STL/TOV','Birth_Place','Birthdate'],axis =1)
merged_df.dtypes

## Below testing friday 18th of december with merged team data/defender heights

In [3]:
file_dir = '/Users/devingreenzang/desktop/challenges/Final/NBA_Final-Project/'
team_merge = pd.read_csv(f'{file_dir}team_merge.csv')

In [4]:
team_merge.columns

Index(['MATCHUP', 'SHOT_NUMBER', 'PERIOD', 'GAME_CLOCK', 'SHOT_DIST',
       'SHOT_RESULT', 'CLOSEST_DEFENDER', 'CLOSEST_DEFENDER_PLAYER_ID',
       'CLOSE_DEF_DIST', 'player_name', 'player_id', 'DATE', 'TIMESTAMP',
       'Name (Shooter)', 'Games Played', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%',
       '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'EFF', 'AST/TOV', 'STL/TOV', 'Age_x',
       'Birth_Place', 'Birthdate', 'Collage', 'Experience', 'Height (Shooter)',
       'Pos', 'Team', 'Weight', 'BMI', 'CD Last Name', 'CD First Name',
       'CD NAME', 'Name (CD)', 'Height (CD)', 'Rk', 'Age_y', 'W', 'L', 'PW',
       'PL', 'MOV', 'SOS', 'SRS', 'ORtg', 'DRtg', 'NRtg', 'Pace', 'FTr',
       '3PAr', 'TS%', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'eFG%.1', 'TOV%.1',
       'DRB%', 'FT/FGA.1', 'Arena', 'Attend.', 'Attend./G'],
      dtype='object')

In [5]:
team_merge = team_merge.dropna()
team_merge.columns

Index(['MATCHUP', 'SHOT_NUMBER', 'PERIOD', 'GAME_CLOCK', 'SHOT_DIST',
       'SHOT_RESULT', 'CLOSEST_DEFENDER', 'CLOSEST_DEFENDER_PLAYER_ID',
       'CLOSE_DEF_DIST', 'player_name', 'player_id', 'DATE', 'TIMESTAMP',
       'Name (Shooter)', 'Games Played', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%',
       '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'EFF', 'AST/TOV', 'STL/TOV', 'Age_x',
       'Birth_Place', 'Birthdate', 'Collage', 'Experience', 'Height (Shooter)',
       'Pos', 'Team', 'Weight', 'BMI', 'CD Last Name', 'CD First Name',
       'CD NAME', 'Name (CD)', 'Height (CD)', 'Rk', 'Age_y', 'W', 'L', 'PW',
       'PL', 'MOV', 'SOS', 'SRS', 'ORtg', 'DRtg', 'NRtg', 'Pace', 'FTr',
       '3PAr', 'TS%', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'eFG%.1', 'TOV%.1',
       'DRB%', 'FT/FGA.1', 'Arena', 'Attend.', 'Attend./G'],
      dtype='object')

In [6]:
team_merge = team_merge.drop(['DATE','TIMESTAMP','Arena','Attend./G'],axis=1)

## Split into test, training group -- looking to see if shot is missed or made

In [7]:
# Create our features
X = team_merge.drop("SHOT_RESULT", axis=1)

X = pd.get_dummies(X)

# Create our target
y = team_merge["SHOT_RESULT"]

In [8]:
X.describe()

Unnamed: 0,SHOT_NUMBER,PERIOD,SHOT_DIST,CLOSEST_DEFENDER_PLAYER_ID,CLOSE_DEF_DIST,player_id,Games Played,MIN,PTS,FGM,...,"Attend._772,059","Attend._785,892","Attend._798,368","Attend._803,436","Attend._808,223","Attend._809,824","Attend._812,292","Attend._827,702","Attend._843,042","Attend._886,612"
count,2177.0,2177.0,2177.0,2177.0,2177.0,2177.0,2177.0,2177.0,2177.0,2177.0,...,2177.0,2177.0,2177.0,2177.0,2177.0,2177.0,2177.0,2177.0,2177.0,2177.0
mean,7.764814,4.0,14.105099,158700.677079,4.134175,155203.97198,69.813045,1751.937069,758.202113,280.994028,...,0.02848,0.06339,0.054662,0.052825,0.032614,0.039504,0.017915,0.021589,0.030776,0.050988
std,4.140878,0.0,8.638227,78374.736262,2.779716,82990.791225,10.374001,567.108592,363.327861,129.77799,...,0.166377,0.243719,0.227372,0.223735,0.177664,0.194835,0.132671,0.145372,0.172751,0.220023
min,1.0,4.0,0.1,708.0,0.0,1495.0,36.0,363.0,80.0,30.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.0,4.0,5.4,101187.0,2.3,101138.0,64.0,1274.0,470.0,182.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,7.0,4.0,15.4,201965.0,3.7,201961.0,72.0,1726.0,764.0,283.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,10.0,4.0,22.8,203088.0,5.2,202710.0,77.0,2260.0,973.0,358.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,26.0,4.0,35.5,204079.0,46.8,203935.0,82.0,2981.0,2217.0,653.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Check the balance of our target values
y.value_counts()

missed    1215
made       962
Name: SHOT_RESULT, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train
#y_train

Unnamed: 0,SHOT_NUMBER,PERIOD,SHOT_DIST,CLOSEST_DEFENDER_PLAYER_ID,CLOSE_DEF_DIST,player_id,Games Played,MIN,PTS,FGM,...,"Attend._772,059","Attend._785,892","Attend._798,368","Attend._803,436","Attend._808,223","Attend._809,824","Attend._812,292","Attend._827,702","Attend._843,042","Attend._886,612"
3907,8,4,3.3,101183,0.7,201196,64.0,1188.0,402.0,125.0,...,0,0,0,0,0,0,0,0,0,0
3514,5,4,5.8,201145,0.4,2210,74.0,1244.0,432.0,144.0,...,0,0,0,0,0,0,0,1,0,0
3879,9,4,8.0,101107,2.1,2743,64.0,1345.0,509.0,211.0,...,0,0,0,0,0,0,0,0,0,0
3423,13,4,0.4,2594,0.0,203504,76.0,2288.0,973.0,370.0,...,1,0,0,0,0,0,0,0,0,0
4030,9,4,5.3,202683,1.8,201585,81.0,1348.0,419.0,182.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2496,15,4,9.3,201171,1.6,202693,82.0,2581.0,1258.0,512.0,...,0,0,0,0,0,0,0,0,0,0
2399,8,4,14.7,202338,4.6,101138,82.0,1929.0,866.0,344.0,...,0,0,0,0,0,0,0,0,0,0
2717,4,4,21.2,202708,4.4,203917,73.0,1127.0,319.0,108.0,...,0,0,0,0,0,0,0,0,0,0
338,3,4,24.7,203521,6.8,2594,75.0,2418.0,911.0,292.0,...,0,0,0,0,0,0,0,0,0,0


## Oversampling methods

In [11]:
# Resample the training data with the RandomOversampler
#from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'made': 919, 'missed': 919})

In [12]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [13]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5468699120807554

In [14]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[147, 102],
       [147, 149]])

In [15]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       made       0.50      0.59      0.50      0.54      0.55      0.30       249
     missed       0.59      0.50      0.59      0.54      0.55      0.29       296

avg / total       0.55      0.54      0.55      0.54      0.55      0.30       545



In [None]:
# How to quantify the teams that the player is on and who they are playing
# Stats of each team and ranking them
# If the player is on a good or bad offensive team
# adding 4 columns to our current data set
# adding the team ranking into a bin, team offensive rating, team defensive rating, team pace
# adding player effienciency

Add the other data set containing the sports csv that had all team stats, data needs to be parsed.