### Merge Shot locations and play by play

In [10]:
import pandas as pd
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

# import files to merge
shot_locations = pd.read_csv("../data/processed/Shot_Locations_top_20_players_2000to2020.csv")
play_by_play = pd.read_csv("../data/processed/PlaybyPlay_2000-2020-v4.csv")


In [11]:
# merge
df_merged = shot_locations.merge(play_by_play, how='outer', left_on=['Game ID','Game Event ID'], right_on=['GAME_ID','EVENTNUM'])

# save as csv
df_merged.to_csv('../data/processed/PlaybyPlay_ShotLocations.csv')

In [3]:
print("Play by play shape:", play_by_play.shape)
print("Play by play games:", play_by_play['GAME_ID'].nunique())
print("Shot locations shape:", shot_locations.shape)
print("Shot locations games:", shot_locations['Game ID'].nunique())
print("Merged shaped:", df_merged.shape)

Play by play shape: (481976, 22)
Play by play games: 13929
Shot locations shape: (334331, 33)
Shot locations games: 13927
Merged shaped: (481976, 55)


In [4]:
# check if observations are missing on the play by play part
len(df_merged[df_merged['GAME_ID'].isna()])

0

In [5]:
# check if observations are missing on the shot location part
len(df_merged[df_merged['Game ID'].isna()])

138565

In [6]:
df_merged[df_merged['Game ID'].isna()].free_throw.value_counts(normalize=True)

free_throw
1.0    0.999459
0.0    0.000541
Name: proportion, dtype: float64

In [7]:
# > 99.5% of missing observations are free throws, we can try to add the shot locations manually

### Merge players stats per year

In [15]:
df_sl_pbp = pd.read_csv("../data/processed/PlaybyPlay_ShotLocations.csv", index_col=0)
df_players = pd.read_csv("../data/processed/stat_joueurs_streamlit.csv", index_col=0)

In [16]:
# Fill Year feature with data from complete lines 
games = df_sl_pbp[['GAME_ID', 'Year']].drop_duplicates().dropna()

for _, game in games.iterrows():
    df_sl_pbp.loc[df_sl_pbp.GAME_ID==game['GAME_ID'], 'Year']=game['Year']

In [17]:
df_sl_pbp.Year.isna().sum()

4

In [18]:
df_sl_pbp.head(2)

Unnamed: 0,Game ID,Game Event ID,Player ID,Player Name,Team ID,Team Name,Period,Shot Distance,X Location,Y Location,Shot Made Flag,Home Team,Away Team,Season Type,Year,Shot Zone Basic_Above the Break 3,Shot Zone Basic_Backcourt,Shot Zone Basic_In The Paint (Non-RA),Shot Zone Basic_Left Corner 3,Shot Zone Basic_Mid-Range,Shot Zone Basic_Restricted Area,Shot Zone Basic_Right Corner 3,Shot Zone Area_Back Court(BC),Shot Zone Area_Center(C),Shot Zone Area_Left Side Center(LC),Shot Zone Area_Left Side(L),Shot Zone Area_Right Side Center(RC),Shot Zone Area_Right Side(R),Shot Zone Range_16-24 ft.,Shot Zone Range_24+ ft.,Shot Zone Range_8-16 ft.,Shot Zone Range_Back Court Shot,Shot Zone Range_Less Than 8 ft.,GAME_ID,EVENTNUM,target,PERIOD,PLAYER1_NAME,PLAYER1_TEAM_ABBREVIATION,OPPONENT_TEAM,at_home,PREVIOUS_OFF_REBOUND,PREVIOUS_DEF_REBOUND,PREVIOUS_OFF_TURNOVER,PREVIOUS_OFF_MISSED,PREVIOUS_EVENTMSGTYPE,3PT,jump_shot,layup_shot,dunk_shot,hook_shot,free_throw,DETAILLED_SHOT_TYPE,minutes_left,seconds_left
0,20000001.0,11.0,947.0,Allen Iverson,1610613000.0,Philadelphia 76ers,1.0,19.0,-107.0,167.0,0.0,NYK,PHI,0.0,2000.0,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,20000001,11,0,1,Allen Iverson,PHI,NYK,0.0,False,False,False,False,FREE_THROW,0.0,1.0,0.0,0.0,0.0,0.0,JUMP SHOT,11,668
1,20000001.0,32.0,947.0,Allen Iverson,1610613000.0,Philadelphia 76ers,1.0,21.0,-115.0,177.0,1.0,NYK,PHI,0.0,2000.0,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,20000001,32,1,1,Allen Iverson,PHI,NYK,0.0,False,False,False,False,FIELD_GOAL_MADE,0.0,1.0,0.0,0.0,0.0,0.0,JUMP SHOT,7,477


In [19]:
# Check if all 20 players are there
df_players.groupby('Player')['Year'].count()

Player
Allen Iverson            11
Chris Webber              9
Dirk Nowitzki            20
Dwight Howard            16
Dwyane Wade              16
Giannis Antetokounmpo     7
James Harden             11
Jason Kidd               14
Jimmy Butler              9
Kawhi Leonard             9
Kevin Durant             12
Kevin Garnett            17
Kobe Bryant              17
LeBron James             17
Paul Pierce              18
Russell Westbrook        12
Shaquille O'Neal         12
Stephen Curry            11
Steve Nash               15
Tim Duncan               17
Name: Year, dtype: int64

In [12]:
df_sl_pbp.shape

(481976, 55)

In [None]:
# merge everything
data = df_sl_pbp.merge(df_players, how='left', left_on=['PLAYER1_NAME', 'Year'], right_on=['Player', 'Year'])

In [14]:
# drop duplicated columns
data = data.drop(['Game ID', 'Game Event ID', 'Player Name', 'Player', 'Team Name', 'Period', 'Player ID', 'Team ID' ], axis = 1)

### Deal with NAs

In [15]:
data.columns

Index(['Shot Distance', 'X Location', 'Y Location', 'Shot Made Flag',
       'Home Team', 'Away Team', 'Season Type', 'Year',
       'Shot Zone Basic_Above the Break 3', 'Shot Zone Basic_Backcourt',
       'Shot Zone Basic_In The Paint (Non-RA)',
       'Shot Zone Basic_Left Corner 3', 'Shot Zone Basic_Mid-Range',
       'Shot Zone Basic_Restricted Area', 'Shot Zone Basic_Right Corner 3',
       'Shot Zone Area_Back Court(BC)', 'Shot Zone Area_Center(C)',
       'Shot Zone Area_Left Side Center(LC)', 'Shot Zone Area_Left Side(L)',
       'Shot Zone Area_Right Side Center(RC)', 'Shot Zone Area_Right Side(R)',
       'Shot Zone Range_16-24 ft.', 'Shot Zone Range_24+ ft.',
       'Shot Zone Range_8-16 ft.', 'Shot Zone Range_Back Court Shot',
       'Shot Zone Range_Less Than 8 ft.', 'GAME_ID', 'EVENTNUM', 'target',
       'PERIOD', 'PLAYER1_NAME', 'PLAYER1_TEAM_ABBREVIATION', 'OPPONENT_TEAM',
       'at_home', 'PREVIOUS_OFF_REBOUND', 'PREVIOUS_DEF_REBOUND',
       'PREVIOUS_OFF_TURNOVER',

In [16]:
data.isna().sum()

Shot Distance                            138565
X Location                               138565
Y Location                               138565
Shot Made Flag                           138565
Home Team                                138565
Away Team                                138565
Season Type                              138565
Year                                          4
Shot Zone Basic_Above the Break 3        138565
Shot Zone Basic_Backcourt                138565
Shot Zone Basic_In The Paint (Non-RA)    138565
Shot Zone Basic_Left Corner 3            138565
Shot Zone Basic_Mid-Range                138565
Shot Zone Basic_Restricted Area          138565
Shot Zone Basic_Right Corner 3           138565
Shot Zone Area_Back Court(BC)            138565
Shot Zone Area_Center(C)                 138565
Shot Zone Area_Left Side Center(LC)      138565
Shot Zone Area_Left Side(L)              138565
Shot Zone Area_Right Side Center(RC)     138565
Shot Zone Area_Right Side(R)            

In [17]:
# update shot location for all free throws 
data.loc[data['free_throw']==1, 'Shot Zone Basic_Mid-Range'] = True
data.loc[data['free_throw']==1, 'Shot Zone Area_Center(C)'] = True
data.loc[data['free_throw']==1, 'Shot Zone Range_8-16 ft.'] = True
data.loc[data['free_throw']==1, 'Shot Distance'] = 15.0
data.loc[data['free_throw']==1, 'X Location'] = 0
data.loc[data['free_throw']==1, 'Y Location'] = 150

In [18]:
data.fillna({'Shot Zone Basic_Above the Break 3':False}, inplace=True)
data.fillna({'Shot Zone Basic_Backcourt': False}, inplace=True)
data.fillna({'Shot Zone Basic_In The Paint (Non-RA)': False}, inplace=True)
data.fillna({'Shot Zone Basic_Left Corner 3' : False}, inplace=True)
data.fillna({'Shot Zone Basic_Mid-Range' : False}, inplace=True)
data.fillna({'Shot Zone Basic_Restricted Area' : False}, inplace=True)
data.fillna({'Shot Zone Basic_Right Corner 3' : False}, inplace=True)
data.fillna({'Shot Zone Area_Back Court(BC)' : False}, inplace=True)
data.fillna({'Shot Zone Area_Center(C)' : False}, inplace=True)
data.fillna({'Shot Zone Area_Left Side Center(LC)' : False}, inplace=True)
data.fillna({'Shot Zone Area_Left Side(L)' : False}, inplace=True)
data.fillna({'Shot Zone Area_Right Side Center(RC)' : False}, inplace=True)
data.fillna({'Shot Zone Area_Right Side(R)' : False}, inplace=True)
data.fillna({'Shot Zone Range_16-24 ft.' : False}, inplace=True)
data.fillna({'Shot Zone Range_24+ ft.' : False}, inplace=True)
data.fillna({'Shot Zone Range_8-16 ft.' : False}, inplace=True)
data.fillna({'Shot Zone Range_Back Court Shot' : False}, inplace=True)
data.fillna({'Shot Zone Range_Less Than 8 ft.' : False}, inplace=True)
data.fillna({"3P%":0}, inplace=True)


  data.fillna({'Shot Zone Basic_Above the Break 3':False}, inplace=True)
  data.fillna({'Shot Zone Basic_Backcourt': False}, inplace=True)
  data.fillna({'Shot Zone Basic_In The Paint (Non-RA)': False}, inplace=True)
  data.fillna({'Shot Zone Basic_Left Corner 3' : False}, inplace=True)
  data.fillna({'Shot Zone Basic_Mid-Range' : False}, inplace=True)
  data.fillna({'Shot Zone Basic_Restricted Area' : False}, inplace=True)
  data.fillna({'Shot Zone Basic_Right Corner 3' : False}, inplace=True)
  data.fillna({'Shot Zone Area_Back Court(BC)' : False}, inplace=True)
  data.fillna({'Shot Zone Area_Center(C)' : False}, inplace=True)
  data.fillna({'Shot Zone Area_Left Side Center(LC)' : False}, inplace=True)
  data.fillna({'Shot Zone Area_Left Side(L)' : False}, inplace=True)
  data.fillna({'Shot Zone Area_Right Side Center(RC)' : False}, inplace=True)
  data.fillna({'Shot Zone Area_Right Side(R)' : False}, inplace=True)
  data.fillna({'Shot Zone Range_16-24 ft.' : False}, inplace=True)
  

In [19]:
# drop Shot Made Flag : target is the same with no NAs
data.drop('Shot Made Flag', axis=1, inplace = True)

# drop Home Team : PLAYER1_TEAM_ABBREVIATION is the same with no NAs
data.drop('Home Team', axis=1, inplace = True)

In [20]:
# Fill Season Type and Away Team with data from complete lines
games = data[['GAME_ID', 'Away Team', 'Season Type']].drop_duplicates().dropna()

for _, game in games.iterrows():
    data.loc[data.GAME_ID==game['GAME_ID'], 'Away Team']=game['Away Team']
    data.loc[data.GAME_ID==game['GAME_ID'], 'Season Type']=game['Season Type']

In [21]:
# check missing years for some players
data[data.Age.isna()][['Year', 'PLAYER1_NAME']].drop_duplicates().dropna()

Unnamed: 0,Year,PLAYER1_NAME
159437,2008.0,Chris Webber
200728,2010.0,Allen Iverson
223742,2011.0,Shaquille O'Neal
261703,2013.0,Jason Kidd
287169,2014.0,Steve Nash
322360,2016.0,Kevin Garnett
322408,2016.0,Tim Duncan
322992,2016.0,Kobe Bryant
347118,2017.0,Paul Pierce
377287,2019.0,Dirk Nowitzki


In [22]:
data.describe()

Unnamed: 0,Shot Distance,X Location,Y Location,Season Type,Year,GAME_ID,EVENTNUM,target,PERIOD,at_home,3PT,jump_shot,layup_shot,dunk_shot,hook_shot,free_throw,minutes_left,seconds_left,Age,TS%,PTM,ASTM,ORBM,STLM,BLKM,TOVM,USG%,FG%,2P%,3P%,FT%,PTS,year_start,year_end,height,weight,C,PF,PG,PG-SG,SF,SF-SG,SG,SG-PG
count,481901.0,481901.0,481901.0,481972.0,481972.0,481976.0,481976.0,481976.0,481976.0,481976.0,481976.0,481976.0,481976.0,481976.0,481976.0,481976.0,481976.0,481976.0,470495.0,470495.0,470495.0,470495.0,470495.0,470495.0,470495.0,470495.0,470495.0,470495.0,470495.0,481976.0,470495.0,470495.0,470495.0,470495.0,470495.0,470495.0,470495.0,470495.0,470495.0,470495.0,470495.0,470495.0,470495.0,470495.0
mean,12.816798,-1.505058,102.242604,0.1353,2009.639369,23871950.0,260.629139,0.561833,2.506332,0.519729,0.146594,0.438455,0.159386,0.042523,0.022939,0.287338,4.975771,327.233698,28.39909,0.571727,23.038886,5.200773,1.462233,1.365051,0.891477,2.997611,28.934459,0.482006,0.513064,0.2914,0.784408,1639.148437,2002.08863,2016.258101,198.759269,230.435837,0.141234,0.199734,0.192927,0.004695,0.202534,0.00265,0.249935,0.006291
std,8.20941,85.037931,79.560224,0.342044,5.739282,7028466.0,162.014521,0.496162,1.145748,0.499611,0.353702,0.496198,0.366036,0.201779,0.149709,0.452521,3.387594,204.124417,4.329361,0.042106,5.835216,2.445007,0.913823,0.516588,0.682877,0.890281,4.952223,0.051545,0.051553,0.128459,0.107,551.483158,5.980087,2.699659,10.047883,31.298123,0.348263,0.399801,0.394596,0.06836,0.401888,0.051414,0.432975,0.079068
min,0.0,-250.0,-52.0,0.0,2000.0,20000000.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,0.456,3.210526,0.4,0.04,0.16,0.04878,0.352941,11.3,0.358,0.390667,0.0,0.417,35.0,1993.0,2008.0,182.88,165.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.0,-13.0,17.0,0.0,2005.0,20500940.0,115.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,153.0,25.0,0.545,18.830986,3.086957,0.772152,0.939394,0.375,2.4,25.5,0.449,0.482,0.266,0.738,1307.0,1997.0,2016.0,192.024,212.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,15.0,0.0,134.0,0.0,2010.0,21100250.0,261.0,1.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,317.0,28.0,0.574,23.953846,4.970588,1.169014,1.378049,0.710145,2.987805,29.6,0.476,0.506,0.327,0.811,1735.0,1999.0,2018.0,198.12,235.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,17.0,11.0,150.0,0.0,2014.0,21700540.0,384.0,1.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,8.0,497.0,31.0,0.601,27.2,6.941176,1.79021,1.759494,1.215686,3.517241,32.2,0.506,0.54,0.374,0.86,2036.0,2008.0,2018.0,207.264,245.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,87.0,250.0,867.0,1.0,2020.0,49900090.0,872.0,1.0,8.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,12.0,720.0,40.0,0.696,36.128205,11.631579,4.253165,2.8,2.925926,5.728395,41.7,0.729,0.732,0.6,1.0,2832.0,2014.0,2018.0,216.408,325.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [23]:
data.shape

(481976, 72)

In [24]:
data.isna().sum()

Shot Distance                               75
X Location                                  75
Y Location                                  75
Away Team                                    4
Season Type                                  4
Year                                         4
Shot Zone Basic_Above the Break 3            0
Shot Zone Basic_Backcourt                    0
Shot Zone Basic_In The Paint (Non-RA)        0
Shot Zone Basic_Left Corner 3                0
Shot Zone Basic_Mid-Range                    0
Shot Zone Basic_Restricted Area              0
Shot Zone Basic_Right Corner 3               0
Shot Zone Area_Back Court(BC)                0
Shot Zone Area_Center(C)                     0
Shot Zone Area_Left Side Center(LC)          0
Shot Zone Area_Left Side(L)                  0
Shot Zone Area_Right Side Center(RC)         0
Shot Zone Area_Right Side(R)                 0
Shot Zone Range_16-24 ft.                    0
Shot Zone Range_24+ ft.                      0
Shot Zone Ran

## Add defensive rate of opponent team

In [25]:
metrics = pd.read_csv("../data/raw/team_metrics.csv", index_col=0)

# Ajout du defensive rate de l'équipe opposée
data = data.merge(metrics[['ABBREVIATION', 'Year', 'E_DEF_RATING']], left_on=['OPPONENT_TEAM', 'Year'], right_on=['ABBREVIATION', 'Year'])
data.drop('ABBREVIATION', axis=1, inplace=True)

# Ajout de l'offensive rate de l'équipe qui tire
data = data.merge(metrics[['ABBREVIATION', 'Year', 'E_OFF_RATING']], left_on=['PLAYER1_TEAM_ABBREVIATION', 'Year'], right_on=['ABBREVIATION', 'Year'])
data.drop('ABBREVIATION', axis=1, inplace=True)

## Export 

In [26]:
data.dropna(inplace=True)
data.to_csv('../data/processed/all_shots-v6.csv')

## Add pct depending on the previous action and pct depending on the area

In [11]:
import pandas as pd
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

shots = pd.read_csv("../data/processed/all_shots-v6.csv", index_col=0)

In [14]:
# import datasets
pct_area = pd.read_csv("../data/processed/pourcentage_par_zone.csv", index_col=0)
pct_action = pd.read_csv("../data/processed/pourcentage_par_action_precedente.csv", index_col=0)
pct_action.dropna(inplace=True)
pct_area.dropna(inplace=True)

# get dummies to match the columns' names of shots
pct_action = pd.concat([pct_action, pd.get_dummies(pct_action.PREVIOUS, prefix="PREVIOUS")], axis=1)
pct_area = pd.concat([pct_area, pd.get_dummies(pct_area["Shot Zone"], prefix="Shot Zone", prefix_sep=" ")], axis=1)

# drop unneeded columns
pct_action.drop(['PREVIOUS','Total_Target',	'Count'], axis=1, inplace=True)
pct_area.drop(["Shot Zone",	"Total_Target",	"Count"], axis=1, inplace=True)

# rename PCT column
pct_action.rename({'Pourcentage':'PCT_PREV_ACTION'}, axis=1, inplace=True)
pct_area.rename({'Pourcentage':'PCT_AREA'}, axis=1, inplace=True)

# merge new columns with the shots dataframe
shots = pd.merge(left=shots, right=pct_action, how='left', on=["PLAYER1_NAME",	"Year",	"PREVIOUS_DEF_REBOUND",	"PREVIOUS_OFF_MISSED",	"PREVIOUS_OFF_REBOUND"])
shots = pd.merge(left=shots, right=pct_area, how='left', on=["PLAYER1_NAME", "Year", "Shot Zone Basic_Above the Break 3", 
                                                     "Shot Zone Basic_Backcourt", "Shot Zone Basic_In The Paint (Non-RA)", 
                                                     "Shot Zone Basic_Left Corner 3", "Shot Zone Basic_Mid-Range", 
                                                     "Shot Zone Basic_Restricted Area", "Shot Zone Basic_Right Corner 3"])

shots.head()

Unnamed: 0,Shot Distance,X Location,Y Location,Away Team,Season Type,Year,Shot Zone Basic_Above the Break 3,Shot Zone Basic_Backcourt,Shot Zone Basic_In The Paint (Non-RA),Shot Zone Basic_Left Corner 3,Shot Zone Basic_Mid-Range,Shot Zone Basic_Restricted Area,Shot Zone Basic_Right Corner 3,Shot Zone Area_Back Court(BC),Shot Zone Area_Center(C),Shot Zone Area_Left Side Center(LC),Shot Zone Area_Left Side(L),Shot Zone Area_Right Side Center(RC),Shot Zone Area_Right Side(R),Shot Zone Range_16-24 ft.,Shot Zone Range_24+ ft.,Shot Zone Range_8-16 ft.,Shot Zone Range_Back Court Shot,Shot Zone Range_Less Than 8 ft.,GAME_ID,EVENTNUM,target,PERIOD,PLAYER1_NAME,PLAYER1_TEAM_ABBREVIATION,OPPONENT_TEAM,at_home,PREVIOUS_OFF_REBOUND,PREVIOUS_DEF_REBOUND,PREVIOUS_OFF_TURNOVER,PREVIOUS_OFF_MISSED,PREVIOUS_EVENTMSGTYPE,3PT,jump_shot,layup_shot,dunk_shot,hook_shot,free_throw,DETAILLED_SHOT_TYPE,minutes_left,seconds_left,Age,TS%,PTM,ASTM,ORBM,STLM,BLKM,TOVM,USG%,FG%,2P%,3P%,FT%,PTS,year_start,year_end,height,weight,C,PF,PG,PG-SG,SF,SF-SG,SG,SG-PG,E_DEF_RATING,E_OFF_RATING,YEARS_EXP,PCT_PREV_ACTION,PCT_AREA
0,19.0,-107.0,167.0,PHI,0.0,2000.0,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,20000001,11,0,1,Allen Iverson,PHI,NYK,0.0,False,False,False,False,FREE_THROW,0.0,1.0,0.0,0.0,0.0,0.0,JUMP SHOT,11,668,25.0,0.518,31.084507,4.577465,0.704225,2.507042,0.28169,3.338028,35.9,0.42,0.441,0.32,0.814,2207.0,1997.0,2010.0,182.88,165.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,95.5,100.7,3.0,,35.089974
1,21.0,-115.0,177.0,PHI,0.0,2000.0,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,20000001,32,1,1,Allen Iverson,PHI,NYK,0.0,False,False,False,False,FIELD_GOAL_MADE,0.0,1.0,0.0,0.0,0.0,0.0,JUMP SHOT,7,477,25.0,0.518,31.084507,4.577465,0.704225,2.507042,0.28169,3.338028,35.9,0.42,0.441,0.32,0.814,2207.0,1997.0,2010.0,182.88,165.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,95.5,100.7,3.0,,35.089974
2,16.0,165.0,13.0,PHI,0.0,2000.0,False,False,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False,False,20000001,34,0,1,Allen Iverson,PHI,NYK,0.0,False,False,False,False,FIELD_GOAL_MADE,0.0,1.0,0.0,0.0,0.0,0.0,JUMP SHOT,7,443,25.0,0.518,31.084507,4.577465,0.704225,2.507042,0.28169,3.338028,35.9,0.42,0.441,0.32,0.814,2207.0,1997.0,2010.0,182.88,165.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,95.5,100.7,3.0,,35.089974
3,15.0,0.0,150.0,PHI,0.0,2000.0,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,False,20000001,39,1,1,Allen Iverson,PHI,NYK,0.0,False,False,False,False,FOUL,0.0,0.0,0.0,0.0,0.0,1.0,FREE THROW,7,432,25.0,0.518,31.084507,4.577465,0.704225,2.507042,0.28169,3.338028,35.9,0.42,0.441,0.32,0.814,2207.0,1997.0,2010.0,182.88,165.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,95.5,100.7,3.0,,35.089974
4,15.0,0.0,150.0,PHI,0.0,2000.0,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,False,20000001,40,1,1,Allen Iverson,PHI,NYK,0.0,False,False,False,True,FREE_THROW,0.0,0.0,0.0,0.0,0.0,1.0,FREE THROW,7,432,25.0,0.518,31.084507,4.577465,0.704225,2.507042,0.28169,3.338028,35.9,0.42,0.441,0.32,0.814,2207.0,1997.0,2010.0,182.88,165.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,95.5,100.7,3.0,76.724138,35.089974


In [15]:
# pourcentage of free throws made per player
free_throws = shots[shots.free_throw==True]
pct_free_throw = free_throws.groupby(['PLAYER1_NAME', 'Year']).agg({"target":"mean"}).reset_index()
pct_free_throw.rename({"target":"PCT_FREE_THROW"}, axis=1, inplace=True)

# merge data to the final DF
shots = pd.merge(left=shots, right=pct_free_throw, how='left', on=["PLAYER1_NAME", "Year"])

# replace the PCT_AREA with PCT FREE THROW and drop column
shots.loc[shots.free_throw==True, 'PCT_AREA'] = shots.loc[shots.free_throw==True, 'PCT_FREE_THROW']
shots.drop("PCT_FREE_THROW", axis=1, inplace=True)

# replace NA in PCT_PREV_ACTION with the mean
shots.PCT_PREV_ACTION.fillna(shots.PCT_PREV_ACTION.mean(), inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  shots.PCT_PREV_ACTION.fillna(shots.PCT_PREV_ACTION.mean(), inplace=True)


In [22]:
shots.loc[shots['PCT_AREA']<1, 'PCT_AREA'] = shots.loc[shots['PCT_AREA']<1, 'PCT_AREA'] * 100

## Add years of experience

In [23]:
shots['YEARS_EXP'] = shots['Year']-shots['year_start']

In [24]:
shots.to_csv('../data/processed/all_shots-v6.csv')

In [31]:
import pandas as pd


BEST_PLAYERS =['Kobe Bryant', 'LeBron James', 'Stephen Curry', 'Kevin Durant', 'Dwyane Wade', 'Dirk Nowitzki', 'Tim Duncan', "Shaquille O'Neal", "Steve Nash", "Kawhi Leonard", "James Harden", "Jason Kidd", "Allen Iverson", "Chris Webber", "Kevin Garnett", "Paul Pierce", "Giannis Antetokounmpo", "Jimmy Butler", "Russell Westbrook", "Dwight Howard"]
PLAYERS_DICT = {
    'K. Bryant':'Kobe Bryant', 
    'L. James':'LeBron James', 
    'S. Curry':'Stephen Curry', 
    'K. Durant':'Kevin Durant', 
    'D. Wade':'Dwyane Wade', 
    'D. Nowitzki':'Dirk Nowitzki', 
    'T. Duncan':'Tim Duncan', 
    "S. O'Neal":"Shaquille O'Neal", 
    'S. Nash':"Steve Nash", 
    'K. Leonard':"Kawhi Leonard", 
    'J. Harden':"James Harden", 
    'J. Kidd':"Jason Kidd", 
    'A. Iverson':"Allen Iverson", 
    'C. Webber':"Chris Webber", 
    'K. Garnett':"Kevin Garnett", 
    'P. Pierce':"Paul Pierce", 
    'G. Antetokounmpo':"Giannis Antetokounmpo", 
    'J. Butler':"Jimmy Butler", 
    'R. Westbrook':"Russell Westbrook", 
    'D. Howard':"Dwight Howard"
}
def drop_players(data):
    """ Detele players that are not part of the study
    Args: 
        data : DataFrame
    Returns:
        data : DataFrame
    """
    data = data[data.PLAYER1_NAME.isin(BEST_PLAYERS)]

    return data


def drop_actions(data):
    """ Delete actions that are not shots (rebounds, violations, fouls, timeouts, etc.)
    Args: 
        data : DataFrame
    Returns:
        data : DataFrame
    """
    data = data[data.EVENTMSGTYPE < 4]
    return data 

def update_freethrow_outcome(data):
    """ Update the outcome of a free throw, checks if SCORE is updated. The score is updated only if the goal is made
    Args: 
        data : DataFrame
    Returns:
        data : DataFrame
    """      
    data.loc[(data.EVENTMSGTYPE==3) & (data.SCORE.isna()), 'EVENTMSGTYPE'] = 2  # MISSED
    data.loc[(data.EVENTMSGTYPE==3) & (data.SCORE.notna()), 'EVENTMSGTYPE'] = 1 # MADE
    return data


def create_3pt_feature(data):
    data.loc[(data.HOMEDESCRIPTION.str.contains('3PT')) | (data.VISITORDESCRIPTION.str.contains('3PT')), '3PT'] = 1
    data.loc[data['3PT'].isna(), '3PT'] = 0
    return data

def create_jumpshot_feature(data):    
    data.loc[(data.HOMEDESCRIPTION.str.contains('Jump Shot')) | (data.VISITORDESCRIPTION.str.contains('Jump Shot')), 'jump_shot'] = 1
    data.loc[data['jump_shot'].isna(), 'jump_shot'] = 0
    return data

def create_layup_feature(data):    
    data.loc[(data.HOMEDESCRIPTION.str.contains('Layup')) | (data.VISITORDESCRIPTION.str.contains('Layup')), 'layup_shot'] = 1
    data.loc[data['layup_shot'].isna(), 'layup_shot'] = 0
    return data

def create_dunk_feature(data):    
    data.loc[(data.HOMEDESCRIPTION.str.contains('Dunk')) | (data.VISITORDESCRIPTION.str.contains('Dunk')), 'dunk_shot'] = 1
    data.loc[data['dunk_shot'].isna(), 'dunk_shot'] = 0
    return data

def create_hook_feature(data):    
    data.loc[(data.HOMEDESCRIPTION.str.contains('Hook')) | (data.VISITORDESCRIPTION.str.contains('Hook')), 'hook_shot'] = 1
    data.loc[data['hook_shot'].isna(), 'hook_shot'] = 0
    return data

def create_freethrow_feature(data):    
    data.loc[(data.HOMEDESCRIPTION.str.contains('Free Throw')) | (data.VISITORDESCRIPTION.str.contains('Free Throw')), 'free_throw'] = 1
    data.loc[data['free_throw'].isna(), 'free_throw'] = 0
    return data

def create_previous_actions_features(data):
    # previous action is an offensive rebound
    data.loc[(data['EVENTMSGTYPE'] < 4) & 
       (data['PLAYER1_TEAM_ID'] == data['PLAYER1_TEAM_ID'].shift(1))  & 
       (data['PLAYER1_TEAM_ID'].shift(1) == data['PLAYER1_TEAM_ID'].shift(2)) & 
       (~data['EVENTMSGTYPE'].shift(1).isin([6,8,10])), 'PREVIOUS_OFF_REBOUND'] = True 
       
    data['PREVIOUS_OFF_REBOUND'] = data['PREVIOUS_OFF_REBOUND'].fillna(False)

    # previous action is a defensive rebound
    data.loc[(data['EVENTMSGTYPE'] < 4) & 
        (data['PLAYER1_TEAM_ID'] == data['PLAYER1_TEAM_ID'].shift(1))  & 
        (data['PLAYER1_TEAM_ID'].shift(1) != data['PLAYER1_TEAM_ID'].shift(2)) & 
        (data['EVENTMSGTYPE'].shift(1) == 4), 'PREVIOUS_DEF_REBOUND'] = True

    data['PREVIOUS_DEF_REBOUND'] = data['PREVIOUS_DEF_REBOUND'].fillna(False)

    # previous action is a turnover
    data.loc[(data['EVENTMSGTYPE'] < 4) & 
        (data['PLAYER1_TEAM_ID'] == data['PLAYER1_TEAM_ID'].shift(1))  & 
        (data['EVENTMSGTYPE'].shift(1) == 5), 'PREVIOUS_OFF_TURNOVER'] = True 
        
    data['PREVIOUS_OFF_TURNOVER'] = data['PREVIOUS_OFF_TURNOVER'].fillna(False)

    # previous action is a field goal missed or free throw
    data.loc[(data['EVENTMSGTYPE'] < 4) & 
        (data['PLAYER1_TEAM_ID'] == data['PLAYER1_TEAM_ID'].shift(1))  & 
        (data['EVENTMSGTYPE'].shift(1).isin([2, 3])), 'PREVIOUS_OFF_MISSED'] = True 
        
    data['PREVIOUS_OFF_MISSED'] = data['PREVIOUS_OFF_MISSED'].fillna(False)

    # Explain Previous actions details
    eventmsgtypes = {
        1: "FIELD_GOAL_MADE",
        2 : "FIELD_GOAL_MISSED",
        3 : "FREE_THROW",
        4 : "REBOUND",
        5 : "TURNOVER",
        6 : "FOUL",
        7 : "VIOLATION",
        8 : "SUBSTITUTION",
        9 : "TIMEOUT",
        10 : "JUMP_BALL",
        11 : "EJECTION" ,
        12 : "PERIOD_BEGIN" ,
        13 : "PERIOD_END" 
    }

    data['PREVIOUS_EVENTMSGTYPE'] = data.EVENTMSGTYPE.shift(1)
    data.PREVIOUS_EVENTMSGTYPE = data.PREVIOUS_EVENTMSGTYPE.replace(eventmsgtypes)

    return data

def update_shot_type(data):    
    data.loc[(data.DETAILLED_SHOT_TYPE != 'JUMP SHOT') & (data.DETAILLED_SHOT_TYPE != "FREE THROW"), 'DETAILLED_SHOT_TYPE'] = 'OTHER'
    return data

def detail_shot_type(data):
    eventnames= {
        102:'3PT DRIVING FLOATING BANK JUMP SHOT',
        101:'3PT DRIVING FLOATING JUMP SHOT',
        63:'3PT FADEAWAY JUMPER',
        78:'3PT FLOATING JUMP SHOT',
        66:'3PT JUMP BANK SHOT',
        1:'3PT JUMP SHOT',
        10:'FREE THROW',
        11:'FREE THROW',
        12:'FREE THROW',
        13:'FREE THROW',
        14:'FREE THROW',
        15:'FREE THROW',
        18:'FREE THROW',
        19:'FREE THROW',
        56:'RUNNING HOOK SHOT',
        59:'FINGER ROLL',
        40:'LAYUP',
        45:'JUMP SHOT',
        61:'DRIVING FINGER ROLL',
        79:'3PT PULLUP JUMP SHOT',
        2:'3PT RUNNING JUMP SHOT',
        103:'3PT RUNNING PULL',
        104:'3PT STEP BACK BANK JUMP SHOT',
        80:'3PT STEP BACK JUMP SHOT',
        86:'3PT TURNAROUND FADEAWAY',
        105:'3PT TURNAROUND FADEAWAY BANK JUMP SHOT',
        47:'3PT TURNAROUND JUMP SHOT',
        52:'ALLEY OOP DUNK',
        43:'ALLEY OOP LAYUP',
        108:'CUTTING DUNK SHOT',
        99:'CUTTING FINGER ROLL LAYUP SHOT',
        98:'CUTTING LAYUP SHOT',
        93:'DRIVING BANK HOOK SHOT',
        9:'DRIVING DUNK',
        75:'DRIVING FINGER ROLL LAYUP',
        102:'DRIVING FLOATING BANK JUMP SHOT',
        101:'DRIVING FLOATING JUMP SHOT',
        57:'DRIVING HOOK SHOT',
        6:'DRIVING LAYUP',
        109:'DRIVING REVERSE DUNK SHOT',
        73:'DRIVING REVERSE LAYUP',
        7:'DUNK',
        63:'FADEAWAY JUMPER',
        71:'FINGER ROLL LAYUP',
        78:'FLOATING JUMP SHOT',
        67:'HOOK BANK SHOT',
        3:'HOOK SHOT',
        66:'JUMP BANK SHOT',
        1:'JUMP SHOT',
        5:'LAYUP',
        79:'PULLUP JUMP SHOT',
        87:'PUTBACK DUNK',
        72:'PUTBACK LAYUP',
        51:'REVERSE DUNK',
        44:'REVERSE LAYUP',
        106:'RUNNING ALLEY OOP DUNK SHOT',
        100:'RUNNING ALLEY OOP LAYUP SHOT',
        50:'RUNNING DUNK',
        76:'RUNNING FINGER ROLL LAYUP',
        2:'RUNNING JUMP SHOT',
        41:'RUNNING LAYUP',
        103:'RUNNING PULL',
        110:'RUNNING REVERSE DUNK SHOT',
        74:'RUNNING REVERSE LAYUP',
        104:'STEP BACK BANK JUMP SHOT',
        80:'STEP BACK JUMP SHOT',
        107:'TIP DUNK SHOT',
        97:'TIP LAYUP SHOT',
        96:'TURNAROUND BANK HOOK SHOT',
        86:'TURNAROUND FADEAWAY',
        105:'TURNAROUND FADEAWAY BANK JUMP SHOT',
        58:'TURNAROUND HOOK SHOT',
        47:'TURNAROUND JUMP SHOT',
        42:'LAYUP',
        49:'DRIVING_DUNK',
        46:'RUNNING_JUMP_SHOT',
        8:'SLAM_DUNK',
        4:'TIP_SHOT',
        16: "FREE THROW",
        17: "FREE THROW",
        55: "HOOK SHOT",                         
        48: "DUNK SHOT",                        
        60: "RUNNING FINGER ROLL",                         
        53: "TIP SHOT",
    }

    data["DETAILLED_SHOT_TYPE"] = data.EVENTMSGACTIONTYPE.replace(eventnames)

    return data

def clean_data(data):

    # add feature "opponent team"
    games = data[['GAME_ID', 'PLAYER1_TEAM_ABBREVIATION']].dropna().drop_duplicates()
    for _, game in games.iterrows():
        data.loc[(data.GAME_ID == game.GAME_ID) & (data.PLAYER1_TEAM_ABBREVIATION != game.PLAYER1_TEAM_ABBREVIATION), 'OPPONENT_TEAM'] = game.PLAYER1_TEAM_ABBREVIATION

    # Change index
    data = data.set_index(['GAME_ID', 'EVENTNUM'])

    # Create feature at_home to see if players perform better when they're at home
    data.loc[data.HOMEDESCRIPTION.notna(), 'at_home'] = 1
    data.loc[data.HOMEDESCRIPTION.isna(), 'at_home'] = 0

    data.loc[data.HOMEDESCRIPTION.isna(), 'HOMEDESCRIPTION'] = ''
    data.loc[data.VISITORDESCRIPTION.isna(), 'VISITORDESCRIPTION'] = ''

    data = create_previous_actions_features(data)

    data = drop_players(data)
    data = drop_actions(data)
    data = update_freethrow_outcome(data)
    
    # Transform EVENTMSGTYPE to have 0 = MISSED and 1 = MADE
    data.loc[data.EVENTMSGTYPE==2, 'EVENTMSGTYPE'] = 0
    
    # Create features from HOMEDESCRIPTION and VISITORDESCRIPTION
    data = create_3pt_feature(data)
    data = create_jumpshot_feature(data)
    data = create_layup_feature(data)
    data = create_dunk_feature(data)
    data = create_hook_feature(data)
    data = create_freethrow_feature(data)
    data = detail_shot_type(data)

    # Transform PCTIMESTRING as time
    data.PCTIMESTRING = pd.to_datetime(data.PCTIMESTRING, format="%M:%S") 

    # Create features from PCTIMESTRING
    data['minutes_left'] = data['PCTIMESTRING'].dt.minute
    data['seconds_left'] = data['PCTIMESTRING'].dt.minute*60 + data['PCTIMESTRING'].dt.second
    
    # Drop columns    
    data = data.drop(['HOMEDESCRIPTION', 'NEUTRALDESCRIPTION', 'PERSON2TYPE', 'PERSON3TYPE', 'PLAYER2_ID', 'PLAYER2_NAME', 'PLAYER2_TEAM_ABBREVIATION', \
                'PLAYER2_TEAM_CITY', 'PLAYER2_TEAM_ID', 'PLAYER1_TEAM_NICKNAME', 'PLAYER1_ID', 'PLAYER1_TEAM_CITY', 'PLAYER1_TEAM_ID','PLAYER2_TEAM_NICKNAME', \
                'PLAYER3_ID', 'PLAYER3_NAME', 'PLAYER3_TEAM_ABBREVIATION', 'PLAYER3_TEAM_CITY', 'PLAYER3_TEAM_ID', 'PLAYER3_TEAM_NICKNAME', 'SCORE', \
                'SCOREMARGIN', 'VISITORDESCRIPTION', 'WCTIMESTRING', 'EVENTMSGACTIONTYPE', 'PCTIMESTRING', 'PERSON1TYPE'], axis = 1)

    # Rename target column
    data = data.rename({'EVENTMSGTYPE': 'target'}, axis=1)

    data = update_shot_type(data)

    return data

In [44]:
# Load data
pct_area = pd.read_csv("../data/processed/pourcentage_par_zone.csv", index_col=0)
pct_action = pd.read_csv("../data/processed/pourcentage_par_action_precedente.csv", index_col=0)

shot_locations = pd.read_csv("../data/processed/Shot_Locations_top_20_players_2000to2020.csv")
df_players = pd.read_csv("../data/processed/stat_joueurs_streamlit.csv", index_col=0)
metrics = pd.read_csv("../data/raw/team_metrics.csv", index_col=0)
pct_action.dropna(inplace=True)
pct_area.dropna(inplace=True)

In [45]:


# Clean all datasets from 2000 to 2019
# files = ['2000-01_pbp.csv','2001-02_pbp.csv','2002-03_pbp.csv','2003-04_pbp.csv','2004-05_pbp.csv',
#         '2005-06_pbp.csv','2006-07_pbp.csv','2007-08_pbp.csv','2008-09_pbp.csv','2009-10_pbp.csv',
#         '2010-11_pbp.csv','2011-12_pbp.csv','2012-13_pbp.csv','2013-14_pbp.csv','2014-15_pbp.csv',
#         '2015-16_pbp.csv','2016-17_pbp.csv','2017-18_pbp.csv','2018-19_pbp.csv', 'missing_pbp_2019-2020.csv',
#         'missing_pbp.csv']

files = ['2000-01_pbp.csv']
all_data = []

for file in files:
    data = pd.read_csv("../data/raw/" + file, index_col=0)    
    all_data.append(clean_data(data))


# Concat all play by play data
all_plays = pd.concat(all_data)

if 'VIDEO_AVAILABLE_FLAG' in all_plays.columns:
    all_plays.drop('VIDEO_AVAILABLE_FLAG', axis=1, inplace=True)

all_plays.reset_index(inplace=True)

  data['PREVIOUS_OFF_REBOUND'] = data['PREVIOUS_OFF_REBOUND'].fillna(False)
  data['PREVIOUS_DEF_REBOUND'] = data['PREVIOUS_DEF_REBOUND'].fillna(False)
  data['PREVIOUS_OFF_TURNOVER'] = data['PREVIOUS_OFF_TURNOVER'].fillna(False)
  data['PREVIOUS_OFF_MISSED'] = data['PREVIOUS_OFF_MISSED'].fillna(False)


In [46]:
# Merge shot location    
df_merged = shot_locations.merge(all_plays, how='outer', left_on=['Game ID','Game Event ID'], right_on=['GAME_ID','EVENTNUM'])

# Fill Year feature with data from complete lines 
games = df_merged[['GAME_ID', 'Year']].drop_duplicates().dropna()

for _, game in games.iterrows():
    df_merged.loc[df_merged.GAME_ID==game['GAME_ID'], 'Year']=game['Year']

# Keep only the first 4 caracters to have the year
df_players['Year'] = df_players['Year'].astype(str).str.slice(0, 4).astype(int)

# Merge player stats
data = df_merged.merge(df_players, how='left', left_on=['PLAYER1_NAME', 'Year'], right_on=['Player', 'Year'])

# drop duplicated columns
data = data.drop(['Game ID', 'Game Event ID', 'Player Name', 'Player', 'Team Name', 'Period', 'Player ID', 'Team ID' ], axis = 1)
# update shot location for all free throws 
data.loc[data['free_throw']==1, 'Shot Zone Basic_Mid-Range'] = True
data.loc[data['free_throw']==1, 'Shot Zone Area_Center(C)'] = True
data.loc[data['free_throw']==1, 'Shot Zone Range_8-16 ft.'] = True
data.loc[data['free_throw']==1, 'Shot Distance'] = 15.0
data.loc[data['free_throw']==1, 'X Location'] = 0
data.loc[data['free_throw']==1, 'Y Location'] = 150

data.fillna({'Shot Zone Basic_Above the Break 3':False}, inplace=True)
data.fillna({'Shot Zone Basic_Backcourt': False}, inplace=True)
data.fillna({'Shot Zone Basic_In The Paint (Non-RA)': False}, inplace=True)
data.fillna({'Shot Zone Basic_Left Corner 3' : False}, inplace=True)
data.fillna({'Shot Zone Basic_Mid-Range' : False}, inplace=True)
data.fillna({'Shot Zone Basic_Restricted Area' : False}, inplace=True)
data.fillna({'Shot Zone Basic_Right Corner 3' : False}, inplace=True)
data.fillna({'Shot Zone Area_Back Court(BC)' : False}, inplace=True)
data.fillna({'Shot Zone Area_Center(C)' : False}, inplace=True)
data.fillna({'Shot Zone Area_Left Side Center(LC)' : False}, inplace=True)
data.fillna({'Shot Zone Area_Left Side(L)' : False}, inplace=True)
data.fillna({'Shot Zone Area_Right Side Center(RC)' : False}, inplace=True)
data.fillna({'Shot Zone Area_Right Side(R)' : False}, inplace=True)
data.fillna({'Shot Zone Range_16-24 ft.' : False}, inplace=True)
data.fillna({'Shot Zone Range_24+ ft.' : False}, inplace=True)
data.fillna({'Shot Zone Range_8-16 ft.' : False}, inplace=True)
data.fillna({'Shot Zone Range_Back Court Shot' : False}, inplace=True)
data.fillna({'Shot Zone Range_Less Than 8 ft.' : False}, inplace=True)
data.fillna({"3P%":0}, inplace=True)

# drop Shot Made Flag : target is the same with no NAs
data.drop('Shot Made Flag', axis=1, inplace = True)

# drop Home Team : PLAYER1_TEAM_ABBREVIATION is the same with no NAs
data.drop('Home Team', axis=1, inplace = True)

  data.fillna({'Shot Zone Basic_Above the Break 3':False}, inplace=True)
  data.fillna({'Shot Zone Basic_Backcourt': False}, inplace=True)
  data.fillna({'Shot Zone Basic_In The Paint (Non-RA)': False}, inplace=True)
  data.fillna({'Shot Zone Basic_Left Corner 3' : False}, inplace=True)
  data.fillna({'Shot Zone Basic_Mid-Range' : False}, inplace=True)
  data.fillna({'Shot Zone Basic_Restricted Area' : False}, inplace=True)
  data.fillna({'Shot Zone Basic_Right Corner 3' : False}, inplace=True)
  data.fillna({'Shot Zone Area_Back Court(BC)' : False}, inplace=True)
  data.fillna({'Shot Zone Area_Center(C)' : False}, inplace=True)
  data.fillna({'Shot Zone Area_Left Side Center(LC)' : False}, inplace=True)
  data.fillna({'Shot Zone Area_Left Side(L)' : False}, inplace=True)
  data.fillna({'Shot Zone Area_Right Side Center(RC)' : False}, inplace=True)
  data.fillna({'Shot Zone Area_Right Side(R)' : False}, inplace=True)
  data.fillna({'Shot Zone Range_16-24 ft.' : False}, inplace=True)
  

In [47]:
data.head()

Unnamed: 0,Shot Distance,X Location,Y Location,Away Team,Season Type,Year,Shot Zone Basic_Above the Break 3,Shot Zone Basic_Backcourt,Shot Zone Basic_In The Paint (Non-RA),Shot Zone Basic_Left Corner 3,Shot Zone Basic_Mid-Range,Shot Zone Basic_Restricted Area,Shot Zone Basic_Right Corner 3,Shot Zone Area_Back Court(BC),Shot Zone Area_Center(C),Shot Zone Area_Left Side Center(LC),Shot Zone Area_Left Side(L),Shot Zone Area_Right Side Center(RC),Shot Zone Area_Right Side(R),Shot Zone Range_16-24 ft.,Shot Zone Range_24+ ft.,Shot Zone Range_8-16 ft.,Shot Zone Range_Back Court Shot,Shot Zone Range_Less Than 8 ft.,GAME_ID,EVENTNUM,target,PERIOD,PLAYER1_NAME,PLAYER1_TEAM_ABBREVIATION,OPPONENT_TEAM,at_home,PREVIOUS_OFF_REBOUND,PREVIOUS_DEF_REBOUND,PREVIOUS_OFF_TURNOVER,PREVIOUS_OFF_MISSED,PREVIOUS_EVENTMSGTYPE,3PT,jump_shot,layup_shot,dunk_shot,hook_shot,free_throw,DETAILLED_SHOT_TYPE,minutes_left,seconds_left,Age,TS%,PTM,ASTM,ORBM,STLM,BLKM,TOVM,USG%,FG%,2P%,3P%,FT%,PTS,year_start,year_end,position,height,weight,birth_date,college,C,PF,PG,PG-SG,SF,SF-SG,SG,SG-PG,Team
0,19.0,-107.0,167.0,PHI,0.0,2000.0,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,20000001.0,11.0,0.0,1.0,Allen Iverson,PHI,NYK,0.0,False,False,False,False,FREE_THROW,0.0,1.0,0.0,0.0,0.0,0.0,JUMP SHOT,11.0,668.0,25.0,0.518,31.084507,4.577465,0.704225,2.507042,0.28169,3.338028,35.9,0.42,0.441,0.32,0.814,2207.0,1997.0,2010.0,G,182.88,165.0,"June 7, 1975",Georgetown University,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,PHI
1,21.0,-115.0,177.0,PHI,0.0,2000.0,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,20000001.0,32.0,1.0,1.0,Allen Iverson,PHI,NYK,0.0,False,False,False,False,FIELD_GOAL_MADE,0.0,1.0,0.0,0.0,0.0,0.0,JUMP SHOT,7.0,477.0,25.0,0.518,31.084507,4.577465,0.704225,2.507042,0.28169,3.338028,35.9,0.42,0.441,0.32,0.814,2207.0,1997.0,2010.0,G,182.88,165.0,"June 7, 1975",Georgetown University,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,PHI
2,16.0,165.0,13.0,PHI,0.0,2000.0,False,False,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False,False,20000001.0,34.0,0.0,1.0,Allen Iverson,PHI,NYK,0.0,False,False,False,False,FIELD_GOAL_MADE,0.0,1.0,0.0,0.0,0.0,0.0,JUMP SHOT,7.0,443.0,25.0,0.518,31.084507,4.577465,0.704225,2.507042,0.28169,3.338028,35.9,0.42,0.441,0.32,0.814,2207.0,1997.0,2010.0,G,182.88,165.0,"June 7, 1975",Georgetown University,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,PHI
3,15.0,0.0,150.0,,,2000.0,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,False,20000001.0,39.0,1.0,1.0,Allen Iverson,PHI,NYK,0.0,False,False,False,False,FOUL,0.0,0.0,0.0,0.0,0.0,1.0,FREE THROW,7.0,432.0,25.0,0.518,31.084507,4.577465,0.704225,2.507042,0.28169,3.338028,35.9,0.42,0.441,0.32,0.814,2207.0,1997.0,2010.0,G,182.88,165.0,"June 7, 1975",Georgetown University,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,PHI
4,15.0,0.0,150.0,,,2000.0,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,False,20000001.0,40.0,1.0,1.0,Allen Iverson,PHI,NYK,0.0,False,False,False,True,FREE_THROW,0.0,0.0,0.0,0.0,0.0,1.0,FREE THROW,7.0,432.0,25.0,0.518,31.084507,4.577465,0.704225,2.507042,0.28169,3.338028,35.9,0.42,0.441,0.32,0.814,2207.0,1997.0,2010.0,G,182.88,165.0,"June 7, 1975",Georgetown University,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,PHI


In [48]:
# Merge team stats
# Fill Season Type and Away Team with data from complete lines
games = data[['GAME_ID', 'Away Team', 'Season Type']].drop_duplicates().dropna()

for _, game in games.iterrows():
    data.loc[data.GAME_ID==game['GAME_ID'], 'Away Team']=game['Away Team']
    data.loc[data.GAME_ID==game['GAME_ID'], 'Season Type']=game['Season Type']

# Ajout du defensive rate de l'équipe opposée
data = data.merge(metrics[['ABBREVIATION', 'Year', 'E_DEF_RATING']], left_on=['OPPONENT_TEAM', 'Year'], right_on=['ABBREVIATION', 'Year'])
data.drop('ABBREVIATION', axis=1, inplace=True)

# Ajout de l'offensive rate de l'équipe qui tire
data = data.merge(metrics[['ABBREVIATION', 'Year', 'E_OFF_RATING']], left_on=['PLAYER1_TEAM_ABBREVIATION', 'Year'], right_on=['ABBREVIATION', 'Year'])
data.drop('ABBREVIATION', axis=1, inplace=True)
data.dropna(inplace=True)

# Ajout des années d'expérience
data['YEARS_EXP'] = data['Year']-data['year_start']


# get dummies to match the columns' names of shots
pct_action = pd.concat([pct_action, pd.get_dummies(pct_action.PREVIOUS, prefix="PREVIOUS")], axis=1)
pct_area = pd.concat([pct_area, pd.get_dummies(pct_area["Shot Zone"], prefix="Shot Zone", prefix_sep=" ")], axis=1)

# drop unneeded columns
pct_action.drop(['PREVIOUS','Total_Target',	'Count'], axis=1, inplace=True)
pct_area.drop(["Shot Zone",	"Total_Target",	"Count"], axis=1, inplace=True)

# rename PCT column
pct_action.rename({'Pourcentage':'PCT_PREV_ACTION'}, axis=1, inplace=True)
pct_area.rename({'Pourcentage':'PCT_AREA'}, axis=1, inplace=True)

# merge new columns with the shots dataframe
data = pd.merge(left=data, right=pct_action, how='left', on=["PLAYER1_NAME",	"Year",	"PREVIOUS_DEF_REBOUND",	"PREVIOUS_OFF_MISSED",	"PREVIOUS_OFF_REBOUND"])
data = pd.merge(left=data, right=pct_area, how='left', on=["PLAYER1_NAME", "Year", "Shot Zone Basic_Above the Break 3", 
                                                    "Shot Zone Basic_Backcourt", "Shot Zone Basic_In The Paint (Non-RA)", 
                                                    "Shot Zone Basic_Left Corner 3", "Shot Zone Basic_Mid-Range", 
                                                    "Shot Zone Basic_Restricted Area", "Shot Zone Basic_Right Corner 3"])


data = pd.concat([pd.get_dummies(data.DETAILLED_SHOT_TYPE, prefix="DETAILLED_SHOT_TYPE"), data], axis=1)
data.head()

Unnamed: 0,DETAILLED_SHOT_TYPE_FREE THROW,DETAILLED_SHOT_TYPE_JUMP SHOT,DETAILLED_SHOT_TYPE_OTHER,Shot Distance,X Location,Y Location,Away Team,Season Type,Year,Shot Zone Basic_Above the Break 3,Shot Zone Basic_Backcourt,Shot Zone Basic_In The Paint (Non-RA),Shot Zone Basic_Left Corner 3,Shot Zone Basic_Mid-Range,Shot Zone Basic_Restricted Area,Shot Zone Basic_Right Corner 3,Shot Zone Area_Back Court(BC),Shot Zone Area_Center(C),Shot Zone Area_Left Side Center(LC),Shot Zone Area_Left Side(L),Shot Zone Area_Right Side Center(RC),Shot Zone Area_Right Side(R),Shot Zone Range_16-24 ft.,Shot Zone Range_24+ ft.,Shot Zone Range_8-16 ft.,Shot Zone Range_Back Court Shot,Shot Zone Range_Less Than 8 ft.,GAME_ID,EVENTNUM,target,PERIOD,PLAYER1_NAME,PLAYER1_TEAM_ABBREVIATION,OPPONENT_TEAM,at_home,PREVIOUS_OFF_REBOUND,PREVIOUS_DEF_REBOUND,PREVIOUS_OFF_TURNOVER,PREVIOUS_OFF_MISSED,PREVIOUS_EVENTMSGTYPE,3PT,jump_shot,layup_shot,dunk_shot,hook_shot,free_throw,DETAILLED_SHOT_TYPE,minutes_left,seconds_left,Age,TS%,PTM,ASTM,ORBM,STLM,BLKM,TOVM,USG%,FG%,2P%,3P%,FT%,PTS,year_start,year_end,position,height,weight,birth_date,college,C,PF,PG,PG-SG,SF,SF-SG,SG,SG-PG,Team,E_DEF_RATING,E_OFF_RATING,YEARS_EXP,PCT_PREV_ACTION,PCT_AREA
0,False,True,False,19.0,-107.0,167.0,PHI,0.0,2000.0,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,20000001.0,11.0,0.0,1.0,Allen Iverson,PHI,NYK,0.0,False,False,False,False,FREE_THROW,0.0,1.0,0.0,0.0,0.0,0.0,JUMP SHOT,11.0,668.0,25.0,0.518,31.084507,4.577465,0.704225,2.507042,0.28169,3.338028,35.9,0.42,0.441,0.32,0.814,2207.0,1997.0,2010.0,G,182.88,165.0,"June 7, 1975",Georgetown University,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,PHI,95.5,100.7,3.0,,35.089974
1,False,True,False,21.0,-115.0,177.0,PHI,0.0,2000.0,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,20000001.0,32.0,1.0,1.0,Allen Iverson,PHI,NYK,0.0,False,False,False,False,FIELD_GOAL_MADE,0.0,1.0,0.0,0.0,0.0,0.0,JUMP SHOT,7.0,477.0,25.0,0.518,31.084507,4.577465,0.704225,2.507042,0.28169,3.338028,35.9,0.42,0.441,0.32,0.814,2207.0,1997.0,2010.0,G,182.88,165.0,"June 7, 1975",Georgetown University,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,PHI,95.5,100.7,3.0,,35.089974
2,False,True,False,16.0,165.0,13.0,PHI,0.0,2000.0,False,False,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False,False,20000001.0,34.0,0.0,1.0,Allen Iverson,PHI,NYK,0.0,False,False,False,False,FIELD_GOAL_MADE,0.0,1.0,0.0,0.0,0.0,0.0,JUMP SHOT,7.0,443.0,25.0,0.518,31.084507,4.577465,0.704225,2.507042,0.28169,3.338028,35.9,0.42,0.441,0.32,0.814,2207.0,1997.0,2010.0,G,182.88,165.0,"June 7, 1975",Georgetown University,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,PHI,95.5,100.7,3.0,,35.089974
3,True,False,False,15.0,0.0,150.0,PHI,0.0,2000.0,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,False,20000001.0,39.0,1.0,1.0,Allen Iverson,PHI,NYK,0.0,False,False,False,False,FOUL,0.0,0.0,0.0,0.0,0.0,1.0,FREE THROW,7.0,432.0,25.0,0.518,31.084507,4.577465,0.704225,2.507042,0.28169,3.338028,35.9,0.42,0.441,0.32,0.814,2207.0,1997.0,2010.0,G,182.88,165.0,"June 7, 1975",Georgetown University,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,PHI,95.5,100.7,3.0,,35.089974
4,True,False,False,15.0,0.0,150.0,PHI,0.0,2000.0,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,False,20000001.0,40.0,1.0,1.0,Allen Iverson,PHI,NYK,0.0,False,False,False,True,FREE_THROW,0.0,0.0,0.0,0.0,0.0,1.0,FREE THROW,7.0,432.0,25.0,0.518,31.084507,4.577465,0.704225,2.507042,0.28169,3.338028,35.9,0.42,0.441,0.32,0.814,2207.0,1997.0,2010.0,G,182.88,165.0,"June 7, 1975",Georgetown University,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,PHI,95.5,100.7,3.0,76.724138,35.089974


In [49]:
# Remove unwanted columns
optuna_columns = ['Shot Distance',
                'Season Type',
                'Shot Zone Basic_In The Paint (Non-RA)',
                'Shot Zone Basic_Right Corner 3',
                'Shot Zone Area_Right Side(R)',
                'Shot Zone Range_8-16 ft.',
                'at_home',
                'PREVIOUS_OFF_MISSED',
                'Age',
                'ASTM',
                'ORBM',
                'FT%',
                'height',
                'weight',
                'C',
                'SG-PG',
                'E_DEF_RATING',
                'PCT_AREA',
                'DETAILLED_SHOT_TYPE_JUMP SHOT']

data[optuna_columns + ['target']]

# Save file
#data.to_csv("../data/processed/data.csv", index=False)

Unnamed: 0,Shot Distance,Season Type,Shot Zone Basic_In The Paint (Non-RA),Shot Zone Basic_Right Corner 3,Shot Zone Area_Right Side(R),Shot Zone Range_8-16 ft.,at_home,PREVIOUS_OFF_MISSED,Age,ASTM,ORBM,FT%,height,weight,C,SG-PG,E_DEF_RATING,PCT_AREA,DETAILLED_SHOT_TYPE_JUMP SHOT,target
0,19.0,0.0,False,False,False,False,0.0,False,25.0,4.577465,0.704225,0.814,182.880,165.0,0.0,0.0,95.5,35.089974,True,0.0
1,21.0,0.0,False,False,False,False,0.0,False,25.0,4.577465,0.704225,0.814,182.880,165.0,0.0,0.0,95.5,35.089974,True,1.0
2,16.0,0.0,False,False,True,False,0.0,False,25.0,4.577465,0.704225,0.814,182.880,165.0,0.0,0.0,95.5,35.089974,True,0.0
3,15.0,0.0,False,False,False,True,0.0,False,25.0,4.577465,0.704225,0.814,182.880,165.0,0.0,0.0,95.5,35.089974,False,1.0
4,15.0,0.0,False,False,False,True,0.0,True,25.0,4.577465,0.704225,0.814,182.880,165.0,0.0,0.0,95.5,35.089974,False,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11853,16.0,0.0,False,False,False,False,1.0,False,28.0,9.853659,1.585366,0.814,195.072,205.0,0.0,0.0,101.6,36.842105,True,1.0
11854,17.0,0.0,False,False,False,False,1.0,False,28.0,9.853659,1.585366,0.814,195.072,205.0,0.0,0.0,101.6,36.842105,True,1.0
11855,26.0,0.0,False,False,False,False,1.0,False,28.0,9.853659,1.585366,0.814,195.072,205.0,0.0,0.0,101.6,30.115830,True,0.0
11856,18.0,0.0,False,False,False,False,1.0,False,28.0,9.853659,1.585366,0.814,195.072,205.0,0.0,0.0,101.6,36.842105,True,0.0
