In [66]:
# Season label; the cleaning cells interpolate it into both the raw input
# and the clean output file paths under ../data/years/.
SEASON = '2021-22'

In [67]:
# PER GAME CLEANING
# Reads the raw per-game player table for SEASON, tidies the identifying
# columns ([Player], [PlayerID], [Pos], [Tm] -> [Tms]), keeps one row per
# player, and writes the result to the season's clean/ directory.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_PerGame.csv')

# Add column [PlayerID] -- whatever the myconstants pattern captures from
# [Player] (going by its name: the text after the backslash)
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace(r'\\', '', regex=True)  # regex for one literal backslash

# [Rk] drop -- unnecessary
# Use the keyword form: passing the axis positionally (`df.drop('Rk', 1)`) was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
# NOTE(review): grouping and de-duplicating on the display name would merge two
# different players who share a name; [PlayerID] looks like the safer key --
# confirm it is always populated in the raw export before switching.
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join)  # has all teams a player was on
# To keep the TOT row when dropping duplicate player rows, rename TOT to a value
# that sorts after every real team code, sort by [Tm], then keep the last row
# of each player.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and its separating comma from [Tms] (leading/middle, then trailing)
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(columns='Tm')  # [Tm] no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_PerGame.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
260,Aaron Gordon,gordoaa01,DEN,PF,26,75,75,31.7,5.8,11.1,0.520,1.2,3.5,0.335,4.6,7.7,0.605,0.573,2.3,3.1,0.743,1.7,4.2,5.9,2.5,0.6,0.6,1.8,2.0,15.0
303,Aaron Henry,henryaa01,PHI,SF,22,6,0,2.8,0.2,0.8,0.200,0.0,0.2,0.000,0.2,0.7,0.250,0.200,0.0,0.0,,0.0,0.2,0.2,0.0,0.0,0.3,0.3,0.3,0.3
321,Aaron Holiday,holidaa01,"WAS,PHO",PG,25,63,15,16.2,2.4,5.4,0.447,0.6,1.6,0.379,1.8,3.7,0.477,0.504,0.9,1.1,0.868,0.4,1.6,1.9,2.4,0.7,0.1,1.1,1.5,6.3
548,Aaron Nesmith,nesmiaa01,BOS,SF,22,52,3,11.0,1.4,3.5,0.396,0.6,2.2,0.270,0.8,1.3,0.612,0.481,0.4,0.5,0.808,0.3,1.4,1.7,0.4,0.4,0.1,0.6,1.3,3.8
781,Aaron Wiggins,wiggiaa01,OKC,SG,23,50,35,24.2,3.1,6.7,0.463,0.8,2.8,0.304,2.3,4.0,0.573,0.525,1.2,1.7,0.729,1.0,2.5,3.6,1.4,0.6,0.2,1.1,1.9,8.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442,Zach LaVine,lavinza01,CHI,SF,26,67,67,34.7,8.4,17.7,0.476,2.8,7.1,0.389,5.7,10.6,0.534,0.554,4.8,5.6,0.853,0.3,4.3,4.6,4.5,0.6,0.3,2.6,1.8,24.4
673,Zavier Simpson,simpsza01,OKC,PG,24,4,4,43.5,4.8,13.0,0.365,0.3,2.0,0.125,4.5,11.0,0.409,0.375,1.3,1.3,1.000,0.5,4.8,5.3,7.5,1.3,1.0,2.5,3.8,11.0
553,Zeke Nnaji,nnajize01,DEN,PF,21,41,1,17.0,2.3,4.5,0.516,0.9,2.0,0.463,1.4,2.6,0.557,0.616,1.0,1.6,0.631,1.4,2.2,3.6,0.4,0.4,0.3,0.6,1.9,6.6
790,Ziaire Williams,willizi02,MEM,SF,20,62,31,21.7,3.1,6.8,0.450,1.2,3.9,0.314,1.9,2.9,0.632,0.540,0.7,0.9,0.782,0.4,1.7,2.1,1.0,0.6,0.2,0.7,1.8,8.1


In [68]:
# ADVANCED PLAYER CLEANING
# Reads the raw advanced-stats player table for SEASON, tidies the identifying
# columns ([Player], [PlayerID], [Pos], [Tm] -> [Tms]), keeps one row per
# player, drops the unnamed columns, and writes the result to the season's
# clean/ directory.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_Advanced.csv')

# Add column [PlayerID] -- whatever the myconstants pattern captures from
# [Player] (going by its name: the text after the backslash)
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace(r'\\', '', regex=True)  # regex for one literal backslash

# [Rk] drop -- unnecessary
# Use the keyword form: passing the axis positionally (`df.drop('Rk', 1)`) was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
# NOTE(review): grouping and de-duplicating on the display name would merge two
# different players who share a name; [PlayerID] looks like the safer key --
# confirm it is always populated in the raw export before switching.
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join)  # has all teams a player was on
# To keep the TOT row when dropping duplicate player rows, rename TOT to a value
# that sorts after every real team code, sort by [Tm], then keep the last row
# of each player.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and its separating comma from [Tms] (leading/middle, then trailing)
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(columns='Tm')  # [Tm] no longer needed

# remove unnamed columns (any column whose header starts with "Unnamed")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_Advanced.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
260,Aaron Gordon,gordoaa01,DEN,PF,26,75,2376,15.3,0.602,0.312,0.276,6.1,14.3,10.3,11.6,0.9,1.7,12.5,19.7,3.2,2.0,5.2,0.105,0.5,-1.1,-0.6,0.9
303,Aaron Henry,henryaa01,PHI,SF,22,6,17,-7.4,0.200,0.200,0.000,0.0,6.5,3.3,0.0,0.0,10.6,28.6,18.5,-0.1,0.0,-0.1,-0.306,-14.0,-2.4,-16.5,-0.1
321,Aaron Holiday,holidaa01,"WAS,PHO",PG,25,63,1021,12.6,0.544,0.305,0.201,2.6,10.3,6.5,20.7,2.0,0.7,15.4,18.7,0.5,0.9,1.5,0.068,-1.9,0.3,-1.7,0.1
548,Aaron Nesmith,nesmiaa01,BOS,SF,22,52,574,7.3,0.507,0.632,0.143,2.9,13.6,8.4,5.4,1.7,0.8,13.8,17.2,-0.4,0.9,0.4,0.038,-4.9,0.7,-4.3,-0.3
781,Aaron Wiggins,wiggiaa01,OKC,SG,23,50,1209,10.3,0.556,0.409,0.252,4.3,11.0,7.6,8.5,1.2,0.8,12.6,15.3,0.5,0.8,1.2,0.048,-3.4,-0.9,-4.3,-0.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442,Zach LaVine,lavinza01,CHI,SF,26,67,2328,20.0,0.605,0.401,0.316,1.1,13.8,7.4,20.8,0.9,0.9,11.2,28.8,4.5,1.3,5.8,0.120,3.9,-1.5,2.4,2.6
673,Zavier Simpson,simpsza01,OKC,PG,24,4,174,7.7,0.406,0.154,0.096,1.2,11.4,6.2,25.2,1.4,2.1,15.6,15.9,-0.2,0.1,-0.1,-0.018,-6.0,-1.9,-7.9,-0.3
553,Zeke Nnaji,nnajize01,DEN,PF,21,41,698,13.5,0.629,0.430,0.349,9.3,14.1,11.8,3.4,1.1,1.7,9.7,14.9,1.2,0.6,1.8,0.123,-0.7,-1.6,-2.2,0.0
790,Ziaire Williams,willizi02,MEM,SF,20,62,1346,9.7,0.559,0.571,0.130,2.0,8.2,5.0,6.3,1.2,0.8,8.4,14.9,1.1,1.1,2.2,0.080,-2.4,-0.6,-3.0,-0.3


In [69]:
# TEAM STANDINGS CLEANING
# Reads the raw team standings table for SEASON, gives the abbreviated split
# columns descriptive names, splits every "W-L" record column into separate
# <name>W / <name>L columns, and writes the result to the season's clean/
# directory.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Team_Standings.csv')

# Show every column when the frame is displayed (notebook-wide display option)
pd.set_option('display.max_columns', None)

# [Rk] drop -- unnecessary
# Use the keyword form: passing the axis positionally (`df.drop('Rk', 1)`) was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# Conference columns renaming
df = df.rename(columns={'E': 'East', 'W': 'West'})

# Division columns renaming
df = df.rename(columns={
    'A': 'Atlantic', 'C': 'Central',
    'SE': 'Southeast', 'NW': 'Northwest',
    'P': 'Pacific', 'SW': 'Southwest',
})

# All-Star columns renaming
df = df.rename(columns={'Pre': 'PreAllStar', 'Post': 'PostAllStar'})

# Margin columns renaming
df = df.rename(columns={'≤3': '≤3Margin', '≥10': '≥10Margin'})

# Month columns renaming
# None needed

# Separate W-L columns into 2 columns: W and L
# Every column except [Team] is treated as a "wins-losses" string; split it on
# the first dash and store the two halves as <column>W and <column>L.
WinLossColumns = df.columns.drop('Team')
for col in WinLossColumns:
    df[[f'{col}W', f'{col}L']] = df[col].str.split('-', n=1, expand=True)
# the combined W-L columns are no longer needed once they have been split
df = df.drop(columns=WinLossColumns)

# Sort by team name, stylistic choice
df = df.sort_values('Team')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Team_Standings.csv', index=False)

df

Unnamed: 0,Team,OverallW,OverallL,HomeW,HomeL,RoadW,RoadL,EastW,EastL,WestW,WestL,AtlanticW,AtlanticL,CentralW,CentralL,SoutheastW,SoutheastL,NorthwestW,NorthwestL,PacificW,PacificL,SouthwestW,SouthwestL,PreAllStarW,PreAllStarL,PostAllStarW,PostAllStarL,≤3MarginW,≤3MarginL,≥10MarginW,≥10MarginL,OctW,OctL,NovW,NovL,DecW,DecL,JanW,JanL,FebW,FebL,MarW,MarL,AprW,AprL
15,Atlanta Hawks,43,39,27,14,16,25,26,26,17,13,6,12,11,7,9,7,5,5,6,4,6,4,28,30,15,9,7,3,24,18,3,3,8,7,5,9,8,7,5,5,11,6,3,2
5,Boston Celtics,51,31,28,13,23,18,33,19,18,12,9,7,12,6,12,6,7,3,5,5,6,4,34,26,17,5,3,9,35,9,2,4,9,6,6,9,10,6,9,2,11,3,4,1
13,Brooklyn Nets,44,38,20,21,24,17,31,21,13,17,10,6,12,6,9,9,4,6,3,7,6,4,31,28,13,10,8,4,20,20,4,3,11,3,8,4,6,10,3,10,8,7,4,1
16,Charlotte Hornets,43,39,22,19,21,20,27,25,16,14,8,10,11,7,8,8,6,4,3,7,7,3,29,31,14,8,6,8,24,23,5,2,8,8,6,7,9,6,2,10,10,4,3,2
11,Chicago Bulls,46,36,27,14,19,22,29,23,17,13,8,10,10,6,11,7,8,2,5,5,4,6,38,21,8,15,4,4,23,20,5,1,9,7,9,2,8,8,8,5,6,9,1,4
14,Cleveland Cavaliers,44,38,25,16,19,22,27,25,17,13,8,10,10,6,9,9,8,2,4,6,5,5,35,23,9,15,10,4,22,19,3,4,8,6,9,6,11,4,5,5,6,10,2,3
4,Dallas Mavericks,52,30,29,12,23,18,16,14,36,16,6,4,6,4,4,6,12,6,10,8,14,2,35,24,17,6,7,5,27,14,4,2,6,7,7,9,12,4,7,3,12,4,4,1
9,Denver Nuggets,48,34,23,18,25,16,19,11,29,23,6,4,5,5,8,2,6,10,13,5,10,8,33,25,15,9,8,3,23,20,4,2,6,8,7,6,11,5,8,4,10,6,2,3
27,Detroit Pistons,23,59,13,28,10,31,18,34,5,25,5,13,6,10,7,11,2,8,1,9,2,8,13,45,10,14,7,6,6,33,1,5,3,12,1,11,7,9,3,9,6,10,2,3
2,Golden State Warriors,53,29,31,10,22,19,20,10,33,19,6,4,8,2,6,4,12,6,12,4,9,9,42,17,11,12,6,5,34,10,5,1,13,2,9,4,11,6,5,5,5,11,5,0
