In [9]:
# Season label shared by the cleaning cells below; it is interpolated into both the
# raw input paths and the clean output paths under ../data/years/{SEASON}/.
SEASON = '2018-19'

In [10]:
# PER GAME CLEANING

import pandas as pd
import myconstants as c

# Load the raw per-game player table for the configured season.
df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_PerGame.csv')

# Add column [PlayerID] -- the part of [Player] matched by c.EVERYTHING_AFTER_BACKSLASH
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)

# [Rk] drop -- unnecessary
# NOTE: use the keyword form; passing the axis positionally (`df.drop('Rk', 1)`) was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them.
# Rows are matched per player on [PlayerID] rather than on the display name, so two different
# players who happen to share a name are neither merged nor dropped. If any ID failed to
# extract, fall back to [Player] so rows with a missing ID are not collapsed together.
dedupe_key = 'PlayerID' if df['PlayerID'].notna().all() else 'Player'
df['Tms'] = df.groupby(dedupe_key)['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates(dedupe_key, keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(columns='Tm') # [Tm] no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_PerGame.csv', index=False)

# display the cleaned table as the cell output
df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
249,Aaron Gordon,gordoaa01,ORL,PF,23,78,78,33.8,6.0,13.4,0.449,1.6,4.4,0.349,4.5,9.0,0.499,0.507,2.4,3.2,0.731,1.7,5.7,7.4,3.7,0.7,0.7,2.1,2.2,16.0
300,Aaron Holiday,holidaa01,IND,PG,22,50,0,12.9,2.1,5.2,0.401,0.9,2.5,0.339,1.2,2.7,0.459,0.483,0.8,1.0,0.820,0.1,1.2,1.3,1.7,0.4,0.3,0.8,1.4,5.9
497,Abdel Nader,naderab01,OKC,SF,25,61,1,11.4,1.5,3.5,0.423,0.5,1.6,0.320,1.0,1.9,0.513,0.498,0.4,0.6,0.750,0.2,1.7,1.9,0.3,0.3,0.2,0.4,1.1,4.0
311,Al Horford,horfoal01,BOS,C,32,68,68,29.0,5.7,10.6,0.535,1.1,3.0,0.360,4.6,7.6,0.604,0.586,1.1,1.4,0.821,1.8,5.0,6.7,4.2,0.9,1.3,1.5,1.9,13.6
12,Al-Farouq Aminu,aminual01,POR,PF,28,81,81,28.3,3.2,7.3,0.433,1.2,3.5,0.343,2.0,3.9,0.514,0.514,1.9,2.1,0.867,1.4,6.1,7.5,1.3,0.8,0.4,0.9,1.8,9.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523,Zaza Pachulia,pachuza01,DET,C,34,68,3,12.9,1.3,2.8,0.440,0.0,0.1,0.000,1.3,2.8,0.450,0.440,1.4,1.8,0.782,1.5,2.4,3.9,1.3,0.5,0.3,0.8,2.2,3.9
615,Zhaire Smith,smithzh01,PHI,SG,19,6,2,18.5,2.3,5.7,0.412,1.0,2.7,0.375,1.3,3.0,0.444,0.500,1.0,1.3,0.750,0.5,1.7,2.2,1.7,0.3,0.3,1.0,1.3,6.7
552,Zhou Qi,qizh01,HOU,PF,23,1,0,1.0,1.0,1.0,1.000,0.0,0.0,,1.0,1.0,1.000,1.000,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
0,Álex Abrines,abrinal01,OKC,SG,25,31,2,19.0,1.8,5.1,0.357,1.3,4.1,0.323,0.5,1.0,0.500,0.487,0.4,0.4,0.923,0.2,1.4,1.5,0.6,0.5,0.2,0.5,1.7,5.3


In [11]:
# ADVANCED PLAYER CLEANING

import pandas as pd
import myconstants as c

# Load the raw advanced-stats player table for the configured season.
df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_Advanced.csv')

# Add column [PlayerID] -- the part of [Player] matched by c.EVERYTHING_AFTER_BACKSLASH
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)

# [Rk] drop -- unnecessary
# NOTE: use the keyword form; passing the axis positionally (`df.drop('Rk', 1)`) was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them.
# Rows are matched per player on [PlayerID] rather than on the display name, so two different
# players who happen to share a name are neither merged nor dropped. If any ID failed to
# extract, fall back to [Player] so rows with a missing ID are not collapsed together.
dedupe_key = 'PlayerID' if df['PlayerID'].notna().all() else 'Player'
df['Tms'] = df.groupby(dedupe_key)['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates(dedupe_key, keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(columns='Tm') # [Tm] no longer needed

# remove unnamed columns (any column whose header starts with "Unnamed")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_Advanced.csv', index=False)

# display the cleaned table as the cell output
df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
249,Aaron Gordon,gordoaa01,ORL,PF,23,78,2633,15.1,0.538,0.332,0.242,5.2,18.4,11.7,16.6,1.1,1.8,12.3,21.8,1.8,3.3,5.1,0.093,0.3,0.3,0.6,1.7
300,Aaron Holiday,holidaa01,IND,PG,22,50,646,11.9,0.518,0.485,0.191,0.9,10.4,5.8,19.3,1.6,1.8,12.3,21.9,0.1,0.8,0.9,0.065,-1.7,0.1,-1.6,0.1
497,Abdel Nader,naderab01,OKC,SF,25,61,694,8.8,0.522,0.465,0.167,2.0,15.7,8.6,3.8,1.3,1.5,10.1,15.1,0.0,0.9,0.9,0.062,-3.6,0.3,-3.2,-0.2
311,Al Horford,horfoal01,BOS,C,32,68,1973,20.2,0.605,0.281,0.131,6.5,18.3,12.4,21.2,1.4,3.9,11.8,18.9,4.5,2.9,7.5,0.181,3.3,1.9,5.1,3.6
12,Al-Farouq Aminu,aminual01,POR,PF,28,81,2292,13.2,0.568,0.472,0.292,5.3,22.6,14.2,6.0,1.4,1.2,9.7,13.7,3.0,2.8,5.8,0.121,0.1,0.6,0.7,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523,Zaza Pachulia,pachuza01,DET,C,34,68,878,13.7,0.539,0.021,0.642,11.9,21.5,16.5,14.7,1.7,1.6,18.7,15.0,1.1,1.3,2.3,0.128,-2.8,1.6,-1.2,0.2
615,Zhaire Smith,smithzh01,PHI,SG,19,6,111,9.5,0.533,0.471,0.235,2.9,9.3,6.2,12.3,0.9,1.4,13.8,16.4,0.0,0.1,0.1,0.048,-2.8,-2.2,-5.0,-0.1
552,Zhou Qi,qizh01,HOU,PF,23,1,1,80.4,1.000,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.4,0.0,0.0,0.0,1.261,40.1,11.9,52.0,0.0
0,Álex Abrines,abrinal01,OKC,SG,25,31,588,6.3,0.507,0.809,0.083,0.9,7.8,4.2,4.3,1.3,0.9,7.9,12.2,0.1,0.6,0.6,0.053,-3.7,0.4,-3.3,-0.2


In [12]:
# TEAM STANDINGS CLEANING

import pandas as pd
import myconstants as c

# Load the raw team standings table for the configured season.
df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Team_Standings.csv')

# Show every column when the frame is displayed (the cleaned table is wide).
pd.set_option('display.max_columns', None)

# [Rk] drop -- unnecessary
# NOTE: use the keyword form; passing the axis positionally (`df.drop('Rk', 1)`) was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# Expand the abbreviated headers in a single pass. Labels missing from the frame are
# simply ignored by rename, and no new name collides with another key in the mapping.
df = df.rename(columns={
    # Conference columns
    'E': 'East', 'W': 'West',
    # Division columns
    'A': 'Atlantic', 'C': 'Central',
    'SE': 'Southeast', 'NW': 'Northwest',
    'P': 'Pacific', 'SW': 'Southwest',
    # All-Star columns
    'Pre': 'PreAllStar', 'Post': 'PostAllStar',
    # Margin columns
    '≤3': '≤3Margin', '≥10': '≥10Margin',
})

# Month columns renaming
# None needed

# Separate W-L columns into 2 columns: W and L
# Every column except [Team] is treated as a "wins-losses" string; it is split on the first
# dash into <name>W and <name>L (both remain strings), then the combined columns are removed.
WinLossColumns = df.columns.drop('Team')
for col in WinLossColumns:
    df[[f'{col}W', f'{col}L']] = df[col].str.split('-', n=1, expand=True)
df = df.drop(columns=WinLossColumns)

# Sort by team name, stylistic choice
df = df.sort_values('Team')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Team_Standings.csv', index=False)

# display the cleaned table as the cell output
df

Unnamed: 0,Team,OverallW,OverallL,HomeW,HomeL,RoadW,RoadL,EastW,EastL,WestW,WestL,AtlanticW,AtlanticL,CentralW,CentralL,SoutheastW,SoutheastL,NorthwestW,NorthwestL,PacificW,PacificL,SouthwestW,SouthwestL,PreAllStarW,PreAllStarL,PostAllStarW,PostAllStarL,≤3MarginW,≤3MarginL,≥10MarginW,≥10MarginL,OctW,OctL,NovW,NovL,DecW,DecL,JanW,JanL,FebW,FebL,MarW,MarL,AprW,AprL
25,Atlanta Hawks,29,53,17,24,12,29,16,36,13,17,4,14,6,12,6,10,5,5,4,6,4,6,19,39,10,14,9,6,7,31,2,5,3,13,6,7,5,9,5,7,7,8,1,4
8,Boston Celtics,49,33,28,13,21,20,35,17,14,16,10,6,13,5,12,6,4,6,5,5,5,5,37,21,12,12,5,6,24,12,5,2,7,8,9,5,11,4,5,6,8,7,4,1
13,Brooklyn Nets,42,40,23,18,19,22,29,23,13,17,8,8,10,8,11,7,2,8,6,4,5,5,30,29,12,11,12,8,16,22,3,5,5,10,9,6,11,4,4,6,7,7,3,2
16,Charlotte Hornets,39,43,25,16,14,27,29,23,10,20,8,10,11,7,10,6,2,8,3,7,5,5,27,30,12,13,6,10,21,20,4,4,7,7,7,7,6,8,4,7,7,8,4,2
26,Chicago Bulls,22,60,9,32,13,28,16,36,6,24,4,14,3,13,9,9,1,9,2,8,3,7,14,44,8,16,8,8,9,31,2,6,3,12,5,9,2,13,5,5,4,11,1,4
27,Cleveland Cavaliers,19,63,13,28,6,35,15,37,4,26,6,12,4,12,5,13,0,10,2,8,2,8,12,46,7,17,5,4,6,43,1,6,3,11,4,12,3,12,4,6,4,11,0,5
21,Dallas Mavericks,33,49,24,17,9,32,15,15,18,34,4,6,6,4,5,5,8,10,6,12,4,12,26,31,7,18,7,7,12,22,2,6,8,4,7,9,6,9,4,6,3,12,3,3
3,Denver Nuggets,54,28,34,7,20,21,20,10,34,18,7,3,6,4,7,3,12,4,12,6,10,8,39,18,15,10,13,3,23,11,6,1,9,6,8,4,12,4,7,4,9,6,3,3
15,Detroit Pistons,41,41,26,15,15,26,27,25,14,16,10,8,8,8,9,9,4,6,5,5,5,5,26,30,15,11,6,9,16,21,4,3,8,4,4,11,6,10,7,3,10,6,2,4
2,Golden State Warriors,57,25,30,11,27,14,22,8,35,17,6,4,8,2,8,2,12,6,13,3,10,8,41,16,16,9,7,7,34,10,8,1,7,7,10,5,11,2,7,4,9,5,5,1
