In [5]:
# Season label in 'YYYY-YY' form. The cleaning cells interpolate it into both the
# directory and the file-name prefix of the raw inputs and clean outputs under
# ../data/years/{SEASON}/.
SEASON = '2017-18'

In [6]:
# PER GAME CLEANING

import pandas as pd
import myconstants as c

# Clean the raw per-game player table for SEASON: split the player ID out of [Player],
# collapse traded players to their TOT row, and write the result to the clean folder.
df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_PerGame.csv')

# Add column [PlayerID] -- must run before [Player] is cleaned, because the ID is taken
# from the part of [Player] matched by c.EVERYTHING_AFTER_BACKSLASH (defined in myconstants;
# presumably a regex for the text that follows the backslash -- confirm there).
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove the player ID, then the backslash itself
# ('\\\\' is the regex for one literal backslash).
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)

# [Rk] drop -- unnecessary.
# `axis` is passed by keyword: DataFrame.drop accepts only `labels` positionally on
# pandas 2.0+, where the former `drop('Rk', 1)` raises TypeError.
df = df.drop('Rk', axis=1)

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them.
# NOTE(review): rows are grouped and de-duplicated by display name, so two different players
# who share a name would be merged into one row; keying on [PlayerID] would avoid that --
# confirm against the data before changing.
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join)  # every [Tm] value of the player, file order
# To keep the TOT row when dropping duplicates, relabel TOT with a value that sorts after
# every team code (relies on c.ALPHABETICALLY_LAST_STRING doing so), sort by [Tm], and keep
# the last row per player.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# Remove TOT and its separating comma from [Tms], whether TOT is followed by another
# team or is the final entry.
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop('Tm', axis=1)  # [Tm] no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# Write clean data to storage (the row index is not written).
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_PerGame.csv', index=False)

# Last expression of the cell: renders the cleaned frame in the notebook.
df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
79,Aaron Brooks,brookaa01,MIN,PG,33,32,1,5.9,0.9,2.2,0.406,0.3,1.0,0.355,0.5,1.2,0.447,0.486,0.3,0.3,0.727,0.2,0.3,0.5,0.6,0.2,0.0,0.3,0.9,2.3
216,Aaron Gordon,gordoaa01,ORL,PF,22,58,57,32.9,6.5,14.9,0.434,2.0,5.9,0.336,4.5,9.0,0.497,0.500,2.7,3.9,0.698,1.5,6.4,7.9,2.3,1.0,0.8,1.8,1.9,17.6
244,Aaron Harrison,harriaa01,DAL,SG,23,9,3,25.9,2.1,7.7,0.275,1.0,4.8,0.209,1.1,2.9,0.385,0.341,1.4,1.9,0.765,0.4,2.2,2.7,1.2,1.0,0.2,0.3,3.0,6.7
297,Aaron Jackson,jacksaa01,HOU,PG,31,1,0,35.0,3.0,9.0,0.333,1.0,4.0,0.250,2.0,5.0,0.400,0.389,1.0,2.0,0.500,2.0,1.0,3.0,1.0,0.0,0.0,1.0,4.0,8.0
447,Abdel Nader,naderab01,BOS,SF,24,48,1,10.9,1.0,3.1,0.336,0.5,1.4,0.354,0.6,1.8,0.321,0.413,0.5,0.8,0.590,0.3,1.2,1.5,0.5,0.3,0.2,0.7,0.9,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
518,Zach Randolph,randoza01,SAC,PF,36,59,57,25.6,6.1,12.9,0.473,0.9,2.5,0.347,5.3,10.4,0.503,0.507,1.4,1.8,0.785,1.6,5.1,6.7,2.2,0.7,0.2,2.0,2.0,14.5
477,Zaza Pachulia,pachuza01,GSW,C,33,69,57,14.1,2.2,3.8,0.564,0.0,0.0,0.000,2.2,3.8,0.567,0.564,1.1,1.3,0.806,1.3,3.4,4.7,1.6,0.6,0.2,1.0,1.8,5.4
514,Zhou Qi,qizh01,HOU,C,22,18,0,6.9,0.3,1.8,0.188,0.1,1.1,0.105,0.2,0.7,0.308,0.219,0.4,0.7,0.667,0.3,0.9,1.2,0.1,0.1,0.8,0.6,0.8,1.2
0,Álex Abrines,abrinal01,OKC,SG,24,75,8,15.1,1.5,3.9,0.395,1.1,2.9,0.380,0.4,0.9,0.443,0.540,0.5,0.6,0.848,0.3,1.2,1.5,0.4,0.5,0.1,0.3,1.7,4.7


In [7]:
# ADVANCED PLAYER CLEANING

import pandas as pd
import myconstants as c

# Clean the raw advanced-stats player table for SEASON: split the player ID out of [Player],
# collapse traded players to their TOT row, drop unnamed columns, and write the result to
# the clean folder.
df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_Advanced.csv')

# Add column [PlayerID] -- must run before [Player] is cleaned, because the ID is taken
# from the part of [Player] matched by c.EVERYTHING_AFTER_BACKSLASH (defined in myconstants;
# presumably a regex for the text that follows the backslash -- confirm there).
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove the player ID, then the backslash itself
# ('\\\\' is the regex for one literal backslash).
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)

# [Rk] drop -- unnecessary.
# `axis` is passed by keyword: DataFrame.drop accepts only `labels` positionally on
# pandas 2.0+, where the former `drop('Rk', 1)` raises TypeError.
df = df.drop('Rk', axis=1)

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them.
# NOTE(review): rows are grouped and de-duplicated by display name, so two different players
# who share a name would be merged into one row; keying on [PlayerID] would avoid that --
# confirm against the data before changing.
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join)  # every [Tm] value of the player, file order
# To keep the TOT row when dropping duplicates, relabel TOT with a value that sorts after
# every team code (relies on c.ALPHABETICALLY_LAST_STRING doing so), sort by [Tm], and keep
# the last row per player.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# Remove TOT and its separating comma from [Tms], whether TOT is followed by another
# team or is the final entry.
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop('Tm', axis=1)  # [Tm] no longer needed

# Remove unnamed columns -- keep only columns whose header does not start with "Unnamed".
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# Write clean data to storage (the row index is not written).
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_Advanced.csv', index=False)

# Last expression of the cell: renders the cleaned frame in the notebook.
df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
79,Aaron Brooks,brookaa01,MIN,PG,33,32,189,9.8,0.508,0.449,0.159,4.2,6.2,5.2,15.1,1.6,0.0,13.0,19.9,0.1,0.1,0.1,0.033,-2.6,-1.3,-3.8,-0.1
216,Aaron Gordon,gordoaa01,ORL,PF,22,58,1909,16.5,0.530,0.395,0.260,5.0,21.5,13.2,11.7,1.5,1.9,10.0,24.7,0.9,2.0,2.9,0.072,0.9,-0.6,0.3,1.1
244,Aaron Harrison,harriaa01,DAL,SG,23,9,233,5.1,0.392,0.623,0.246,1.8,9.9,5.7,6.7,1.9,0.8,3.8,15.5,-0.3,0.2,-0.1,-0.014,-6.8,-0.6,-7.5,-0.3
297,Aaron Jackson,jacksaa01,HOU,PG,31,1,35,2.4,0.405,0.444,0.222,6.5,3.2,4.8,4.0,0.0,0.0,9.2,13.7,0.0,0.0,0.0,-0.017,-6.6,-2.3,-8.9,-0.1
447,Abdel Nader,naderab01,BOS,SF,24,48,522,5.1,0.439,0.436,0.262,2.8,12.0,7.4,7.1,1.4,1.6,17.0,17.1,-0.9,0.8,-0.1,-0.014,-5.5,0.2,-5.3,-0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
518,Zach Randolph,randoza01,SAC,PF,36,59,1508,17.3,0.529,0.193,0.140,7.0,23.7,15.0,14.9,1.4,0.6,12.5,27.6,0.3,1.2,1.5,0.048,0.5,-1.5,-1.0,0.4
477,Zaza Pachulia,pachuza01,GSW,C,33,69,972,17.0,0.612,0.004,0.352,11.0,25.0,18.5,15.2,1.9,1.4,19.1,17.0,1.8,1.4,3.3,0.161,-0.7,0.7,0.0,0.5
514,Zhou Qi,qizh01,HOU,C,22,18,124,1.9,0.295,0.594,0.375,5.5,14.4,10.0,2.1,0.8,9.6,21.2,16.8,-0.5,0.2,-0.3,-0.104,-10.3,0.2,-10.1,-0.3
0,Álex Abrines,abrinal01,OKC,SG,24,75,1134,9.0,0.567,0.759,0.158,2.5,8.9,5.6,3.4,1.7,0.6,7.4,12.7,1.3,1.0,2.2,0.094,-1.9,0.4,-1.5,0.1


In [8]:
# TEAM STANDINGS CLEANING

import pandas as pd
import myconstants as c

# Clean the raw team standings table for SEASON: give the abbreviated columns descriptive
# names, split every "wins-losses" column into separate W and L columns, and write the
# result to the clean folder.
df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Team_Standings.csv')

# Notebook display setting: show every column when a frame is rendered.
pd.set_option('display.max_columns', None)

# [Rk] drop -- unnecessary.
# `axis` is passed by keyword: DataFrame.drop accepts only `labels` positionally on
# pandas 2.0+, where the former `drop('Rk', 1)` raises TypeError.
df = df.drop('Rk', axis=1)

# Column renaming, done in one pass (the keys are distinct and no new name is itself a key,
# so this is equivalent to renaming group by group). Month columns need no renaming.
df = df.rename(columns={
    # Conference columns
    'E': 'East', 'W': 'West',
    # Division columns
    'A': 'Atlantic', 'C': 'Central',
    'SE': 'Southeast', 'NW': 'Northwest',
    'P': 'Pacific', 'SW': 'Southwest',
    # All-Star columns
    'Pre': 'PreAllStar', 'Post': 'PostAllStar',
    # Margin columns
    '≤3': '≤3Margin', '≥10': '≥10Margin',
})

# Separate W-L columns into 2 columns: W and L.
# Every column except [Team] is treated as a "wins-losses" string and split on its first
# dash; the resulting <col>W / <col>L values stay strings (no numeric conversion here).
WinLossColumns = df.columns.drop('Team')
for col in WinLossColumns:
    df[[f'{col}W', f'{col}L']] = df[col].str.split('-', n=1, expand=True)
df = df.drop(WinLossColumns, axis=1)  # the combined columns are no longer needed

# Sort by team name, stylistic choice
df = df.sort_values('Team')

# Write clean data to storage (the row index is not written).
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Team_Standings.csv', index=False)

# Last expression of the cell: renders the cleaned frame in the notebook.
df

Unnamed: 0,Team,OverallW,OverallL,HomeW,HomeL,RoadW,RoadL,EastW,EastL,WestW,WestL,AtlanticW,AtlanticL,CentralW,CentralL,SoutheastW,SoutheastL,NorthwestW,NorthwestL,PacificW,PacificL,SouthwestW,SouthwestL,PreAllStarW,PreAllStarL,PostAllStarW,PostAllStarL,≤3MarginW,≤3MarginL,≥10MarginW,≥10MarginL,OctW,OctL,NovW,NovL,DecW,DecL,JanW,JanL,FebW,FebL,MarW,MarL,AprW,AprL
26,Atlanta Hawks,24,58,16,25,8,33,12,40,12,18,4,14,3,15,5,11,5,5,2,8,5,5,18,41,6,17,6,8,8,35,1,6,3,11,6,9,5,10,4,7,2,12,3,3
3,Boston Celtics,55,27,27,14,28,13,33,19,22,8,12,4,10,8,11,7,9,1,7,3,6,4,40,19,15,8,11,8,25,9,5,2,14,2,11,6,7,5,7,4,9,4,2,4
22,Brooklyn Nets,28,54,15,26,13,28,19,33,9,21,1,15,6,12,12,6,4,6,1,9,4,6,19,40,9,14,5,7,9,27,3,5,5,8,5,10,6,10,1,9,5,9,3,3
19,Charlotte Hornets,36,46,21,20,15,26,22,30,14,16,4,14,7,11,11,5,4,6,7,3,3,7,24,33,12,13,1,5,23,23,4,3,4,9,5,11,8,6,7,5,6,9,2,3
23,Chicago Bulls,27,55,17,24,10,31,21,31,6,24,5,13,4,12,12,6,2,8,0,10,4,6,20,37,7,18,9,5,6,30,1,4,2,13,10,6,5,10,2,8,5,10,2,4
5,Cleveland Cavaliers,50,32,29,12,21,20,35,17,15,15,11,7,11,5,13,5,5,5,5,5,5,5,34,22,16,10,8,5,17,20,3,4,12,3,9,5,6,8,6,4,10,6,4,2
27,Dallas Mavericks,24,58,15,26,9,32,10,20,14,38,2,8,4,6,4,6,5,13,4,14,5,11,18,40,6,18,2,9,11,20,1,7,4,10,8,8,3,11,3,7,4,10,1,5
13,Denver Nuggets,46,36,31,10,15,26,18,12,28,24,4,6,8,2,6,4,9,7,10,8,9,9,32,26,14,10,10,5,21,15,3,4,9,5,7,8,7,8,7,3,8,7,5,1
18,Detroit Pistons,39,43,25,16,14,27,24,28,15,15,7,11,9,7,8,10,4,6,7,3,4,6,28,29,11,14,4,7,19,23,5,3,9,3,6,9,3,11,6,6,7,8,3,3
2,Golden State Warriors,58,24,29,12,29,12,24,6,34,18,9,1,6,4,9,1,8,10,13,3,13,5,44,14,14,10,5,1,38,13,5,3,11,3,13,2,11,3,8,3,7,7,3,3
