In [13]:
# Season identifier; the cleaning cells below interpolate it into both the
# raw input and clean output paths (../data/years/{SEASON}/...).
SEASON = '2015-16'

In [14]:
# PER GAME CLEANING
#
# Reads the raw per-game player CSV for SEASON, cleans the [Player], [Pos] and
# [Tm] columns, keeps a single row per player (the TOT row when one exists),
# and writes the result to the season's clean/ directory.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_PerGame.csv')

# Add column [PlayerID] -- captured from the part of [Player] matched by
# c.EVERYTHING_AFTER_BACKSLASH
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove player ID first, then the backslash itself
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace(r'\\', '', regex=True)

# [Rk] drop -- unnecessary
# axis is passed by keyword: the positional form drop('Rk', 1) was deprecated
# in pandas 1.3 and raises a TypeError on pandas 2.x.
df = df.drop('Rk', axis=1)

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
# NOTE(review): grouping and de-duplicating on [Player] assumes display names are
# unique within a season; two different players sharing a name would be merged
# into one row. Keying on [PlayerID] may be safer -- confirm it is always populated.
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join)  # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
# (c.ALPHABETICALLY_LAST_STRING is presumably a value that sorts after every team code -- see myconstants)
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and its adjoining comma from [Tms] (leading/middle "TOT," first, then a trailing ",TOT")
df['Tms'] = df['Tms'].replace(r'TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(r',TOT$', '', regex=True)
df = df.drop('Tm', axis=1)  # [Tm] no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_PerGame.csv', index=False)

# notebook display of the cleaned frame
df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
65,Aaron Brooks,brookaa01,CHI,PG,31,69,0,16.1,2.7,6.8,0.401,1.0,2.7,0.357,1.8,4.1,0.430,0.471,0.7,0.9,0.766,0.3,1.2,1.5,2.6,0.4,0.1,1.2,1.9,7.1
182,Aaron Gordon,gordoaa01,ORL,PF,20,78,37,23.9,3.5,7.4,0.473,0.5,1.8,0.296,3.0,5.6,0.531,0.509,1.7,2.5,0.668,2.0,4.5,6.5,1.6,0.8,0.7,0.8,2.0,9.2
216,Aaron Harrison,harriaa01,CHO,SG,21,21,0,4.4,0.2,0.9,0.263,0.1,0.5,0.300,0.1,0.4,0.222,0.342,0.2,0.6,0.417,0.2,0.5,0.7,0.1,0.3,0.0,0.2,0.5,0.9
422,Adreian Payne,paynead01,MIN,PF,24,52,2,9.3,1.0,2.8,0.366,0.2,0.6,0.281,0.8,2.2,0.389,0.397,0.3,0.5,0.654,0.4,1.8,2.1,0.6,0.3,0.2,0.7,1.5,2.5
246,Al Horford,horfoal01,ATL,C,29,82,82,32.1,6.5,12.8,0.505,1.1,3.1,0.344,5.4,9.7,0.557,0.547,1.3,1.6,0.798,1.8,5.5,7.3,3.2,0.8,1.5,1.3,2.0,15.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391,Xavier Munford,munfoxa02,MEM,SG,23,14,0,17.4,2.3,5.5,0.416,0.6,1.6,0.391,1.6,3.9,0.426,0.474,0.5,1.0,0.500,0.2,2.0,2.2,1.6,0.9,0.2,1.1,1.8,5.7
311,Zach LaVine,lavinza01,MIN,SG,20,82,33,28.0,5.3,11.7,0.452,1.5,3.9,0.389,3.8,7.8,0.482,0.516,2.0,2.5,0.793,0.3,2.5,2.8,3.1,0.8,0.2,1.9,2.4,14.0
443,Zach Randolph,randoza01,MEM,PF,34,68,53,29.6,6.3,13.3,0.475,0.1,0.4,0.231,6.3,13.0,0.482,0.479,2.5,3.2,0.796,2.6,5.1,7.8,2.1,0.6,0.2,1.5,2.1,15.3
414,Zaza Pachulia,pachuza01,DAL,C,31,76,69,26.4,2.9,6.2,0.466,0.0,0.0,0.000,2.9,6.1,0.469,0.466,2.8,3.6,0.768,3.3,6.2,9.4,1.7,0.8,0.3,1.6,2.4,8.6


In [15]:
# ADVANCED PLAYER CLEANING
#
# Reads the raw advanced player CSV for SEASON, cleans the [Player], [Pos] and
# [Tm] columns, keeps a single row per player (the TOT row when one exists),
# drops any "Unnamed" columns, and writes the result to the season's clean/
# directory.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_Advanced.csv')

# Add column [PlayerID] -- captured from the part of [Player] matched by
# c.EVERYTHING_AFTER_BACKSLASH
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove player ID first, then the backslash itself
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace(r'\\', '', regex=True)

# [Rk] drop -- unnecessary
# axis is passed by keyword: the positional form drop('Rk', 1) was deprecated
# in pandas 1.3 and raises a TypeError on pandas 2.x.
df = df.drop('Rk', axis=1)

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
# NOTE(review): grouping and de-duplicating on [Player] assumes display names are
# unique within a season; two different players sharing a name would be merged
# into one row. Keying on [PlayerID] may be safer -- confirm it is always populated.
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join)  # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
# (c.ALPHABETICALLY_LAST_STRING is presumably a value that sorts after every team code -- see myconstants)
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and its adjoining comma from [Tms] (leading/middle "TOT," first, then a trailing ",TOT")
df['Tms'] = df['Tms'].replace(r'TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(r',TOT$', '', regex=True)
df = df.drop('Tm', axis=1)  # [Tm] no longer needed

# remove unnamed columns (any column whose name starts with "Unnamed")
df = df.loc[:, ~df.columns.str.contains(r'^Unnamed')]

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_Advanced.csv', index=False)

# notebook display of the cleaned frame
df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
65,Aaron Brooks,brookaa01,CHI,PG,31,69,1108,11.8,0.494,0.394,0.136,2.0,7.5,4.8,26.0,1.4,0.7,14.2,22.9,0.2,0.7,0.9,0.040,-1.3,-1.3,-2.6,-0.2
182,Aaron Gordon,gordoaa01,ORL,PF,20,78,1863,17.0,0.541,0.245,0.333,9.0,21.3,15.1,10.3,1.6,2.4,9.0,17.3,3.2,2.2,5.4,0.139,0.9,0.1,1.0,1.4
216,Aaron Harrison,harriaa01,CHO,SG,21,21,93,4.3,0.371,0.526,0.632,4.7,13.1,8.8,3.0,3.2,0.0,14.1,13.7,-0.2,0.1,0.0,-0.014,-7.2,0.9,-6.3,-0.1
422,Adreian Payne,paynead01,MIN,PF,24,52,486,5.6,0.422,0.221,0.179,4.8,21.5,13.3,8.9,1.7,1.8,18.7,17.7,-0.9,0.4,-0.5,-0.047,-6.1,-0.2,-6.2,-0.5
246,Al Horford,horfoal01,ATL,C,29,82,2631,19.4,0.565,0.244,0.123,6.3,18.2,12.4,16.7,1.3,3.6,8.8,20.6,4.9,4.5,9.4,0.172,2.2,1.6,3.8,3.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391,Xavier Munford,munfoxa02,MEM,SG,23,14,244,9.9,0.481,0.299,0.182,1.3,13.7,7.2,14.3,2.7,1.1,15.3,18.0,-0.2,0.2,0.1,0.017,-3.1,0.6,-2.5,0.0
311,Zach LaVine,lavinza01,MIN,SG,20,82,2294,14.3,0.548,0.330,0.212,1.4,10.1,5.8,18.6,1.5,0.6,12.9,23.5,1.9,0.7,2.6,0.054,0.2,-1.6,-1.5,0.3
443,Zach Randolph,randoza01,MEM,PF,34,68,2016,18.3,0.519,0.029,0.238,9.7,20.7,15.0,12.9,1.1,0.6,9.5,24.6,2.8,1.5,4.3,0.103,1.4,-1.7,-0.3,0.9
414,Zaza Pachulia,pachuza01,DAL,C,31,76,2004,16.2,0.550,0.006,0.587,13.7,25.7,19.7,9.7,1.6,0.9,16.8,16.2,3.4,2.7,6.0,0.145,-0.3,0.1,-0.2,0.9


In [16]:
# TEAM STANDINGS CLEANING
#
# Reads the raw team standings CSV for SEASON, gives the abbreviated columns
# descriptive names, splits every "W-L" record column into separate <name>W and
# <name>L columns, and writes the result to the season's clean/ directory.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Team_Standings.csv')

# show every column when the frame is displayed at the end of the cell
pd.set_option('display.max_columns', None)

# [Rk] drop -- unnecessary
# axis is passed by keyword: the positional form drop('Rk', 1) was deprecated
# in pandas 1.3 and raises a TypeError on pandas 2.x.
df = df.drop('Rk', axis=1)

# Conference columns renaming
df = df.rename(columns={'E': 'East', 'W': 'West'})

# Division columns renaming
df = df.rename(columns={
    'A': 'Atlantic', 'C': 'Central',
    'SE': 'Southeast', 'NW': 'Northwest',
    'P': 'Pacific', 'SW': 'Southwest',
})

# All-Star columns renaming
df = df.rename(columns={'Pre': 'PreAllStar', 'Post': 'PostAllStar'})

# Margin columns renaming
df = df.rename(columns={'≤3': '≤3Margin', '≥10': '≥10Margin'})

# Month columns renaming
# None needed

# Separate W-L columns into 2 columns: W and L
# Every column except [Team] is treated as a "W-L" string; the split values are
# left as strings (no numeric conversion is applied here).
WinLossColumns = df.columns.drop('Team')
for col in WinLossColumns:
    df[[f'{col}W', f'{col}L']] = df[col].str.split('-', n=1, expand=True)
df = df.drop(WinLossColumns, axis=1)  # originals replaced by the W/L pairs

# Sort by team name, stylistic choice
df = df.sort_values('Team')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Team_Standings.csv', index=False)

# notebook display of the cleaned frame
df

Unnamed: 0,Team,OverallW,OverallL,HomeW,HomeL,RoadW,RoadL,EastW,EastL,WestW,WestL,AtlanticW,AtlanticL,CentralW,CentralL,SoutheastW,SoutheastL,NorthwestW,NorthwestL,PacificW,PacificL,SouthwestW,SouthwestL,PreAllStarW,PreAllStarL,PostAllStarW,PostAllStarL,≤3MarginW,≤3MarginL,≥10MarginW,≥10MarginL,OctW,OctL,NovW,NovL,DecW,DecL,JanW,JanL,FebW,FebL,MarW,MarL,AprW,AprL
6,Atlanta Hawks,48,34,27,14,21,20,29,23,19,11,11,7,10,8,8,8,6,4,5,5,8,2,31,24,17,10,5,7,29,17,2,1,10,7,9,5,6,9,6,5,12,4,3,3
7,Boston Celtics,48,34,28,13,20,21,31,21,17,13,10,6,9,9,12,6,6,4,7,3,4,6,32,23,16,11,5,7,26,13,1,1,9,7,8,6,9,8,9,3,7,7,5,2
27,Brooklyn Nets,21,61,14,27,7,34,12,40,9,21,6,10,4,14,2,16,4,6,3,7,2,8,14,40,7,21,7,4,6,34,0,3,4,10,5,10,3,13,5,7,4,11,0,7
8,Charlotte Hornets,48,34,30,11,18,23,33,19,15,15,13,5,12,6,8,8,4,6,5,5,6,4,27,26,21,8,6,5,25,17,0,2,10,5,7,7,6,11,7,3,13,3,5,3
15,Chicago Bulls,42,40,26,15,16,25,25,27,17,13,12,6,10,6,3,15,5,5,6,4,6,4,27,25,15,15,10,4,14,19,2,1,8,4,8,7,8,8,4,8,8,9,4,3
2,Cleveland Cavaliers,57,25,33,8,24,17,35,17,22,8,14,4,8,8,13,5,8,2,8,2,6,4,38,14,19,11,4,7,32,8,2,1,11,3,8,5,13,3,8,5,11,5,4,3
13,Dallas Mavericks,42,40,23,18,19,22,15,15,27,25,8,2,4,6,3,7,11,7,9,9,7,9,29,26,13,14,6,11,19,18,1,1,9,7,9,5,9,9,4,6,5,10,5,2
20,Denver Nuggets,33,49,18,23,15,26,15,15,18,34,6,4,5,5,4,6,4,12,7,11,7,11,22,32,11,17,10,5,11,21,1,1,5,11,6,9,6,9,5,7,9,8,1,4
11,Detroit Pistons,44,38,26,15,18,23,29,23,15,15,11,7,10,6,8,10,7,3,5,5,3,7,27,27,17,11,6,5,21,20,3,0,6,9,9,6,7,8,6,6,9,6,4,3
0,Golden State Warriors,73,9,39,2,34,7,27,3,46,6,9,1,8,2,10,0,15,3,15,1,16,2,48,4,25,5,7,2,44,5,3,0,16,0,11,2,14,2,9,1,15,2,5,2
