In [1]:
# Season identifier; interpolated into the raw/clean CSV paths
# (../data/years/{SEASON}/...) used by the cleaning cells below.
SEASON = '2012-13'

In [2]:
# PER GAME CLEANING

import pandas as pd
import myconstants as c

# Load the raw per-game player table for the configured season.
df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_PerGame.csv')

# Add column [PlayerID] -- the part of [Player] captured by c.EVERYTHING_AFTER_BACKSLASH
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)

# [Rk] drop -- unnecessary
# Keyword form on purpose: passing the axis positionally (drop('Rk', 1)) was deprecated
# in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them.
# Rows are matched on the player ID rather than on the display name, so two different players
# who happen to share a name are not merged into a single row. Where no ID was extracted,
# fall back to the name (which is what the grouping used before).
df['_PlayerKey'] = df['PlayerID'].fillna(df['Player'])
df['Tms'] = df.groupby('_PlayerKey')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('_PlayerKey', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(['Tm', '_PlayerKey'], axis=1) # [Tm] and the temporary key are no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice ([PlayerID] only breaks ties between identical names,
# which keeps the row order deterministic)
df = df.sort_values(['Player', 'PlayerID'])

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_PerGame.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
421,A.J. Price,priceaj01,WAS,PG,26,57,22,22.4,2.8,7.2,...,0.790,0.4,1.6,2.0,3.6,0.6,0.1,1.1,1.3,7.7
72,Aaron Brooks,brookaa01,"SAC,HOU",PG,28,53,20,18.8,2.7,6.0,...,0.769,0.2,1.3,1.5,2.2,0.6,0.2,1.3,1.8,7.1
194,Aaron Gray,grayaa01,TOR,C,28,42,16,12.2,1.1,2.1,...,0.523,1.1,2.0,3.2,0.8,0.2,0.1,0.9,2.0,2.8
211,Al Harrington,harrial01,ORL,C,32,10,0,11.9,2.0,5.7,...,0.750,0.6,2.1,2.7,1.0,0.4,0.1,0.7,1.8,5.1
240,Al Horford,horfoal01,ATL,C,26,74,74,37.2,7.8,14.3,...,0.644,2.6,7.6,10.2,3.2,1.1,1.1,2.0,2.2,17.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,Wilson Chandler,chandwi01,DEN,PF,25,43,8,25.1,4.9,10.6,...,0.793,1.0,4.0,5.1,1.3,1.0,0.3,1.4,2.8,13.0
226,Xavier Henry,henryxa01,NOH,SF,21,50,2,12.5,1.3,3.2,...,0.630,0.4,1.4,1.8,0.3,0.3,0.1,0.6,1.4,3.9
432,Zach Randolph,randoza01,MEM,PF,31,76,75,34.3,6.2,13.5,...,0.750,4.1,7.2,11.2,1.4,0.8,0.4,2.0,2.4,15.4
396,Zaza Pachulia,pachuza01,ATL,C,28,52,15,21.8,2.2,4.6,...,0.757,2.5,3.9,6.5,1.5,0.7,0.2,1.3,2.5,5.9


In [3]:
# ADVANCED PLAYER CLEANING

import pandas as pd
import myconstants as c

# Load the raw advanced-stats player table for the configured season.
df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_Advanced.csv')

# Add column [PlayerID] -- the part of [Player] captured by c.EVERYTHING_AFTER_BACKSLASH
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)

# [Rk] drop -- unnecessary
# Keyword form on purpose: passing the axis positionally (drop('Rk', 1)) was deprecated
# in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them.
# Rows are matched on the player ID rather than on the display name, so two different players
# who happen to share a name are not merged into a single row. Where no ID was extracted,
# fall back to the name (which is what the grouping used before).
df['_PlayerKey'] = df['PlayerID'].fillna(df['Player'])
df['Tms'] = df.groupby('_PlayerKey')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('_PlayerKey', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(['Tm', '_PlayerKey'], axis=1) # [Tm] and the temporary key are no longer needed

# remove unnamed columns (columns whose header starts with "Unnamed")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice ([PlayerID] only breaks ties between identical names,
# which keeps the row order deterministic)
df = df.sort_values(['Player', 'PlayerID'])

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_Advanced.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,MP,PER,TS%,3PAr,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
421,A.J. Price,priceaj01,WAS,PG,26,57,1278,12.4,0.501,0.484,...,12.7,18.0,1.0,1.2,2.2,0.084,-0.2,-0.4,-0.6,0.5
72,Aaron Brooks,brookaa01,"SAC,HOU",PG,28,53,997,11.8,0.555,0.424,...,16.5,18.1,0.8,0.0,0.8,0.041,-0.8,-1.4,-2.2,0.0
194,Aaron Gray,grayaa01,TOR,C,28,42,513,9.0,0.544,0.000,...,24.8,13.1,0.2,0.4,0.5,0.050,-3.9,-0.4,-4.3,-0.3
211,Al Harrington,harrial01,ORL,C,32,10,119,10.3,0.434,0.526,...,10.6,25.1,-0.1,0.1,0.0,-0.012,-3.0,-1.5,-4.5,-0.1
240,Al Horford,horfoal01,ATL,C,26,74,2756,19.8,0.560,0.006,...,11.3,21.8,4.7,4.1,8.8,0.153,1.3,0.6,1.9,2.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,Wilson Chandler,chandwi01,DEN,PF,25,43,1079,16.6,0.556,0.277,...,10.6,22.7,1.8,1.3,3.1,0.137,0.7,-0.1,0.5,0.7
226,Xavier Henry,henryxa01,NOH,SF,21,50,625,7.6,0.481,0.068,...,13.3,17.3,-0.2,0.2,0.0,0.002,-4.6,-1.2,-5.7,-0.6
432,Zach Randolph,randoza01,MEM,PF,31,76,2607,17.9,0.506,0.022,...,11.5,23.1,3.2,4.7,7.9,0.145,1.0,-0.7,0.2,1.5
396,Zaza Pachulia,pachuza01,ATL,C,28,52,1134,13.1,0.534,0.012,...,19.6,14.6,1.2,1.5,2.7,0.116,-2.0,0.2,-1.7,0.1


In [4]:
# TEAM STANDINGS CLEANING

import pandas as pd
import myconstants as c

# Load the raw team standings table for the configured season.
df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Team_Standings.csv')

# Show every column when the frame is displayed (the cleaned table is wide).
pd.set_option('display.max_columns', None)

# [Rk] drop -- unnecessary
# Keyword form on purpose: passing the axis positionally (drop('Rk', 1)) was deprecated
# in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# Expand the abbreviated column headers. Done in a single rename: none of the new names
# is itself a key of the mapping, so this is equivalent to renaming group by group.
df = df.rename(columns={
    # Conference columns
    'E': 'East', 'W': 'West',
    # Division columns
    'A': 'Atlantic', 'C': 'Central',
    'SE': 'Southeast', 'NW': 'Northwest',
    'P': 'Pacific', 'SW': 'Southwest',
    # All-Star columns
    'Pre': 'PreAllStar', 'Post': 'PostAllStar',
    # Margin columns
    '≤3': '≤3Margin', '≥10': '≥10Margin',
})

# Month columns renaming
# None needed

# Separate W-L columns into 2 columns: W and L
# Every column except [Team] is split on its first '-' into <name>W and <name>L;
# the original combined columns are dropped afterwards.
WinLossColumns = df.columns.drop('Team')
for col in WinLossColumns:
    df[['{}W'.format(col), '{}L'.format(col)]] = df[col].str.split('-', n=1, expand=True)
df = df.drop(WinLossColumns, axis=1)

# Sort by team name, stylistic choice
df = df.sort_values('Team')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Team_Standings.csv', index=False)

df

Unnamed: 0,Team,OverallW,OverallL,HomeW,HomeL,RoadW,RoadL,EastW,EastL,WestW,WestL,AtlanticW,AtlanticL,CentralW,CentralL,SoutheastW,SoutheastL,NorthwestW,NorthwestL,PacificW,PacificL,SouthwestW,SouthwestL,PreAllStarW,PreAllStarL,PostAllStarW,PostAllStarL,≤3MarginW,≤3MarginL,≥10MarginW,≥10MarginL,OctW,OctL,NovW,NovL,DecW,DecL,JanW,JanL,FebW,FebL,MarW,MarL,AprW,AprL
13,Atlanta Hawks,44,38,25,16,19,22,29,23,15,15,7,11,11,7,11,5,6,4,5,5,4,6,29,22,15,16,5,5,19,20,,,9,5,10,5,7,9,7,4,8,10,3,5
15,Boston Celtics,41,40,27,13,14,27,27,24,14,16,7,9,8,9,12,6,6,4,6,4,2,8,28,24,13,16,8,7,18,23,0.0,1.0,9,6,5,9,8,7,8,4,8,8,3,5
7,Brooklyn Nets,49,33,26,15,23,18,36,16,13,17,11,5,13,5,12,6,5,5,5,5,3,7,31,22,18,11,9,4,23,17,,,11,4,5,11,11,4,7,5,8,7,7,2
28,Charlotte Bobcats,21,61,15,26,6,35,18,34,3,27,6,12,6,12,6,10,2,8,0,10,1,9,12,40,9,21,6,6,6,37,,,7,8,1,15,3,11,2,10,4,12,4,5
10,Chicago Bulls,45,37,24,17,21,20,34,18,11,19,13,5,9,7,12,6,4,6,5,5,2,8,30,22,15,15,11,7,16,16,1.0,0.0,6,7,9,6,12,4,5,8,7,7,5,5
27,Cleveland Cavaliers,24,58,14,27,10,31,18,34,6,24,5,13,3,13,10,8,3,7,2,8,1,9,16,37,8,21,6,11,7,27,1.0,0.0,3,12,3,13,6,8,7,5,2,12,2,8
16,Dallas Mavericks,41,41,24,17,17,24,17,13,24,28,5,5,6,4,6,4,7,11,10,8,7,9,23,29,18,12,5,8,17,19,1.0,1.0,6,8,5,10,7,8,6,5,11,5,5,4
3,Denver Nuggets,57,25,38,3,19,22,19,11,38,14,5,5,10,0,4,6,11,5,14,4,13,5,33,21,24,4,11,7,28,8,0.0,1.0,8,8,9,6,12,3,8,4,13,2,7,1
22,Detroit Pistons,29,53,18,23,11,30,25,27,4,26,6,12,8,8,11,7,1,9,2,8,1,9,21,33,8,20,6,10,15,29,0.0,1.0,5,11,6,10,6,7,6,8,1,13,5,3
9,Golden State Warriors,47,35,28,13,19,22,19,11,28,24,7,3,5,5,7,3,10,8,9,7,9,9,30,22,17,13,5,3,20,18,1.0,0.0,8,6,12,4,8,7,4,8,9,7,5,3
