In [1]:
# Season label interpolated into the raw-input and clean-output CSV paths used by the cleaning cells below.
SEASON = '2013-14'

In [2]:
# PER GAME CLEANING

import pandas as pd
import myconstants as c

# Load the raw per-game player table for SEASON, clean it, and write the result to the clean/ folder.
df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_PerGame.csv')

# Add column [PlayerID] -- whatever c.EVERYTHING_AFTER_BACKSLASH captures from the raw [Player] value
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)

# [Rk] drop -- unnecessary
# NOTE: axis must be passed by keyword; the positional form df.drop('Rk', 1) raises TypeError on pandas >= 2.0
df = df.drop('Rk', axis=1)

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them.
# Rows are grouped per player by [PlayerID] rather than by display name: two different players can
# share a name, and grouping by name would merge their teams and drop one of them.
# If a row has no extracted ID, fall back to the cleaned name so that row is still grouped sensibly.
df['_PlayerKey'] = df['PlayerID'].fillna(df['Player'])
df['Tms'] = df.groupby('_PlayerKey')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('_PlayerKey', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(['Tm', '_PlayerKey'], axis=1) # [Tm] and the temporary grouping key are no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_PerGame.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
456,A.J. Price,priceaj01,MIN,SG,27,28,0,3.5,0.7,1.6,...,0.000,0.0,0.3,0.4,0.5,0.0,0.0,0.3,0.2,1.6
84,Aaron Brooks,brookaa01,"HOU,DEN",PG,29,72,12,21.6,3.2,8.1,...,0.874,0.6,1.3,1.9,3.2,0.7,0.2,1.6,2.0,9.0
234,Aaron Gray,grayaa01,"TOR,SAC",C,29,37,6,9.6,0.7,1.6,...,0.550,1.1,1.9,3.0,0.6,0.3,0.2,0.8,1.7,1.8
534,Adonis Thomas,thomaad01,"ORL,PHI",SF,20,6,1,6.2,1.0,2.3,...,1.000,0.0,0.5,0.5,0.5,0.0,0.0,0.2,1.0,2.3
255,Al Harrington,harrial01,WAS,PF,33,34,0,15.0,2.4,6.1,...,0.771,0.4,1.9,2.4,0.8,0.4,0.0,1.0,2.1,6.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,Wilson Chandler,chandwi01,DEN,SF,26,62,55,31.1,5.0,11.9,...,0.724,0.9,3.8,4.7,1.8,0.7,0.5,1.3,3.1,13.6
270,Xavier Henry,henryxa01,LAL,SF,22,43,5,21.1,3.3,8.0,...,0.655,0.6,2.1,2.7,1.2,1.0,0.2,1.3,1.8,10.0
463,Zach Randolph,randoza01,MEM,PF,32,79,79,34.2,7.1,15.2,...,0.742,3.4,6.7,10.1,2.5,0.7,0.3,2.3,2.7,17.4
438,Zaza Pachulia,pachuza01,MIL,C,29,53,43,25.0,2.8,6.6,...,0.846,2.7,3.6,6.3,2.6,0.8,0.3,1.7,2.3,7.7


In [3]:
# ADVANCED PLAYER CLEANING

import pandas as pd
import myconstants as c

# Load the raw advanced player table for SEASON, clean it, and write the result to the clean/ folder.
df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_Advanced.csv')

# Add column [PlayerID] -- whatever c.EVERYTHING_AFTER_BACKSLASH captures from the raw [Player] value
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)

# [Rk] drop -- unnecessary
# NOTE: axis must be passed by keyword; the positional form df.drop('Rk', 1) raises TypeError on pandas >= 2.0
df = df.drop('Rk', axis=1)

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them.
# Rows are grouped per player by [PlayerID] rather than by display name: two different players can
# share a name, and grouping by name would merge their teams and drop one of them.
# If a row has no extracted ID, fall back to the cleaned name so that row is still grouped sensibly.
df['_PlayerKey'] = df['PlayerID'].fillna(df['Player'])
df['Tms'] = df.groupby('_PlayerKey')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('_PlayerKey', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(['Tm', '_PlayerKey'], axis=1) # [Tm] and the temporary grouping key are no longer needed

# remove unnamed columns (columns whose header starts with "Unnamed")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_Advanced.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,MP,PER,TS%,3PAr,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
456,A.J. Price,priceaj01,MIN,SG,27,28,99,9.7,0.469,0.478,...,13.0,23.2,-0.1,0.0,0.0,-0.012,-2.4,-2.4,-4.8,-0.1
84,Aaron Brooks,brookaa01,"HOU,DEN",PG,29,72,1557,12.5,0.518,0.427,...,15.8,20.5,1.3,0.9,2.2,0.067,-0.2,-0.8,-1.0,0.4
234,Aaron Gray,grayaa01,"TOR,SAC",C,29,37,355,7.8,0.466,0.016,...,30.8,12.5,-0.3,0.4,0.1,0.016,-4.9,0.2,-4.7,-0.2
534,Adonis Thomas,thomaad01,"ORL,PHI",SF,20,6,37,7.3,0.485,0.357,...,6.5,18.5,0.0,0.0,0.0,0.001,-4.2,-2.0,-6.2,0.0
255,Al Harrington,harrial01,WAS,PF,33,34,511,9.7,0.506,0.483,...,13.3,22.7,-0.2,0.5,0.4,0.035,-3.2,-1.0,-4.2,-0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,Wilson Chandler,chandwi01,DEN,SF,26,62,1927,12.4,0.526,0.476,...,8.9,19.5,1.8,1.2,3.1,0.076,-0.1,-0.6,-0.8,0.6
270,Xavier Henry,henryxa01,LAL,SF,22,43,908,12.3,0.511,0.235,...,11.9,23.1,-0.1,0.5,0.3,0.017,-2.4,-0.5,-2.8,-0.2
463,Zach Randolph,randoza01,MEM,PF,32,79,2705,18.3,0.510,0.017,...,12.0,26.1,2.9,3.5,6.4,0.113,0.8,-0.9,-0.1,1.3
438,Zaza Pachulia,pachuza01,MIL,C,29,53,1325,14.0,0.502,0.003,...,18.5,17.0,1.0,0.7,1.7,0.060,-0.9,-0.9,-1.8,0.1


In [4]:
# TEAM STANDINGS CLEANING

import pandas as pd
import myconstants as c

# Load the raw team standings table for SEASON, clean it, and write the result to the clean/ folder.
df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Team_Standings.csv')

# Show every column when the frame is displayed (this table becomes very wide after the W/L split)
pd.set_option('display.max_columns', None)

# [Rk] drop -- unnecessary
# NOTE: axis must be passed by keyword; the positional form df.drop('Rk', 1) raises TypeError on pandas >= 2.0
df = df.drop('Rk', axis=1)

# Expand abbreviated column headers into descriptive names.
# Keys that are not present in the frame are ignored by rename.
df = df.rename(columns={
    # Conference columns
    'E': 'East', 'W': 'West',
    # Division columns
    'A': 'Atlantic', 'C': 'Central',
    'SE': 'Southeast', 'NW': 'Northwest',
    'P': 'Pacific', 'SW': 'Southwest',
    # All-Star columns
    'Pre': 'PreAllStar', 'Post': 'PostAllStar',
    # Margin columns
    '≤3': '≤3Margin', '≥10': '≥10Margin',
})

# Month columns renaming
# None needed

# Separate W-L columns into 2 columns: W and L
# (assumes every column other than [Team] holds "wins-losses" strings -- verify against the raw file)
WinLossColumns = df.columns.drop('Team')
for col in WinLossColumns:
    # split on the first dash only: left part -> <col>W, right part -> <col>L
    df[[f'{col}W', f'{col}L']] = df[col].str.split('-', n=1, expand=True)
df = df.drop(WinLossColumns, axis=1)  # the combined W-L columns are replaced by the split ones

# Sort by team name, stylistic choice
df = df.sort_values('Team')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Team_Standings.csv', index=False)

df

Unnamed: 0,Team,OverallW,OverallL,HomeW,HomeL,RoadW,RoadL,EastW,EastL,WestW,WestL,AtlanticW,AtlanticL,CentralW,CentralL,SoutheastW,SoutheastL,NorthwestW,NorthwestL,PacificW,PacificL,SouthwestW,SouthwestL,PreAllStarW,PreAllStarL,PostAllStarW,PostAllStarL,≤3MarginW,≤3MarginL,≥10MarginW,≥10MarginL,OctW,OctL,NovW,NovL,DecW,DecL,JanW,JanL,FebW,FebL,MarW,MarL,AprW,AprL
17,Atlanta Hawks,38,44,24,17,14,27,28,24,10,20,10,8,10,8,8,8,4,6,4,6,2,8,25,26,13,18,8,9,15,16,0,1,9,8,9,5,6,7,2,10,6,10,6,3
25,Boston Celtics,25,57,16,25,9,32,21,31,4,26,5,11,6,12,10,8,3,7,1,9,0,10,19,35,6,22,5,9,9,23,0,1,7,11,6,6,2,15,5,6,3,12,2,6
13,Brooklyn Nets,44,38,28,13,16,25,26,26,18,12,9,7,7,11,10,8,5,5,6,4,7,3,24,27,20,11,8,3,17,20,0,1,5,11,5,9,10,3,7,5,12,4,5,5
15,Charlotte Bobcats,43,39,25,16,18,23,30,22,13,17,11,7,13,5,6,10,4,6,6,4,3,7,23,30,20,9,8,8,18,19,0,1,8,8,6,9,7,9,6,4,9,7,7,1
10,Chicago Bulls,48,34,27,14,21,20,35,17,13,17,12,6,11,5,12,6,3,7,6,4,4,6,27,25,21,9,6,5,27,19,1,1,6,7,5,10,11,4,9,4,10,6,6,2
21,Cleveland Cavaliers,33,49,19,22,14,27,21,31,12,18,8,10,7,9,6,12,6,4,5,5,1,9,20,33,13,16,7,5,13,27,1,0,4,12,5,9,6,9,8,6,6,9,3,4
9,Dallas Mavericks,49,33,26,15,23,18,20,10,29,23,5,5,9,1,6,4,10,8,10,8,9,7,32,22,17,11,11,11,23,10,1,0,9,8,8,5,9,8,9,3,8,6,5,3
19,Denver Nuggets,36,46,22,19,14,27,16,14,20,32,5,5,5,5,6,4,5,11,8,10,7,11,24,27,12,19,8,5,16,24,0,1,9,5,5,10,8,7,3,9,7,10,4,4
22,Detroit Pistons,29,53,17,24,12,29,23,29,6,24,11,7,6,10,6,12,2,8,3,7,1,9,22,30,7,23,2,6,16,25,1,0,5,10,8,9,4,8,5,8,4,12,2,6
7,Golden State Warriors,51,31,27,14,24,17,20,10,31,21,7,3,7,3,6,4,11,7,11,5,9,9,31,22,20,9,11,8,28,9,1,1,8,7,11,5,9,6,7,4,9,5,6,3
