In [25]:
SEASON = '2003-04'

In [26]:
# PER GAME CLEANING

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_PerGame.csv')

# Add column [PlayerID]
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
df = df.drop('Rk', 1)

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop('Tm', axis=1) # [Tm] no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_PerGame.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
336,Aaron McKie,mckieaa01,PHI,SF,31,75,41,28.2,3.5,7.7,0.459,1.0,2.3,0.436,2.5,5.4,0.469,0.524,1.1,1.5,0.757,0.6,2.8,3.4,2.6,1.1,0.3,1.4,1.9,9.2
559,Aaron Williams,williaa01,NJN,C,32,72,7,18.6,2.4,4.8,0.503,0.0,0.0,0.333,2.4,4.7,0.504,0.504,1.5,2.2,0.677,1.4,2.7,4.1,1.1,0.5,0.6,1.2,2.6,6.3
177,Adonal Foyle,foylead01,GSW,C,28,44,8,13.0,1.3,3.0,0.454,0.0,0.0,,1.3,3.0,0.454,0.454,0.4,0.8,0.543,1.2,2.6,3.8,0.4,0.1,1.0,0.5,1.5,3.1
205,Adrian Griffin,griffad01,HOU,SG,29,19,1,7.0,0.3,0.9,0.278,0.1,0.1,0.500,0.2,0.8,0.250,0.306,0.0,0.2,0.000,0.1,0.9,1.0,0.5,0.4,0.1,0.2,0.9,0.6
219,Al Harrington,harrial01,IND,PF,23,79,15,30.9,5.3,11.5,0.463,0.3,1.0,0.273,5.1,10.5,0.481,0.475,2.3,3.2,0.734,2.1,4.4,6.4,1.7,1.0,0.3,2.1,3.2,13.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212,Zendon Hamilton,hamilze01,PHI,C,28,46,0,10.3,1.1,2.1,0.537,0.0,0.0,,1.1,2.1,0.537,0.537,1.5,2.1,0.698,1.1,2.1,3.2,0.3,0.2,0.2,0.6,1.6,3.7
422,Zoran Planinić,planizo01,NJN,SG,21,49,1,9.7,1.1,2.6,0.411,0.2,0.7,0.281,0.9,2.0,0.454,0.446,0.8,1.2,0.633,0.3,0.8,1.1,1.4,0.3,0.1,0.7,1.4,3.1
247,Zydrunas Ilgauskas,ilgauzy01,CLE,C,28,81,81,31.3,5.8,11.9,0.483,0.0,0.1,0.286,5.7,11.8,0.484,0.484,3.7,5.0,0.746,3.4,4.6,8.1,1.3,0.5,2.5,2.0,3.4,15.3
92,Žarko Čabarkapa,cabarza01,PHO,PF,22,49,4,11.6,1.7,4.0,0.411,0.1,0.7,0.188,1.5,3.4,0.455,0.426,0.7,1.1,0.660,0.5,1.5,2.0,0.8,0.2,0.3,1.1,1.5,4.1


In [27]:
# ADVANCED PLAYER CLEANING

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_Advanced.csv')

# Add column [PlayerID]
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
df = df.drop('Rk', 1)

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop('Tm', axis=1) # [Tm] no longer needed

# remove unnamed columns
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_Advanced.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
336,Aaron McKie,mckieaa01,PHI,SF,31,75,2112,13.9,0.550,0.298,0.192,2.5,11.5,7.1,16.8,2.2,0.9,14.1,16.2,2.5,2.3,4.8,0.108,0.2,1.1,1.3,1.8
559,Aaron Williams,williaa01,NJN,C,32,72,1337,13.2,0.549,0.009,0.453,8.9,17.0,13.0,10.4,1.4,2.7,17.8,17.5,1.1,2.3,3.4,0.122,-2.4,0.8,-1.6,0.1
177,Adonal Foyle,foylead01,GSW,C,28,44,572,13.0,0.471,0.000,0.269,10.6,22.3,16.4,4.8,0.6,5.8,12.6,13.4,0.3,0.8,1.1,0.093,-2.3,1.2,-1.1,0.1
205,Adrian Griffin,griffad01,HOU,SG,29,19,133,4.5,0.278,0.111,0.222,0.9,15.2,8.4,11.6,2.9,1.2,13.2,8.2,-0.2,0.2,0.1,0.031,-6.3,4.4,-1.9,0.0
219,Al Harrington,harrial01,IND,PF,23,79,2441,14.9,0.514,0.085,0.277,7.8,16.9,12.3,10.3,1.8,0.7,13.8,22.9,1.8,4.1,5.9,0.116,-0.9,0.2,-0.7,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212,Zendon Hamilton,hamilze01,PHI,C,28,46,473,15.9,0.616,0.000,1.011,12.3,23.9,18.2,4.8,0.9,1.3,16.4,16.3,1.1,0.6,1.8,0.178,-1.0,-0.5,-1.5,0.1
422,Zoran Planinić,planizo01,NJN,SG,21,49,473,10.0,0.492,0.248,0.465,3.5,10.2,6.9,23.9,1.5,0.5,18.8,19.0,0.1,0.6,0.7,0.068,-3.3,-0.1,-3.4,-0.2
247,Zydrunas Ilgauskas,ilgauzy01,CLE,C,28,81,2539,20.2,0.541,0.007,0.421,12.1,16.3,14.2,7.8,0.8,5.6,12.5,23.1,5.0,2.8,7.8,0.147,1.1,-0.2,0.9,1.9
92,Žarko Čabarkapa,cabarza01,PHO,PF,22,49,570,8.4,0.461,0.162,0.269,5.2,14.7,9.9,11.6,0.9,1.7,19.7,21.7,-0.7,0.3,-0.4,-0.037,-4.2,-1.4,-5.7,-0.5


In [28]:
# TEAM STANDINGS CLEANING

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Team_Standings.csv')

pd.set_option('display.max_columns', None)

# [Rk] drop -- unnecessary
df = df.drop('Rk', 1)

# Conference columns renaming
df = df.rename(columns={'E': 'East','W': 'West'})

# Division columns renaming
df = df.rename(columns={'A': 'Atlantic', 'C': 'Central',
    'SE': 'Southeast', 'NW': 'Northwest',
    'P': 'Pacific', 'SW': 'Southwest'})

# All-Star columns renaming
df = df.rename(columns={'Pre': 'PreAllStar', 'Post': 'PostAllStar'})

# Margin columns renaming
df = df.rename(columns={'≤3': '≤3Margin', '≥10': '≥10Margin'})

# Month columns renaming
# None needed

# Separate W-L columns into 2 columns: W and L
WinLossColumns = df.columns.drop('Team')
for col in WinLossColumns:
    df[['{}W'.format(col), '{}L'.format(col)]] = df[col].str.split('-', n=1, expand=True)
df = df.drop(WinLossColumns, axis=1)

# Sort by team name, stylistic choice
df = df.sort_values('Team')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Team_Standings.csv', index=False)

df

Unnamed: 0,Team,OverallW,OverallL,HomeW,HomeL,RoadW,RoadL,EastW,EastL,WestW,WestL,AtlanticW,AtlanticL,CentralW,CentralL,MW,ML,PacificW,PacificL,PreAllStarW,PreAllStarL,PostAllStarW,PostAllStarL,≤3MarginW,≤3MarginL,≥10MarginW,≥10MarginL,OctW,OctL,NovW,NovL,DecW,DecL,JanW,JanL,FebW,FebL,MarW,MarL,AprW,AprL
24,Atlanta Hawks,28,54,18,23,10,31,19,35,9,19,9,17,10,18,6,8,3,11,18,35,10,19,2,9,12,26,0,2,6,10,3,12,6,9,4,7,5,11,4,3
19,Boston Celtics,36,46,19,22,17,24,24,30,12,16,14,10,10,20,7,7,5,9,23,31,13,15,10,9,11,18,2,0,5,9,9,8,6,10,3,9,9,5,2,5
27,Chicago Bulls,23,59,14,27,9,32,19,35,4,24,8,18,11,17,1,13,3,11,15,38,8,21,2,5,8,32,1,1,3,11,5,10,4,13,4,7,4,12,2,5
20,Cleveland Cavaliers,35,47,23,18,12,29,29,25,6,22,15,11,14,14,2,12,4,10,20,33,15,14,8,4,11,25,0,2,4,11,6,9,7,7,7,6,8,7,3,5
6,Dallas Mavericks,52,30,36,5,16,25,19,11,33,19,10,4,9,7,14,10,19,9,33,19,19,11,8,7,30,14,1,1,10,5,7,6,11,6,9,2,8,8,6,2
10,Denver Nuggets,43,39,29,12,14,27,14,16,29,23,7,7,7,9,11,13,18,10,31,23,12,16,3,9,23,19,1,1,9,5,9,7,9,8,5,7,6,9,4,2
5,Detroit Pistons,54,28,31,10,23,18,37,17,17,11,20,6,17,11,8,6,9,5,33,21,21,7,6,12,30,4,1,1,11,5,7,7,13,3,6,8,10,3,6,1
17,Golden State Warriors,37,45,27,14,10,31,14,16,23,29,6,8,8,8,11,17,12,12,23,28,14,17,7,7,16,21,0,1,8,6,6,9,6,9,5,8,8,8,4,4
9,Houston Rockets,45,37,27,14,18,23,24,6,21,31,11,3,13,3,8,16,13,15,30,22,15,15,8,6,21,13,1,0,8,7,8,7,9,6,8,5,8,6,3,6
0,Indiana Pacers,61,21,34,7,27,14,41,13,20,8,21,5,20,8,10,4,10,4,39,14,22,7,7,6,30,8,1,1,13,2,9,7,12,3,8,3,12,4,6,1
