In [9]:
# Season label; the cleaning cells interpolate it into the raw/clean CSV paths.
SEASON = "2006-07"

In [10]:
# PER GAME CLEANING
# Reads the raw per-game player table for SEASON, tidies the [Player], [Pos] and [Tm]
# columns, collapses players with several team rows down to a single row, and writes
# the result to the season's clean folder.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_PerGame.csv')

# Add column [PlayerID] -- the part of the raw [Player] value matched by c.EVERYTHING_AFTER_BACKSLASH
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
# Use the keyword form: passing the axis positionally (df.drop('Rk', 1)) was deprecated
# in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them.
# Players are identified by [PlayerID] rather than by display name, so two different players
# who share a name are not merged into one row. If no ID could be extracted for a row,
# fall back to the cleaned name so that row is still grouped sensibly.
df['_PlayerKey'] = df['PlayerID'].fillna(df['Player'])
df['Tms'] = df.groupby('_PlayerKey')['Tm'].transform(','.join) # has all teams a player was on
# To keep the TOT row of each player: relabel TOT with a string that is meant to sort after every
# team code, sort by [Tm], then keep only the last row per player.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('_PlayerKey', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(columns=['Tm', '_PlayerKey']) # [Tm] and the temporary key are no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice; [PlayerID] breaks ties between players sharing a name
df = df.sort_values(['Player', 'PlayerID'])

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_PerGame.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
308,Aaron McKie,mckieaa01,LAL,SF,34,10,0,13.1,1.1,1.7,0.647,0.0,0.0,,1.1,1.7,0.647,0.647,0.0,0.0,,0.1,1.7,1.8,1.3,0.4,0.0,0.8,1.4,2.2
494,Aaron Williams,williaa01,LAC,PF,35,38,7,9.8,0.8,1.4,0.547,0.0,0.0,0.000,0.8,1.4,0.558,0.547,0.5,0.6,0.818,0.7,1.5,2.2,0.2,0.2,0.4,0.5,1.5,2.0
327,Adam Morrison,morriad01,CHA,PF,22,78,23,29.8,4.6,12.1,0.376,1.1,3.3,0.337,3.4,8.8,0.391,0.422,1.5,2.2,0.710,0.7,2.3,2.9,2.1,0.4,0.1,1.7,2.3,11.8
147,Adonal Foyle,foylead01,GSW,C,31,48,6,9.9,1.0,1.8,0.565,0.0,0.0,,1.0,1.8,0.565,0.565,0.2,0.5,0.440,1.1,1.6,2.6,0.4,0.2,1.0,0.5,1.3,2.2
174,Adrian Griffin,griffad01,CHI,SF,32,54,1,10.8,1.1,2.4,0.473,0.0,0.1,0.000,1.1,2.3,0.496,0.473,0.3,0.4,0.789,0.7,1.3,2.0,1.1,0.6,0.1,0.7,1.4,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,Yaroslav Korolev,korolya01,LAC,SF,19,10,0,4.1,0.4,1.6,0.250,0.1,0.5,0.200,0.3,1.1,0.273,0.281,0.3,0.6,0.500,0.1,0.2,0.3,0.4,0.3,0.0,0.1,0.0,1.2
388,Zach Randolph,randoza01,POR,PF,25,68,67,35.7,8.8,18.9,0.467,0.2,0.7,0.292,8.6,18.2,0.473,0.472,5.8,7.1,0.819,2.9,7.2,10.1,2.2,0.8,0.2,3.2,2.7,23.6
355,Zaza Pachulia,pachuza01,ATL,C,22,72,47,28.1,4.0,8.5,0.474,0.0,0.0,0.000,4.0,8.5,0.475,0.474,4.1,5.2,0.786,2.8,4.2,6.9,1.5,1.1,0.5,2.3,3.7,12.2
216,Zydrunas Ilgauskas,ilgauzy01,CLE,C,31,78,78,27.3,4.9,10.2,0.485,0.0,0.0,0.000,4.9,10.2,0.486,0.485,2.0,2.5,0.807,3.1,4.6,7.7,1.6,0.6,1.3,1.8,3.3,11.9


In [11]:
# ADVANCED PLAYER CLEANING
# Reads the raw advanced-stats player table for SEASON, tidies the [Player], [Pos] and [Tm]
# columns, collapses players with several team rows down to a single row, removes the
# unnamed columns, and writes the result to the season's clean folder.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_Advanced.csv')

# Add column [PlayerID] -- the part of the raw [Player] value matched by c.EVERYTHING_AFTER_BACKSLASH
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
# Use the keyword form: passing the axis positionally (df.drop('Rk', 1)) was deprecated
# in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them.
# Players are identified by [PlayerID] rather than by display name, so two different players
# who share a name are not merged into one row. If no ID could be extracted for a row,
# fall back to the cleaned name so that row is still grouped sensibly.
df['_PlayerKey'] = df['PlayerID'].fillna(df['Player'])
df['Tms'] = df.groupby('_PlayerKey')['Tm'].transform(','.join) # has all teams a player was on
# To keep the TOT row of each player: relabel TOT with a string that is meant to sort after every
# team code, sort by [Tm], then keep only the last row per player.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('_PlayerKey', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(columns=['Tm', '_PlayerKey']) # [Tm] and the temporary key are no longer needed

# remove unnamed columns (any column whose header starts with "Unnamed")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice; [PlayerID] breaks ties between players sharing a name
df = df.sort_values(['Player', 'PlayerID'])

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_Advanced.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
308,Aaron McKie,mckieaa01,LAL,SF,34,10,131,7.5,0.647,0.000,0.000,0.9,15.1,8.0,14.4,1.6,0.0,32.0,8.6,0.0,0.1,0.1,0.039,-4.0,-0.1,-4.2,-0.1
494,Aaron Williams,williaa01,LAC,PF,35,38,374,9.4,0.606,0.019,0.415,8.7,18.4,13.6,2.9,1.0,2.9,22.3,10.0,0.2,0.4,0.7,0.089,-3.3,0.6,-2.7,-0.1
327,Adam Morrison,morriad01,CHA,PF,22,78,2326,7.9,0.450,0.273,0.179,2.7,9.2,5.9,11.9,0.6,0.2,11.3,22.4,-2.2,0.8,-1.5,-0.030,-2.7,-2.4,-5.1,-1.8
147,Adonal Foyle,foylead01,GSW,C,31,48,475,13.7,0.557,0.000,0.294,11.5,17.7,14.6,5.5,1.1,8.0,19.3,10.7,0.4,0.6,1.0,0.104,-2.3,1.6,-0.7,0.2
174,Adrian Griffin,griffad01,CHI,SF,32,54,585,10.7,0.499,0.047,0.147,7.9,13.4,10.6,15.2,2.8,0.5,21.7,13.3,0.1,1.2,1.3,0.103,-3.3,3.0,-0.3,0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,Yaroslav Korolev,korolya01,LAC,SF,19,10,41,8.0,0.322,0.313,0.375,2.9,5.8,4.4,15.4,3.9,0.0,5.1,22.1,-0.1,0.0,0.0,-0.057,-3.6,-0.4,-4.1,0.0
388,Zach Randolph,randoza01,POR,PF,25,68,2425,22.8,0.537,0.037,0.374,10.0,25.6,17.6,12.8,1.2,0.5,12.6,33.1,3.8,1.9,5.7,0.112,3.4,-1.8,1.6,2.2
355,Zaza Pachulia,pachuza01,ATL,C,22,72,2026,16.8,0.562,0.002,0.607,11.7,18.1,14.9,9.2,2.0,1.4,17.2,21.3,2.7,2.1,4.8,0.114,-0.4,-0.4,-0.8,0.6
216,Zydrunas Ilgauskas,ilgauzy01,CLE,C,31,78,2130,18.0,0.527,0.001,0.242,12.9,20.0,16.4,10.2,1.2,3.6,13.8,21.7,2.6,4.0,6.6,0.148,-0.5,0.9,0.4,1.3


In [12]:
# TEAM STANDINGS CLEANING
# Reads the raw team standings table for SEASON, gives the abbreviated split columns
# descriptive names, splits every "W-L" record column into separate W and L columns,
# and writes the result to the season's clean folder.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Team_Standings.csv')

# Show every column when the frame is displayed (this is a notebook-wide display setting)
pd.set_option('display.max_columns', None)

# [Rk] drop -- unnecessary
# Use the keyword form: passing the axis positionally (df.drop('Rk', 1)) was deprecated
# in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# Column renaming, done in a single pass; labels missing from the frame are ignored by rename
df = df.rename(columns={
    # Conference columns
    'E': 'East', 'W': 'West',
    # Division columns
    'A': 'Atlantic', 'C': 'Central',
    'SE': 'Southeast', 'NW': 'Northwest',
    'P': 'Pacific', 'SW': 'Southwest',
    # All-Star columns
    'Pre': 'PreAllStar', 'Post': 'PostAllStar',
    # Margin columns
    '≤3': '≤3Margin', '≥10': '≥10Margin',
})

# Month columns renaming
# None needed

# Separate W-L columns into 2 columns: W and L
# Every column except [Team] is treated as a "wins-losses" record and split on the first dash.
WinLossColumns = df.columns.drop('Team')
for col in WinLossColumns:
    df[[f'{col}W', f'{col}L']] = df[col].str.split('-', n=1, expand=True)
df = df.drop(columns=WinLossColumns)

# Sort by team name, stylistic choice
df = df.sort_values('Team')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Team_Standings.csv', index=False)

df

Unnamed: 0,Team,OverallW,OverallL,HomeW,HomeL,RoadW,RoadL,EastW,EastL,WestW,WestL,AtlanticW,AtlanticL,CentralW,CentralL,SoutheastW,SoutheastL,NorthwestW,NorthwestL,PacificW,PacificL,SouthwestW,SouthwestL,PreAllStarW,PreAllStarL,PostAllStarW,PostAllStarL,≤3MarginW,≤3MarginL,≥10MarginW,≥10MarginL,OctW,OctL,NovW,NovL,DecW,DecL,JanW,JanL,FebW,FebL,MarW,MarL,AprW,AprL
26,Atlanta Hawks,30,52,18,23,12,29,17,35,13,17,8,10,4,14,5,11,4,6,6,4,3,7,21,31,9,21,7,7,11,30,,,6,7,3,13,8,7,5,9,5,10,3,6
28,Boston Celtics,24,58,12,29,12,29,16,36,8,22,7,9,4,14,5,13,5,5,0,10,3,7,13,38,11,20,4,12,7,20,,,5,9,5,10,2,14,3,9,7,8,2,8
20,Charlotte Bobcats,33,49,20,21,13,28,24,28,9,21,7,11,8,10,9,7,3,7,4,6,2,8,19,33,14,16,2,4,15,23,,,4,11,5,10,8,7,5,8,6,9,5,4
7,Chicago Bulls,49,33,31,10,18,23,36,16,13,17,11,7,12,4,13,5,5,5,3,7,5,5,29,25,20,8,9,9,28,15,1.0,0.0,4,9,14,3,7,8,7,7,10,4,6,2
6,Cleveland Cavaliers,50,32,30,11,20,21,31,21,19,11,12,6,10,6,9,9,5,5,8,2,6,4,31,22,19,10,9,7,25,11,,,9,6,8,6,9,7,7,5,11,5,6,3
0,Dallas Mavericks,67,15,36,5,31,10,27,3,40,12,10,0,8,2,9,1,15,3,11,7,14,2,44,9,23,6,12,3,36,9,,,11,4,13,3,14,2,10,0,13,2,6,4
9,Denver Nuggets,45,37,23,18,22,19,18,12,27,25,5,5,7,3,6,4,9,7,11,7,7,11,26,25,19,12,3,7,23,17,,,8,5,8,7,6,9,6,7,7,8,10,1
3,Detroit Pistons,53,29,26,15,27,14,36,16,17,13,15,3,9,7,12,6,6,4,6,4,5,5,32,19,21,10,6,10,24,7,,,10,5,8,6,8,7,10,1,10,7,7,3
11,Golden State Warriors,42,40,30,11,12,29,14,16,28,24,7,3,4,6,3,7,10,8,6,10,12,6,25,29,17,11,8,8,26,21,,,9,7,7,9,5,9,5,8,8,6,8,1
4,Houston Rockets,52,30,28,13,24,17,24,6,28,24,8,2,7,3,9,1,10,8,10,8,8,8,33,19,19,11,4,4,25,13,,,10,5,9,7,10,4,6,6,12,4,5,4
