In [6]:
# Season being processed. The cleaning cells below interpolate this value into
# their input/output paths (../data/years/{SEASON}/raw/... and .../clean/...)
# and into the CSV file names, so changing it retargets the whole notebook.
SEASON = '2000-01'

In [7]:
# set single team name for teams that have multiple codes
def use_unique_team_code(tm):
    """Return the canonical code for a team code.

    'CHH' is mapped to 'CHO'; every other value is handed back unchanged.
    """
    return 'CHO' if tm == 'CHH' else tm

In [8]:
# PER GAME CLEANING
#
# Reads the raw per-game player table for SEASON, derives [PlayerID], tidies
# [Player] and [Pos], collapses each player's rows into a single row carrying a
# combined [Tms] column, and writes the result to the season's clean folder.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_PerGame.csv')

# Add column [PlayerID] -- must be extracted before [Player] is stripped below.
# The pattern comes from myconstants (not visible in this notebook).
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
# axis is passed by keyword: the positional form drop('Rk', 1) was deprecated in
# pandas 1.3 and raises a TypeError on pandas >= 2.0.
df = df.drop('Rk', axis=1)

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
df['Tm'] = df['Tm'].apply(use_unique_team_code) # some teams have multiple team codes
# NOTE(review): rows are grouped and de-duplicated on the display name, so two
# different players sharing a name would be merged -- [PlayerID] may be a safer
# key; confirm before changing.
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and any commas from [Tms]
# ([Tms] was built before TOT was swapped for the sort sentinel, so it still contains the literal 'TOT')
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop('Tm', axis=1) # [Tm] no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_PerGame.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
181,A.C. Green,greenac01,MIA,PF,37,82,1,17.2,1.8,4.0,0.444,0.0,0.1,0.000,1.8,3.9,0.453,0.444,1.0,1.4,0.712,1.3,2.5,3.8,0.5,0.4,0.1,0.5,1.5,4.5
184,A.J. Guyton,guytoaj01,CHI,PG,22,33,8,19.1,2.4,5.8,0.406,0.8,2.1,0.391,1.5,3.7,0.415,0.477,0.5,0.5,0.833,0.3,0.8,1.1,1.9,0.3,0.2,0.7,1.1,6.0
309,Aaron McKie,mckieaa01,PHI,SG,28,76,33,31.5,4.4,9.4,0.473,0.7,2.2,0.312,3.8,7.2,0.524,0.511,2.0,2.6,0.768,0.4,3.7,4.1,5.0,1.4,0.1,2.7,2.3,11.6
515,Aaron Williams,williaa01,NJN,PF,29,82,25,28.5,3.6,7.9,0.457,0.0,0.0,0.000,3.6,7.9,0.458,0.457,3.0,3.8,0.787,2.6,4.6,7.2,1.1,0.7,1.4,1.6,3.9,10.2
247,Adam Keefe,keefead01,GSW,PF,30,67,13,12.5,1.0,2.4,0.403,0.0,0.0,0.333,0.9,2.3,0.404,0.406,0.6,0.9,0.619,1.3,1.8,3.1,0.5,0.4,0.3,0.6,1.5,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,Will Perdue,perduwi01,POR,C,35,13,0,4.5,0.5,0.7,0.667,0.0,0.0,,0.5,0.7,0.667,0.667,0.2,0.3,0.500,0.5,0.9,1.4,0.2,0.2,0.2,0.0,0.6,1.1
19,William Avery,averywi01,MIN,PG,21,55,0,8.4,1.0,2.6,0.382,0.3,1.1,0.271,0.7,1.5,0.459,0.438,0.5,0.7,0.778,0.1,0.4,0.5,1.4,0.2,0.1,0.8,0.9,2.8
187,Zendon Hamilton,hamilze01,LAC,C,25,3,0,6.3,0.7,3.0,0.222,0.0,0.0,,0.7,3.0,0.222,0.222,1.7,2.7,0.625,1.0,1.7,2.7,0.0,0.0,0.0,0.7,1.3,3.0
218,Zydrunas Ilgauskas,ilgauzy01,CLE,C,25,24,24,25.7,4.8,9.8,0.487,0.0,0.1,0.000,4.8,9.7,0.491,0.487,2.2,3.3,0.679,2.7,4.0,6.7,0.8,0.6,1.5,2.5,3.3,11.7


In [9]:
# ADVANCED PLAYER CLEANING
#
# Reads the raw advanced player table for SEASON, derives [PlayerID], tidies
# [Player] and [Pos], collapses each player's rows into a single row carrying a
# combined [Tms] column, drops the 'Unnamed' columns, and writes the result to
# the season's clean folder.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_Advanced.csv')

# Add column [PlayerID] -- must be extracted before [Player] is stripped below.
# The pattern comes from myconstants (not visible in this notebook).
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
# axis is passed by keyword: the positional form drop('Rk', 1) was deprecated in
# pandas 1.3 and raises a TypeError on pandas >= 2.0.
df = df.drop('Rk', axis=1)

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
df['Tm'] = df['Tm'].apply(use_unique_team_code) # some teams have multiple team codes
# NOTE(review): rows are grouped and de-duplicated on the display name, so two
# different players sharing a name would be merged -- [PlayerID] may be a safer
# key; confirm before changing.
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and any commas from [Tms]
# ([Tms] was built before TOT was swapped for the sort sentinel, so it still contains the literal 'TOT')
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop('Tm', axis=1) # [Tm] no longer needed

# remove unnamed columns (any column whose name starts with 'Unnamed')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_Advanced.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
181,A.C. Green,greenac01,MIA,PF,37,82,1411,11.2,0.492,0.019,0.343,9.0,17.3,13.2,4.8,1.2,0.4,10.8,14.4,1.1,2.1,3.2,0.110,-2.2,0.0,-2.2,-0.1
184,A.J. Guyton,guytoaj01,CHI,PG,22,33,630,10.3,0.495,0.359,0.094,1.8,5.1,3.4,18.1,0.8,0.6,10.7,16.5,0.4,-0.1,0.3,0.020,-0.8,-3.2,-4.0,-0.3
309,Aaron McKie,mckieaa01,PHI,SG,28,76,2394,15.1,0.549,0.238,0.272,1.6,12.9,7.3,26.7,2.3,0.2,20.3,18.9,2.3,3.5,5.9,0.118,0.1,1.3,1.4,2.1
515,Aaron Williams,williaa01,NJN,PF,29,82,2336,15.1,0.533,0.003,0.477,10.1,19.2,14.5,6.5,1.3,3.6,14.4,17.8,2.9,2.3,5.2,0.107,-1.3,-0.3,-1.6,0.2
247,Adam Keefe,keefead01,GSW,PF,30,67,836,9.7,0.450,0.019,0.396,10.5,16.4,13.3,6.5,1.7,1.7,17.6,11.5,0.1,0.5,0.6,0.032,-3.5,-0.7,-4.3,-0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,Will Perdue,perduwi01,POR,C,35,13,58,18.5,0.651,0.000,0.444,12.8,23.9,18.5,5.3,2.8,2.6,0.0,8.6,0.2,0.1,0.3,0.245,1.6,1.7,3.3,0.1
19,William Avery,averywi01,MIN,PG,21,55,463,9.0,0.482,0.410,0.250,1.2,6.0,3.6,24.0,1.5,0.6,22.0,20.0,-0.2,0.2,0.0,-0.003,-2.6,-2.0,-4.6,-0.3
187,Zendon Hamilton,hamilze01,LAC,C,25,3,19,5.7,0.359,0.000,0.889,18.6,29.3,24.1,0.0,0.0,0.0,13.8,35.3,-0.1,0.0,0.0,-0.119,-8.9,-6.1,-15.0,-0.1
218,Zydrunas Ilgauskas,ilgauzy01,CLE,C,25,24,616,16.0,0.524,0.009,0.333,12.3,17.8,15.0,5.4,1.3,4.4,18.3,24.1,0.1,0.7,0.7,0.056,-1.4,-0.4,-1.8,0.0


In [10]:
# TEAM STANDINGS CLEANING
#
# Reads the raw team standings table for SEASON, gives the abbreviated split
# columns descriptive names, splits every "W-L" record column into separate
# win and loss columns, and writes the result to the season's clean folder.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Team_Standings.csv')

pd.set_option('display.max_columns', None)

# [Rk] drop -- unnecessary
# axis is passed by keyword: the positional form drop('Rk', 1) was deprecated in
# pandas 1.3 and raises a TypeError on pandas >= 2.0.
df = df.drop('Rk', axis=1)

# Column renaming -- one mapping; labels that are absent from the frame are ignored by rename.
# NOTE(review): the displayed output for this season contains MW/ML, i.e. an 'M'
# column that this mapping leaves abbreviated (presumably the Midwest division).
# Renaming it would change the output column names, so confirm with downstream
# consumers before adding it.
df = df.rename(columns={
    # Conference columns
    'E': 'East', 'W': 'West',
    # Division columns
    'A': 'Atlantic', 'C': 'Central',
    'SE': 'Southeast', 'NW': 'Northwest',
    'P': 'Pacific', 'SW': 'Southwest',
    # All-Star columns
    'Pre': 'PreAllStar', 'Post': 'PostAllStar',
    # Margin columns
    '≤3': '≤3Margin', '≥10': '≥10Margin',
})

# Month columns renaming
# None needed

# Separate W-L columns into 2 columns: W and L
# Every column except [Team] is treated as a "W-L" string; n=1 splits on the first dash only.
WinLossColumns = df.columns.drop('Team')
for col in WinLossColumns:
    df[[f'{col}W', f'{col}L']] = df[col].str.split('-', n=1, expand=True)
# the original combined columns are no longer needed once split
df = df.drop(WinLossColumns, axis=1)

# Sort by team name, stylistic choice
df = df.sort_values('Team')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Team_Standings.csv', index=False)

df

Unnamed: 0,Team,OverallW,OverallL,HomeW,HomeL,RoadW,RoadL,EastW,EastL,WestW,WestL,AtlanticW,AtlanticL,CentralW,CentralL,MW,ML,PacificW,PacificL,PreAllStarW,PreAllStarL,PostAllStarW,PostAllStarL,≤3MarginW,≤3MarginL,≥10MarginW,≥10MarginL,OctW,OctL,NovW,NovL,DecW,DecL,JanW,JanL,FebW,FebL,MarW,MarL,AprW,AprL
24,Atlanta Hawks,25,57,18,23,7,34,16,38,9,19,7,19,9,19,2,12,7,7,16,33,9,24,5,4,10,25,0.0,1.0,3,12,7,7,6,8,2,12,5,11,2,6
19,Boston Celtics,36,46,20,21,16,25,24,30,12,16,11,13,13,17,5,9,7,7,22,27,14,19,10,7,11,20,,,7,8,5,10,7,8,6,6,8,7,3,7
13,Charlotte Hornets,46,36,28,13,18,23,37,17,9,19,17,9,20,8,4,10,5,9,26,25,20,11,7,8,23,11,1.0,0.0,9,6,10,5,5,10,7,5,8,6,6,4
28,Chicago Bulls,15,67,10,31,5,36,13,41,2,26,9,17,4,24,2,12,0,14,6,42,9,25,3,4,3,37,0.0,1.0,2,12,3,12,1,13,3,9,3,13,3,7
22,Cleveland Cavaliers,30,52,20,21,10,31,22,32,8,20,11,15,11,17,4,10,4,10,20,27,10,25,6,5,10,27,1.0,0.0,8,5,7,7,4,11,2,10,4,14,4,5
4,Dallas Mavericks,53,29,28,13,25,16,23,7,30,22,11,3,12,4,14,10,16,12,31,19,22,10,4,5,28,14,1.0,0.0,9,6,10,6,8,6,8,4,10,4,7,3
18,Denver Nuggets,40,42,29,12,11,30,13,17,27,25,4,10,9,7,13,11,14,14,27,24,13,18,12,6,14,25,,,8,8,8,7,10,5,4,9,5,10,5,3
20,Detroit Pistons,32,50,18,23,14,27,25,29,7,21,9,17,16,12,3,11,4,10,19,31,13,19,2,3,16,17,1.0,0.0,5,9,7,9,4,11,4,7,5,10,6,4
27,Golden State Warriors,17,65,11,30,6,35,8,22,9,43,3,11,5,11,5,23,4,20,15,33,2,32,6,5,6,40,1.0,0.0,4,11,5,9,4,11,2,11,1,13,0,10
14,Houston Rockets,45,37,24,17,21,20,25,5,20,32,9,5,16,0,11,13,9,19,25,25,20,12,4,10,25,14,0.0,1.0,9,6,5,8,8,8,9,4,9,6,5,4
