In [26]:
# Season being cleaned. The cleaning cells interpolate this value into both the
# directory and the file name of every raw/clean CSV path they read or write.
SEASON = '1996-97'

In [27]:
def use_unique_team_code(tm):
    """Collapse team codes that refer to the same team into a single code.

    The code 'CHH' is returned as 'CHO'; any other value is handed back
    unchanged.
    """
    return 'CHO' if tm == 'CHH' else tm

In [28]:
# PER GAME CLEANING
# Reads the raw per-game player table for SEASON, tidies the player / position /
# team columns, keeps one row per player (the TOT row for players with several
# team rows) and writes the result to the season's clean/ directory.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_PerGame.csv')

# Add column [PlayerID] -- the part of the raw [Player] value matched by
# c.EVERYTHING_AFTER_BACKSLASH
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
# (use the `columns` keyword: passing the axis positionally, as in drop('Rk', 1),
# is rejected by pandas 2.x)
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
df['Tm'] = df['Tm'].apply(use_unique_team_code) # some teams have multiple team codes
# Identify a player by [PlayerID] instead of by display name, so that two different
# players who happen to share a name are not merged into a single row. Rows where
# no ID could be extracted fall back to the name.
df['_PlayerKey'] = df['PlayerID'].fillna(df['Player'])
df['Tms'] = df.groupby('_PlayerKey')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('_PlayerKey', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(columns=['Tm', '_PlayerKey']) # [Tm] and the helper key are no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_PerGame.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
194,A.C. Green,greenac01,"PHO,DAL",PF,33,83,73,30.0,2.8,5.8,0.483,0.0,0.2,0.050,2.8,5.6,0.502,0.485,1.5,2.4,0.650,2.7,5.2,7.9,0.8,0.8,0.2,0.9,1.7,7.2
332,Aaron McKie,mckieaa01,"POR,DET",SG,24,83,11,19.6,1.8,4.4,0.411,0.5,1.2,0.398,1.3,3.2,0.416,0.467,1.1,1.3,0.836,0.5,2.2,2.7,1.9,0.9,0.3,1.1,1.6,5.2
545,Aaron Williams,williaa01,"DEN,VAN",PF,25,33,1,17.1,2.6,4.5,0.574,0.0,0.0,0.000,2.6,4.5,0.578,0.574,1.0,1.5,0.673,1.9,2.5,4.3,0.5,0.5,0.9,1.0,2.2,6.2
141,Acie Earl,earlac01,"TOR,MIL",C,26,47,0,10.6,1.4,3.8,0.372,0.0,0.1,0.000,1.4,3.7,0.383,0.372,1.1,1.8,0.643,0.7,1.3,2.0,0.4,0.3,0.6,0.7,1.3,4.0
260,Adam Keefe,keefead01,UTA,C,26,62,0,14.8,1.3,2.6,0.513,0.0,0.0,0.000,1.3,2.6,0.516,0.513,1.1,1.7,0.689,1.2,2.3,3.5,0.5,0.5,0.2,0.7,1.6,3.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,Willie Burton,burtowi01,ATL,SF,28,24,2,15.8,1.6,4.8,0.336,0.5,1.9,0.283,1.1,2.9,0.371,0.392,2.4,2.8,0.838,0.5,1.3,1.7,0.5,0.3,0.1,1.1,2.3,6.2
327,Xavier McDaniel,mcdanxa01,NJN,PF,33,62,5,18.9,2.2,5.7,0.389,0.1,0.4,0.200,2.1,5.3,0.403,0.396,1.0,1.4,0.730,2.0,3.1,5.1,1.0,0.6,0.3,1.1,2.3,5.6
116,Yinka Dare,dareyi01,NJN,C,24,41,2,7.6,0.5,1.3,0.352,0.0,0.0,,0.5,1.3,0.352,0.352,0.5,0.9,0.514,0.9,1.1,2.0,0.1,0.1,0.7,0.5,1.2,1.4
310,Šarūnas Marčiulionis,marcisa01,DEN,SG,32,17,0,15.0,2.2,5.9,0.376,0.6,1.8,0.367,1.6,4.2,0.380,0.431,1.7,2.1,0.806,0.7,1.1,1.8,1.5,0.7,0.1,2.4,2.2,6.8


In [29]:
# ADVANCED PLAYER CLEANING
# Reads the raw advanced-stats player table for SEASON, tidies the player /
# position / team columns, keeps one row per player (the TOT row for players with
# several team rows), drops unnamed columns and writes the result to the season's
# clean/ directory.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_Advanced.csv')

# Add column [PlayerID] -- the part of the raw [Player] value matched by
# c.EVERYTHING_AFTER_BACKSLASH
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
# (use the `columns` keyword: passing the axis positionally, as in drop('Rk', 1),
# is rejected by pandas 2.x)
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
df['Tm'] = df['Tm'].apply(use_unique_team_code) # some teams have multiple team codes
# Identify a player by [PlayerID] instead of by display name, so that two different
# players who happen to share a name are not merged into a single row. Rows where
# no ID could be extracted fall back to the name.
df['_PlayerKey'] = df['PlayerID'].fillna(df['Player'])
df['Tms'] = df.groupby('_PlayerKey')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('_PlayerKey', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(columns=['Tm', '_PlayerKey']) # [Tm] and the helper key are no longer needed

# remove unnamed columns
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_Advanced.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
194,A.C. Green,greenac01,"PHO,DAL",PF,33,83,2492,12.4,0.523,0.041,0.407,10.2,20.5,15.3,4.4,1.5,0.5,11.5,11.8,2.4,2.4,4.8,0.093,-1.0,-0.4,-1.4,0.4
332,Aaron McKie,mckieaa01,"POR,DET",SG,24,83,1625,12.1,0.524,0.282,0.301,3.1,13.2,8.3,15.4,2.6,1.1,17.9,14.7,1.0,2.4,3.5,0.103,-1.1,2.0,0.9,1.2
545,Aaron Williams,williaa01,"DEN,VAN",PF,25,33,563,16.4,0.599,0.007,0.331,12.6,16.9,14.7,4.7,1.5,3.9,15.9,16.6,0.7,0.4,1.1,0.098,-1.0,-0.6,-1.6,0.1
141,Acie Earl,earlac01,"TOR,MIL",C,26,47,500,10.7,0.433,0.028,0.467,8.0,14.7,11.3,6.7,1.6,4.5,13.9,22.7,-0.5,0.5,0.0,-0.004,-4.3,-0.2,-4.5,-0.3
260,Adam Keefe,keefead01,UTA,C,26,62,915,11.7,0.572,0.006,0.644,10.9,18.2,14.7,5.0,1.7,1.2,18.0,12.7,1.0,1.3,2.3,0.118,-1.3,0.3,-1.0,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,Willie Burton,burtowi01,ATL,SF,28,24,380,8.6,0.507,0.397,0.586,3.5,9.2,6.4,4.8,1.2,0.6,15.1,21.3,0.1,0.4,0.5,0.060,-4.2,-0.5,-4.7,-0.3
327,Xavier McDaniel,mcdanxa01,NJN,PF,33,62,1170,10.8,0.439,0.070,0.251,10.8,19.1,14.7,8.7,1.6,1.1,15.1,16.9,-0.2,0.9,0.7,0.030,-3.3,-1.1,-4.3,-0.7
116,Yinka Dare,dareyi01,NJN,C,24,41,313,7.2,0.406,0.000,0.685,11.4,17.3,14.2,1.4,0.7,6.7,23.0,12.4,-0.4,0.3,-0.1,-0.014,-6.7,-0.3,-7.0,-0.4
310,Šarūnas Marčiulionis,marcisa01,DEN,SG,32,17,255,8.1,0.496,0.297,0.356,5.4,8.0,6.7,16.7,2.4,0.3,25.5,27.5,-0.6,0.1,-0.5,-0.085,-4.0,-0.8,-4.8,-0.2


In [30]:
# TEAM STANDINGS CLEANING
# Reads the raw team standings table for SEASON, gives the abbreviated columns
# descriptive names, splits every "W-L" record column into separate W and L
# columns and writes the result to the season's clean/ directory.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Team_Standings.csv')

# Show every column when the frame is displayed (this is a notebook-wide setting)
pd.set_option('display.max_columns', None)

# [Rk] drop -- unnecessary
# (use the `columns` keyword: passing the axis positionally, as in drop('Rk', 1),
# is rejected by pandas 2.x)
df = df.drop(columns='Rk')

# Conference columns renaming
df = df.rename(columns={'E': 'East','W': 'West'})

# Division columns renaming
# Keys that are absent from the frame are simply ignored by rename().
# NOTE(review): an 'M' column (presumably the Midwest division in this season) is
# not in this mapping, so it is written out as MW / ML -- confirm what downstream
# readers expect before adding it here.
df = df.rename(columns={'A': 'Atlantic', 'C': 'Central',
    'SE': 'Southeast', 'NW': 'Northwest',
    'P': 'Pacific', 'SW': 'Southwest'})

# All-Star columns renaming
df = df.rename(columns={'Pre': 'PreAllStar', 'Post': 'PostAllStar'})

# Margin columns renaming
df = df.rename(columns={'≤3': '≤3Margin', '≥10': '≥10Margin'})

# Month columns renaming
# None needed

# Separate W-L columns into 2 columns: W and L
# Every column except [Team] is treated as a "wins-losses" string.
WinLossColumns = df.columns.drop('Team')
for col in WinLossColumns:
    df[[f'{col}W', f'{col}L']] = df[col].str.split('-', n=1, expand=True)
df = df.drop(columns=WinLossColumns)

# Sort by team name, stylistic choice
df = df.sort_values('Team')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Team_Standings.csv', index=False)

df

Unnamed: 0,Team,OverallW,OverallL,HomeW,HomeL,RoadW,RoadL,EastW,EastL,WestW,WestL,AtlanticW,AtlanticL,CentralW,CentralL,MW,ML,PacificW,PacificL,PreAllStarW,PreAllStarL,PostAllStarW,PostAllStarL,≤3MarginW,≤3MarginL,≥10MarginW,≥10MarginL,NovW,NovL,DecW,DecL,JanW,JanL,FebW,FebL,MarW,MarL,AprW,AprL
6,Atlanta Hawks,56,26,36,5,20,21,34,20,22,6,17,9,17,11,11,3,11,3,31,15,25,11,5,3,35,14,9,7,7,4,14,2,8,5,12,4,6,4
27,Boston Celtics,15,67,11,30,4,37,8,46,7,21,1,23,7,23,3,11,4,10,11,35,4,32,3,12,6,35,4,10,2,11,4,11,1,13,2,14,2,8
8,Charlotte Hornets,54,28,30,11,24,17,34,20,20,8,20,6,14,14,11,3,9,5,29,19,25,9,8,5,19,12,8,6,8,7,10,6,10,3,9,4,9,2
0,Chicago Bulls,69,13,39,2,30,11,44,10,25,3,20,6,24,4,12,2,13,1,42,6,27,7,9,4,45,4,15,1,12,3,13,1,10,2,12,2,7,4
13,Cleveland Cavaliers,42,40,25,16,17,24,26,28,16,12,13,13,13,15,8,6,8,6,25,22,17,18,6,4,20,15,9,5,10,5,5,9,7,5,6,10,5,6
23,Dallas Mavericks,24,58,14,27,10,31,8,22,16,36,6,8,2,14,9,15,7,21,16,28,8,30,7,9,6,33,4,10,6,7,4,11,5,7,3,14,2,9
25,Denver Nuggets,21,61,12,29,9,32,8,22,13,39,4,10,4,12,7,17,6,22,16,32,5,29,8,12,8,27,5,11,3,11,6,9,3,10,3,10,1,10
9,Detroit Pistons,54,28,30,11,24,17,34,20,20,8,17,9,17,11,12,2,8,6,34,12,20,16,5,4,27,10,11,3,11,4,10,4,10,3,7,8,5,6
20,Golden State Warriors,30,52,18,23,12,29,11,19,19,33,6,8,5,11,15,13,4,20,17,29,13,23,1,4,16,32,4,11,7,7,6,9,4,7,4,12,5,6
3,Houston Rockets,57,25,30,11,27,14,17,13,40,12,9,5,8,8,19,5,21,7,32,16,25,9,6,3,27,13,15,1,9,5,8,7,5,7,11,3,9,2
