In [6]:
# Season being cleaned, written as 'YYYY-YY'. The cleaning cells below interpolate it
# into both the raw input paths and the clean output paths.
SEASON = '1983-84'

In [7]:
# Collapse team codes that refer to the same team onto a single code.
def use_unique_team_code(tm):
    """Return the single code used for a team that appears under several codes.

    'CHH' is mapped to 'CHO'; every other value is returned unchanged.
    """
    return 'CHO' if tm == 'CHH' else tm

In [8]:
# PER GAME CLEANING
#
# Reads the raw per-game player CSV for SEASON, cleans the [Player], [Pos] and [Tm]
# columns, collapses players with several rows (one per team plus a TOT row) into a
# single row, and writes the result to the season's clean directory.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_PerGame.csv')

# Add column [PlayerID]
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
# Use the `columns=` keyword: passing the axis positionally (drop('Rk', 1)) was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
df['Tm'] = df['Tm'].apply(use_unique_team_code) # some teams have multiple team codes
# Identify a player by [PlayerID] rather than by display name: two different players can
# share a name, and keying on the name would merge their team lists and then drop one of
# them. Fall back to the name only for rows where no ID could be extracted.
df['_PlayerKey'] = df['PlayerID'].fillna(df['Player'])
df['Tms'] = df.groupby('_PlayerKey')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
# (presumably c.ALPHABETICALLY_LAST_STRING sorts after every real team code -- verify in myconstants)
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('_PlayerKey', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(columns=['Tm', '_PlayerKey']) # [Tm] and the helper key are no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice ([PlayerID] breaks ties between namesakes so the
# written file has a reproducible row order)
df = df.sort_values(['Player', 'PlayerID'])

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_PerGame.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
59,Adrian Dantley,dantlad01,UTA,SF,28,79,79,37.8,10.2,18.2,0.558,0.0,0.1,0.250,10.1,18.2,0.559,0.558,10.3,12.0,0.859,2.3,3.4,5.7,3.9,0.8,0.1,3.3,2.5,30.6
339,Al Wood,woodal01,SEA,SG,25,81,81,27.6,5.8,11.7,0.494,0.0,0.3,0.143,5.7,11.4,0.502,0.496,2.8,3.3,0.823,1.2,2.2,3.4,2.0,0.8,0.4,1.6,2.6,14.3
154,Albert King,kingal01,NJN,SF,24,79,53,26.6,5.9,12.0,0.492,0.0,0.3,0.136,5.8,11.7,0.500,0.493,2.9,3.7,0.786,1.6,3.3,4.9,2.6,1.2,0.4,2.6,3.3,14.7
84,Alex English,englial01,DEN,SF,30,82,77,35.0,11.1,20.9,0.529,0.0,0.1,0.143,11.0,20.8,0.531,0.529,4.3,5.2,0.824,2.6,3.0,5.7,5.0,1.0,1.2,2.7,3.1,26.4
169,Allen Leavell,leaveal01,HOU,PG,26,82,27,24.5,4.3,8.9,0.477,0.1,0.9,0.155,4.1,8.0,0.512,0.485,2.9,3.5,0.832,0.4,1.0,1.4,5.6,1.3,0.1,2.2,2.4,11.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308,Wally Walker,walkewa01,HOU,SF,29,58,18,10.6,2.0,4.2,0.490,0.0,0.1,0.333,2.0,4.1,0.494,0.494,0.1,0.3,0.333,0.4,1.1,1.6,0.9,0.3,0.1,0.6,1.1,4.2
63,Walter Davis,daviswa03,PHO,SG,29,78,70,32.6,8.4,16.3,0.512,0.3,1.1,0.230,8.1,15.2,0.532,0.520,3.0,3.5,0.863,0.5,2.1,2.6,5.5,1.4,0.2,2.7,2.6,20.0
48,Wayne Cooper,coopewa01,POR,C,27,81,38,20.5,3.8,8.2,0.459,0.0,0.1,0.000,3.8,8.1,0.463,0.459,2.3,2.8,0.804,2.2,3.7,5.9,0.9,0.3,1.3,1.4,3.0,9.8
190,Wes Matthews,matthwe01,"ATL,PHI",PG,24,20,5,19.4,3.1,6.6,0.466,0.1,0.4,0.125,3.0,6.2,0.488,0.469,1.4,1.8,0.750,0.4,1.0,1.4,4.2,0.8,0.2,2.0,2.3,7.5


In [9]:
# ADVANCED PLAYER CLEANING
#
# Reads the raw advanced-stats player CSV for SEASON, cleans the [Player], [Pos] and [Tm]
# columns, collapses players with several rows (one per team plus a TOT row) into a
# single row, removes unnamed columns, and writes the result to the season's clean directory.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_Advanced.csv')

# Add column [PlayerID]
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
# Use the `columns=` keyword: passing the axis positionally (drop('Rk', 1)) was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
df['Tm'] = df['Tm'].apply(use_unique_team_code) # some teams have multiple team codes
# Identify a player by [PlayerID] rather than by display name: two different players can
# share a name, and keying on the name would merge their team lists and then drop one of
# them. Fall back to the name only for rows where no ID could be extracted.
df['_PlayerKey'] = df['PlayerID'].fillna(df['Player'])
df['Tms'] = df.groupby('_PlayerKey')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
# (presumably c.ALPHABETICALLY_LAST_STRING sorts after every real team code -- verify in myconstants)
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('_PlayerKey', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(columns=['Tm', '_PlayerKey']) # [Tm] and the helper key are no longer needed

# remove unnamed columns
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice ([PlayerID] breaks ties between namesakes so the
# written file has a reproducible row order)
df = df.sort_values(['Player', 'PlayerID'])

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_Advanced.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
59,Adrian Dantley,dantlad01,UTA,SF,28,79,2984,24.6,0.652,0.003,0.658,6.7,9.0,7.9,16.2,0.9,0.1,12.4,28.2,13.0,1.6,14.6,0.235,5.8,-1.5,4.2,4.7
339,Al Wood,woodal01,SEA,SG,25,81,2236,14.9,0.545,0.022,0.287,4.8,9.2,7.0,11.2,1.4,0.8,10.6,22.1,2.7,1.7,4.4,0.094,-0.2,-0.7,-0.9,0.6
154,Albert King,kingal01,NJN,SF,24,79,2103,15.6,0.541,0.023,0.312,6.7,14.5,10.5,14.0,2.0,0.9,16.2,24.3,1.5,2.8,4.3,0.099,0.1,0.1,0.2,1.2
84,Alex English,englial01,DEN,SF,30,82,2870,22.2,0.570,0.004,0.249,7.7,9.3,8.5,20.9,1.3,1.7,10.5,27.9,7.2,1.0,8.2,0.136,4.7,-1.4,3.4,3.9
169,Allen Leavell,leaveal01,HOU,PG,26,82,2009,15.9,0.553,0.097,0.391,1.7,4.6,3.1,29.7,2.4,0.3,17.7,20.4,2.8,1.6,4.3,0.104,0.8,-0.2,0.6,1.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308,Wally Walker,walkewa01,HOU,SF,29,58,612,10.9,0.490,0.025,0.075,4.6,11.6,8.1,12.0,1.3,0.4,11.7,18.1,0.1,0.5,0.5,0.041,-2.0,-0.5,-2.5,-0.1
63,Walter Davis,daviswa03,PHO,SG,29,78,2546,18.5,0.559,0.068,0.212,1.7,7.6,4.6,25.0,2.0,0.3,13.3,25.9,4.2,2.0,6.2,0.118,1.9,-0.8,1.0,1.9
48,Wayne Cooper,coopewa01,POR,C,27,81,1662,15.8,0.519,0.011,0.347,12.2,22.0,17.0,6.2,0.7,3.8,12.6,21.2,2.0,2.0,4.1,0.118,-1.0,-0.6,-1.6,0.2
190,Wes Matthews,matthwe01,"ATL,PHI",PG,24,20,388,12.3,0.511,0.061,0.275,2.1,5.7,4.0,31.4,2.0,0.5,21.4,20.1,0.0,0.4,0.4,0.052,-2.1,-0.6,-2.7,0.0


In [10]:
# TEAM STANDINGS CLEANING
#
# Reads the raw team standings CSV for SEASON, gives the abbreviated split columns
# descriptive names, splits every "W-L" column into separate W and L columns, and
# writes the result to the season's clean directory.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Team_Standings.csv')

# show every column when the frame is displayed at the end of the cell
pd.set_option('display.max_columns', None)

# [Rk] drop -- unnecessary
# Use the `columns=` keyword: passing the axis positionally (drop('Rk', 1)) was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# Conference columns renaming
df = df.rename(columns={'E': 'East','W': 'West'})

# Division columns renaming
# NOTE(review): there is no entry for 'M', so for this season the output keeps the bare
# 'MW'/'ML' columns next to 'AtlanticW', 'CentralW', ... Presumably 'M' is the Midwest
# division; confirm what downstream readers expect before adding 'M': 'Midwest', since
# that would change the column names of the written CSV.
df = df.rename(columns={'A': 'Atlantic', 'C': 'Central',
    'SE': 'Southeast', 'NW': 'Northwest',
    'P': 'Pacific', 'SW': 'Southwest'})

# All-Star columns renaming
df = df.rename(columns={'Pre': 'PreAllStar', 'Post': 'PostAllStar'})

# Margin columns renaming
df = df.rename(columns={'≤3': '≤3Margin', '≥10': '≥10Margin'})

# Month columns renaming
# None needed

# Separate W-L columns into 2 columns: W and L
# Every column except [Team] is treated as a "W-L" string and split on the first dash;
# the new <col>W / <col>L columns are appended and the combined originals dropped.
WinLossColumns = df.columns.drop('Team')
for col in WinLossColumns:
    df[[f'{col}W', f'{col}L']] = df[col].str.split('-', n=1, expand=True)
df = df.drop(columns=WinLossColumns)

# Sort by team name, stylistic choice
df = df.sort_values('Team')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Team_Standings.csv', index=False)

df

Unnamed: 0,Team,OverallW,OverallL,HomeW,HomeL,RoadW,RoadL,EastW,EastL,WestW,WestL,AtlanticW,AtlanticL,CentralW,CentralL,MW,ML,PacificW,PacificL,PreAllStarW,PreAllStarL,PostAllStarW,PostAllStarL,≤3MarginW,≤3MarginL,≥10MarginW,≥10MarginL,OctW,OctL,NovW,NovL,DecW,DecL,JanW,JanL,FebW,FebL,MarW,MarL,AprW,AprL
12,Atlanta Hawks,40,42,31,10,9,32,32,26,8,16,16,12,16,14,4,8,4,8,24,20,16,22,10,7,14,19,1,1,7,7,8,7,8,6,7,8,4,12,5,1
0,Boston Celtics,62,20,33,8,29,12,43,15,19,5,13,11,30,4,10,2,9,3,34,9,28,11,9,4,31,4,1,1,11,4,12,3,11,1,8,6,13,4,6,1
21,Chicago Bulls,27,55,18,23,9,32,18,40,9,15,8,20,10,20,3,9,6,6,16,24,11,31,5,12,10,27,1,0,4,9,7,6,4,10,6,10,4,12,1,8
20,Cleveland Cavaliers,28,54,23,18,5,36,20,38,8,16,9,19,11,19,3,9,5,7,12,30,16,24,5,8,10,30,0,2,5,10,4,10,4,8,9,5,4,13,2,6
9,Dallas Mavericks,43,39,31,10,12,29,7,15,36,24,2,8,5,7,19,11,17,13,25,19,18,20,13,12,16,11,1,1,10,3,5,10,9,5,7,9,7,7,4,4
13,Denver Nuggets,38,44,27,14,11,30,11,11,27,33,3,7,8,4,16,14,11,19,19,25,19,19,8,10,16,18,1,1,8,7,5,10,5,7,6,10,10,5,3,4
4,Detroit Pistons,49,33,30,11,19,22,34,24,15,9,13,16,21,8,7,5,8,4,23,19,26,14,11,9,25,9,1,1,7,8,8,5,7,6,11,4,10,7,5,2
15,Golden State Warriors,37,45,27,14,10,31,10,12,27,33,5,5,5,7,14,16,13,17,20,25,17,20,16,8,7,23,2,0,6,9,7,9,5,8,7,6,5,10,5,3
19,Houston Rockets,29,53,21,20,8,33,8,14,21,39,1,9,7,5,9,21,12,18,18,26,11,27,5,6,15,27,1,0,5,10,6,9,7,7,4,9,5,11,1,7
22,Indiana Pacers,26,56,20,21,6,35,17,41,9,15,5,23,12,18,4,8,5,7,12,29,14,27,7,11,10,25,0,2,4,10,3,9,6,8,5,11,7,9,1,7
