In [66]:
# Season label; interpolated into the raw and clean CSV paths used by the cleaning cells below.
SEASON = '1988-89'

In [67]:
def use_unique_team_code(tm):
    """Collapse a team that appears under multiple codes onto a single code.

    The code 'CHH' is rewritten to 'CHO'; any other value is returned unchanged.
    """
    return 'CHO' if tm == 'CHH' else tm

In [68]:
# PER GAME CLEANING
#
# Reads the raw per-game player CSV for SEASON, cleans the [Player], [Pos] and [Tm]
# columns, collapses players with several rows (traded mid-season) down to their
# TOT row, and writes the result to the season's clean/ directory.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_PerGame.csv')

# Add column [PlayerID]
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
# Use the keyword form: passing the axis positionally (drop('Rk', 1)) was deprecated
# in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
df['Tm'] = df['Tm'].apply(use_unique_team_code) # some teams have multiple team codes
# Identify a player by [PlayerID] rather than by display name, so that two different players
# who share a name are neither merged into one [Tms] string nor dropped as duplicates.
# Fall back to the cleaned name for any row where no ID could be extracted.
df['_PlayerKey'] = df['PlayerID'].fillna(df['Player'])
df['Tms'] = df.groupby('_PlayerKey')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('_PlayerKey', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(columns=['Tm', '_PlayerKey']) # [Tm] and the helper key are no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_PerGame.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
134,A.C. Green,greenac01,LAL,PF,25,82,82,30.6,4.9,9.2,0.529,0.0,0.2,0.235,4.8,9.0,0.536,0.532,3.4,4.4,0.786,3.1,5.9,9.0,1.3,1.1,0.7,1.5,2.1,13.3
39,Adrian Branch,brancad01,POR,SF,25,67,4,12.1,3.0,6.5,0.463,0.1,0.5,0.226,2.9,6.0,0.481,0.471,1.3,1.8,0.725,0.9,1.0,2.0,0.9,0.7,0.0,1.0,1.5,7.4
78,Adrian Dantley,dantlad01,"DET,DAL",SF,33,73,67,33.2,6.4,13.1,0.493,0.0,0.0,0.000,6.4,13.1,0.493,0.493,6.3,7.8,0.810,1.6,2.7,4.3,2.3,0.6,0.2,2.2,2.5,19.2
194,Albert King,kingal01,SAS,SF,29,46,11,17.2,3.1,7.1,0.431,0.2,0.7,0.250,2.9,6.4,0.451,0.443,0.8,1.0,0.771,0.7,2.3,3.0,1.7,0.6,0.2,1.6,2.1,7.1
107,Alex English,englial01,DEN,SF,35,82,82,36.5,11.3,22.9,0.491,0.0,0.1,0.250,11.2,22.8,0.492,0.492,4.0,4.6,0.858,1.8,2.2,4.0,4.7,0.8,0.1,2.4,2.1,26.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,Will Perdue,perduwi01,CHI,C,23,30,0,6.3,1.0,2.4,0.403,0.0,0.0,,1.0,2.4,0.403,0.403,0.3,0.5,0.571,0.6,0.9,1.5,0.4,0.1,0.2,0.5,1.3,2.2
18,Willie Anderson,anderwi01,SAS,SF,22,81,79,33.8,7.9,15.9,0.498,0.0,0.3,0.190,7.9,15.6,0.503,0.500,2.8,3.6,0.775,1.9,3.3,5.1,4.6,1.9,0.8,3.2,3.6,18.6
72,Winston Crite,critewi01,PHO,SF,23,2,0,3.0,0.0,1.5,0.000,0.0,0.0,,0.0,1.5,0.000,0.000,0.0,0.0,,0.5,0.0,0.5,0.0,0.0,0.0,0.5,0.5,0.0
119,Winston Garland,garlawi01,GSW,PG,24,79,79,33.7,5.9,13.6,0.434,0.1,0.5,0.233,5.8,13.1,0.442,0.439,2.6,3.2,0.809,1.3,2.9,4.2,6.4,2.2,0.2,2.4,2.7,14.5


In [69]:
# ADVANCED PLAYER CLEANING
#
# Reads the raw advanced-stats player CSV for SEASON, cleans the [Player], [Pos] and [Tm]
# columns, collapses players with several rows (traded mid-season) down to their
# TOT row, drops unnamed columns, and writes the result to the season's clean/ directory.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_Advanced.csv')

# Add column [PlayerID]
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
# Use the keyword form: passing the axis positionally (drop('Rk', 1)) was deprecated
# in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
df['Tm'] = df['Tm'].apply(use_unique_team_code) # some teams have multiple team codes
# Identify a player by [PlayerID] rather than by display name, so that two different players
# who share a name are neither merged into one [Tms] string nor dropped as duplicates.
# Fall back to the cleaned name for any row where no ID could be extracted.
df['_PlayerKey'] = df['PlayerID'].fillna(df['Player'])
df['Tms'] = df.groupby('_PlayerKey')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('_PlayerKey', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(columns=['Tm', '_PlayerKey']) # [Tm] and the helper key are no longer needed

# remove unnamed columns
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_Advanced.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
134,A.C. Green,greenac01,LAL,PF,25,82,2510,17.8,0.594,0.022,0.474,12.3,20.0,16.4,5.5,1.8,1.2,11.5,17.0,5.8,3.5,9.4,0.179,1.8,-0.2,1.6,2.3
39,Adrian Branch,brancad01,POR,SF,25,67,811,15.7,0.509,0.071,0.275,8.2,9.6,8.8,10.8,2.6,0.2,11.6,26.3,0.7,0.8,1.5,0.087,-0.4,-1.0,-1.4,0.1
78,Adrian Dantley,dantlad01,"DET,DAL",SF,33,73,2422,17.4,0.581,0.001,0.595,5.5,9.0,7.3,10.9,0.9,0.3,11.9,24.3,5.5,1.9,7.4,0.147,1.6,-0.9,0.7,1.6
194,Albert King,kingal01,SAS,SF,29,46,791,9.4,0.470,0.098,0.147,4.4,15.6,9.7,14.3,1.6,0.5,17.5,20.7,-0.7,0.7,0.0,-0.003,-3.2,-0.9,-4.1,-0.4
107,Alex English,englial01,DEN,SF,35,82,2990,19.0,0.531,0.004,0.201,4.8,6.5,5.6,19.7,1.0,0.2,8.8,28.8,5.1,1.7,6.8,0.110,3.0,-2.1,0.9,2.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,Will Perdue,perduwi01,CHI,C,23,30,190,7.2,0.422,0.000,0.194,11.3,15.9,13.7,8.1,1.0,1.9,16.1,21.0,-0.3,0.2,-0.1,-0.021,-6.7,-1.6,-8.3,-0.3
18,Willie Anderson,anderwi01,SAS,SF,22,81,2738,16.6,0.534,0.016,0.225,5.8,11.1,8.4,21.1,2.5,1.4,15.6,23.8,2.1,2.8,4.9,0.085,0.8,0.4,1.3,2.3
72,Winston Crite,critewi01,PHO,SF,23,2,6,-20.3,0.000,0.000,0.000,18.5,0.0,8.9,0.0,0.0,0.0,25.0,26.4,-0.1,0.0,-0.1,-0.570,-18.8,-8.9,-27.7,0.0
119,Winston Garland,garlawi01,GSW,PG,24,79,2661,14.7,0.483,0.040,0.234,3.8,8.5,6.2,24.9,2.9,0.3,13.6,19.5,1.9,2.6,4.5,0.082,-0.7,0.6,-0.1,1.3


In [70]:
# TEAM STANDINGS CLEANING
#
# Reads the raw team standings CSV for SEASON, gives the abbreviated split columns
# descriptive names, splits every "W-L" column into separate W and L columns, and
# writes the result to the season's clean/ directory.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Team_Standings.csv')

# Show every column when the frame is displayed at the end of the cell
pd.set_option('display.max_columns', None)

# [Rk] drop -- unnecessary
# Use the keyword form: passing the axis positionally (drop('Rk', 1)) was deprecated
# in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# Conference columns renaming
df = df.rename(columns={'E': 'East','W': 'West'})

# Division columns renaming
# NOTE(review): there is no entry for 'M' (presumably the Midwest division), so a raw 'M'
# column is written out as 'MW'/'ML' -- confirm whether it should be renamed to 'Midwest'.
df = df.rename(columns={'A': 'Atlantic', 'C': 'Central',
    'SE': 'Southeast', 'NW': 'Northwest',
    'P': 'Pacific', 'SW': 'Southwest'})

# All-Star columns renaming
df = df.rename(columns={'Pre': 'PreAllStar', 'Post': 'PostAllStar'})

# Margin columns renaming
df = df.rename(columns={'≤3': '≤3Margin', '≥10': '≥10Margin'})

# Month columns renaming
# None needed

# Separate W-L columns into 2 columns: W and L
# Every column except [Team] holds a "wins-losses" string; split on the first dash only.
WinLossColumns = df.columns.drop('Team')
for col in WinLossColumns:
    df[[f'{col}W', f'{col}L']] = df[col].str.split('-', n=1, expand=True)
df = df.drop(columns=WinLossColumns)

# Sort by team name, stylistic choice
df = df.sort_values('Team')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Team_Standings.csv', index=False)

df

Unnamed: 0,Team,OverallW,OverallL,HomeW,HomeL,RoadW,RoadL,EastW,EastL,WestW,WestL,AtlanticW,AtlanticL,CentralW,CentralL,MW,ML,PacificW,PacificL,PreAllStarW,PreAllStarL,PostAllStarW,PostAllStarL,≤3MarginW,≤3MarginL,≥10MarginW,≥10MarginL,NovW,NovL,DecW,DecL,JanW,JanL,FebW,FebL,MarW,MarL,AprW,AprL
4,Atlanta Hawks,52,30,33,8,19,22,38,18,14,12,18,8,20,10,5,7,9,5,28,19,24,11,10,9,29,12,8,6,11,3,7,7,9,4,7,8,10,2
14,Boston Celtics,42,40,32,9,10,31,27,29,15,11,19,11,8,18,8,4,7,7,23,23,19,17,5,7,22,19,8,7,5,7,7,8,6,7,11,4,5,7
23,Charlotte Hornets,20,62,12,29,8,33,12,44,8,18,8,22,4,22,4,8,4,10,13,35,7,27,9,4,5,36,3,10,5,9,3,13,4,8,2,13,3,9
8,Chicago Bulls,47,35,30,11,17,24,28,28,19,7,16,10,12,18,8,4,11,3,27,19,20,16,9,4,21,16,6,8,9,4,9,5,9,4,10,6,4,8
1,Cleveland Cavaliers,57,25,37,4,20,21,38,18,19,7,19,7,19,11,10,2,9,5,35,11,22,14,6,7,39,9,8,3,13,2,11,4,10,3,9,8,6,5
17,Dallas Mavericks,38,44,24,17,14,27,8,16,30,28,6,6,2,10,19,11,11,17,24,21,14,23,6,9,17,26,9,4,8,5,4,11,8,5,3,14,6,5
12,Denver Nuggets,44,38,35,6,9,32,12,12,32,26,7,5,5,7,18,12,14,14,25,23,19,15,7,9,25,20,9,4,8,7,5,11,8,4,8,6,6,6
0,Detroit Pistons,63,19,37,4,26,15,41,15,22,4,21,5,20,10,10,2,12,2,31,13,32,6,6,4,33,10,11,3,9,4,8,6,8,3,16,1,11,2
13,Golden State Warriors,43,39,29,12,14,27,14,10,29,29,9,3,5,7,14,10,15,19,25,20,18,19,5,6,20,21,6,7,5,7,11,5,9,2,9,10,3,8
11,Houston Rockets,45,37,31,10,14,27,12,12,33,25,5,7,7,5,19,11,14,14,28,18,17,19,9,10,22,17,9,6,9,5,7,6,6,6,8,8,6,6
