In [11]:
SEASON = '1980-81'

In [12]:
# set single team name for teams that have multiple codes
def use_unique_team_code(tm):
    if tm == 'CHH':
        return 'CHO'
    else:
        return tm

In [13]:
# PER GAME CLEANING

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_PerGame.csv')

# Add column [PlayerID]
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
df = df.drop('Rk', 1)

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
df['Tm'] = df['Tm'].apply(use_unique_team_code) # some teams have multiple team codes
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop('Tm', axis=1) # [Tm] no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_PerGame.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
152,Abdul Jeelani,jeelaab01,DAL,SF,26,66,,16.8,2.8,6.7,0.425,0.0,0.0,0.000,2.8,6.7,0.426,0.425,2.7,3.3,0.814,1.3,2.2,3.5,1.0,0.7,0.5,1.3,1.9,8.4
67,Adrian Dantley,dantlad01,UTA,SF,25,80,,42.7,11.4,20.3,0.559,0.0,0.1,0.286,11.3,20.3,0.560,0.559,7.9,9.8,0.806,2.4,4.0,6.4,4.0,1.4,0.2,3.5,3.1,30.7
123,Alan Hardy,hardyal01,LAL,SG,23,22,,5.0,1.0,2.7,0.373,0.0,0.0,,1.0,2.7,0.373,0.373,0.3,0.5,0.700,0.4,0.5,0.9,0.1,0.0,0.4,0.5,0.6,2.3
93,Alex English,englial01,DEN,SF,27,81,,38.2,9.5,19.2,0.494,0.0,0.1,0.600,9.4,19.1,0.494,0.495,4.8,5.7,0.850,3.4,4.6,8.0,3.6,1.3,1.2,3.0,3.1,23.8
27,Allan Bristow,bristal01,UTA,SF,29,82,,24.4,3.3,7.5,0.444,0.1,0.2,0.278,3.2,7.2,0.449,0.448,2.0,2.4,0.838,1.3,4.0,5.2,4.7,0.8,0.0,2.1,2.3,8.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,Wayne Robinson,robinwa01,DET,PF,22,81,,19.7,2.9,6.3,0.460,0.0,0.1,0.000,2.9,6.2,0.465,0.460,2.2,3.0,0.729,1.4,2.2,3.6,1.4,0.6,0.3,1.8,2.3,7.9
214,Wes Matthews,matthwe01,"WSB,ATL",PG,21,79,,28.7,4.9,9.9,0.494,0.1,0.3,0.238,4.8,9.6,0.501,0.497,2.6,3.2,0.802,0.6,1.2,1.8,5.2,1.4,0.2,3.3,3.1,12.4
324,Wes Unseld,unselwe01,WSB,C,34,63,,32.3,3.6,6.8,0.524,0.0,0.1,0.500,3.5,6.7,0.525,0.527,0.9,1.4,0.640,3.3,7.4,10.7,2.7,0.8,0.6,1.5,2.7,8.0
19,Winford Boynes,boynewi01,DAL,SG,23,44,,17.2,2.8,7.1,0.387,0.0,0.0,,2.8,7.1,0.387,0.387,1.0,1.3,0.818,0.5,1.2,1.7,0.8,0.5,0.4,1.6,1.8,6.5


In [14]:
# ADVANCED PLAYER CLEANING

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_Advanced.csv')

# Add column [PlayerID]
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
df = df.drop('Rk', 1)

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
df['Tm'] = df['Tm'].apply(use_unique_team_code) # some teams have multiple team codes
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop('Tm', axis=1) # [Tm] no longer needed

# remove unnamed columns
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_Advanced.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
152,Abdul Jeelani,jeelaab01,DAL,SF,26,66,1108,15.5,0.515,0.002,0.500,8.2,15.7,11.8,9.2,1.9,1.6,13.9,23.5,0.8,0.8,1.6,0.070,-0.5,-0.7,-1.2,0.2
67,Adrian Dantley,dantlad01,UTA,SF,25,80,3417,24.3,0.622,0.004,0.482,6.5,10.5,8.5,16.3,1.6,0.3,12.5,28.4,12.3,1.3,13.6,0.191,5.7,-1.4,4.3,5.4
123,Alan Hardy,hardyal01,LAL,SG,23,22,111,7.5,0.402,0.000,0.169,8.3,10.1,9.3,3.6,0.4,4.3,14.8,26.9,-0.3,0.1,-0.2,-0.072,-5.1,-1.2,-6.3,-0.2
93,Alex English,englial01,DEN,SF,27,81,3093,19.7,0.549,0.003,0.295,8.8,12.6,10.7,13.4,1.5,1.6,12.1,24.0,6.7,1.6,8.3,0.129,3.1,-1.0,2.2,3.2
27,Allan Bristow,bristal01,UTA,SF,29,82,2001,14.9,0.511,0.029,0.324,6.0,18.5,12.3,27.0,1.5,0.1,19.7,18.7,1.4,1.3,2.7,0.065,0.3,-0.5,-0.2,0.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,Wayne Robinson,robinwa01,DET,PF,22,81,1592,11.5,0.523,0.012,0.472,8.1,13.7,10.7,10.5,1.4,0.9,19.5,19.4,0.2,1.5,1.7,0.052,-2.0,-0.7,-2.7,-0.3
214,Wes Matthews,matthwe01,"WSB,ATL",PG,21,79,2266,13.1,0.549,0.027,0.323,2.2,4.6,3.4,26.2,2.3,0.4,22.7,20.7,0.7,1.9,2.6,0.055,-0.9,-0.3,-1.2,0.5
324,Wes Unseld,unselwe01,WSB,C,34,63,2032,13.5,0.543,0.009,0.200,10.6,24.3,17.4,10.6,1.2,1.0,17.2,11.1,2.1,3.4,5.5,0.130,0.0,1.8,1.7,1.9
19,Winford Boynes,boynewi01,DAL,SG,23,44,757,6.1,0.426,0.000,0.176,3.5,8.0,5.6,7.5,1.5,1.2,17.0,22.4,-1.6,0.2,-1.4,-0.091,-5.3,-2.0,-7.3,-1.0


In [15]:
# TEAM STANDINGS CLEANING

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Team_Standings.csv')

pd.set_option('display.max_columns', None)

# [Rk] drop -- unnecessary
df = df.drop('Rk', 1)

# Conference columns renaming
df = df.rename(columns={'E': 'East','W': 'West'})

# Division columns renaming
df = df.rename(columns={'A': 'Atlantic', 'C': 'Central',
    'SE': 'Southeast', 'NW': 'Northwest',
    'P': 'Pacific', 'SW': 'Southwest'})

# All-Star columns renaming
df = df.rename(columns={'Pre': 'PreAllStar', 'Post': 'PostAllStar'})

# Margin columns renaming
df = df.rename(columns={'≤3': '≤3Margin', '≥10': '≥10Margin'})

# Month columns renaming
# None needed

# Separate W-L columns into 2 columns: W and L
WinLossColumns = df.columns.drop('Team')
for col in WinLossColumns:
    df[['{}W'.format(col), '{}L'.format(col)]] = df[col].str.split('-', n=1, expand=True)
df = df.drop(WinLossColumns, axis=1)

# Sort by team name, stylistic choice
df = df.sort_values('Team')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Team_Standings.csv', index=False)

df

Unnamed: 0,Team,OverallW,OverallL,HomeW,HomeL,RoadW,RoadL,EastW,EastL,WestW,WestL,AtlanticW,AtlanticL,CentralW,CentralL,MW,ML,PacificW,PacificL,PreAllStarW,PreAllStarL,PostAllStarW,PostAllStarL,≤3MarginW,≤3MarginL,≥10MarginW,≥10MarginL,OctW,OctL,NovW,NovL,DecW,DecL,JanW,JanL,FebW,FebL,MarW,MarL
17,Atlanta Hawks,31,51,20,21,11,30,18,40,13,11,9,19,9,21,8,4,5,7,19,34,12,17,7,18,9,18,5,5,3,11,10,5,1,13,5,7,7,10
0,Boston Celtics,62,20,35,6,27,14,42,16,20,4,19,5,23,11,10,2,10,2,43,10,19,10,11,6,29,9,6,3,9,4,14,1,14,2,8,5,11,5
7,Chicago Bulls,45,37,26,15,19,22,34,24,11,13,14,15,20,9,7,5,4,8,27,27,18,10,7,7,26,15,3,7,7,7,9,6,8,7,7,8,11,2
18,Cleveland Cavaliers,28,54,20,21,8,33,19,39,9,15,8,20,11,19,5,7,4,8,22,32,6,22,7,9,12,27,4,8,4,10,5,9,9,5,3,9,3,13
22,Dallas Mavericks,15,67,11,30,4,37,3,19,12,48,2,8,1,11,5,25,7,23,8,45,7,22,4,8,4,35,2,10,1,12,2,14,3,9,1,13,6,9
14,Denver Nuggets,37,45,23,18,14,27,9,13,28,32,2,8,7,5,13,17,15,15,20,32,17,13,11,14,16,13,2,7,8,7,3,10,7,8,7,6,10,7
21,Detroit Pistons,21,61,14,27,7,34,16,42,5,19,7,21,9,21,4,8,1,11,13,42,8,19,8,13,6,27,1,9,7,8,2,12,3,13,3,10,5,9
12,Golden State Warriors,39,43,26,15,13,28,11,11,28,32,5,5,6,6,18,12,10,20,27,25,12,18,10,13,9,14,6,5,7,6,8,7,6,7,6,9,6,9
10,Houston Rockets,40,42,25,16,15,26,8,14,32,28,2,8,6,6,19,11,13,17,24,29,16,13,12,14,13,13,4,5,7,8,5,9,8,7,9,5,7,8
9,Indiana Pacers,44,38,27,14,17,24,32,26,12,12,15,14,17,12,9,3,3,9,31,23,13,15,6,13,17,15,7,3,7,8,7,7,10,5,6,7,7,8
