In [16]:
# Season label interpolated into every raw/clean CSV path read or written by the
# cleaning cells (e.g. ../data/years/{SEASON}/raw/{SEASON}_Player_PerGame.csv).
SEASON = '1976-77'

In [17]:
# set single team name for teams that have multiple codes
def use_unique_team_code(tm):
    """Return the canonical code for a team that appears under several codes.

    'CHH' is mapped to 'CHO'; every other value is returned unchanged.
    """
    return 'CHO' if tm == 'CHH' else tm

In [18]:
# PER GAME CLEANING
#
# Reads the raw per-game player CSV for SEASON, tidies the [Player], [Pos] and
# [Tm] columns, keeps one row per player, and writes the result to the season's
# clean/ directory.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_PerGame.csv')

# Add column [PlayerID] -- captured from [Player] using the project regex constant
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
# Use the keyword form: passing the axis positionally (`drop('Rk', 1)`) was
# deprecated in pandas 1.3 and raises a TypeError on pandas 2.x.
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
# NOTE(review): grouping and de-duplication are keyed on the display name in [Player];
# two different players sharing a name in one season would be merged into one row.
# Consider keying on [PlayerID] instead -- confirm against the raw data.
df['Tm'] = df['Tm'].apply(use_unique_team_code) # some teams have multiple team codes
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop('Tm', axis=1) # [Tm] no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_PerGame.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
156,Aaron James,jamesaa01,NOJ,SF,24,52,,20.4,4.6,9.3,0.490,,,,4.6,9.3,0.490,0.490,1.7,2.2,0.781,1.1,2.5,3.6,1.1,0.4,0.1,,2.4,10.9
72,Adrian Dantley,dantlad01,BUF,SF,21,77,,36.6,7.1,13.6,0.520,,,,7.1,13.6,0.520,0.520,6.2,7.6,0.818,3.3,4.4,7.6,1.9,1.2,0.2,,2.8,20.3
91,Al Eberhard,eberhal01,DET,SF,24,68,,17.9,2.7,5.6,0.476,,,,2.7,5.6,0.476,0.476,1.6,2.0,0.790,1.1,2.1,3.3,0.7,0.7,0.2,,2.9,6.9
281,Al Skinner,skinnal01,NYN,SG,24,79,,28.6,4.8,11.2,0.431,,,,4.8,11.2,0.431,0.431,2.9,3.7,0.791,1.4,3.2,4.6,3.7,1.3,0.7,,3.5,12.6
94,Alex English,englial01,MIL,SF,23,60,,10.8,2.2,4.6,0.477,,,,2.2,4.6,0.477,0.477,0.8,1.0,0.767,1.1,1.7,2.8,0.4,0.3,0.3,,1.3,5.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,Willie Norwood,norwowi01,SEA,SF,29,76,,21.7,2.8,6.1,0.469,,,,2.8,6.1,0.469,0.469,2.0,2.7,0.733,1.7,2.2,3.8,1.3,0.8,0.1,,2.5,7.7
288,Willie Smith,smithwi02,CHI,PG,23,2,,5.5,0.0,0.5,0.000,,,,0.0,0.5,0.000,0.000,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.5,0.0
352,Willie Wise,wisewi01,DEN,SF,29,75,,18.7,3.2,6.8,0.462,,,,3.2,6.8,0.462,0.462,1.9,2.9,0.651,1.0,2.4,3.4,1.9,0.8,0.2,,2.4,8.2
105,World B. Free,freewo01,PHI,SG,23,78,37.0,28.9,6.0,13.1,0.457,,,,6.0,13.1,0.457,0.457,4.3,5.9,0.720,1.2,1.8,3.0,3.4,1.0,0.3,,2.7,16.3


In [19]:
# ADVANCED PLAYER CLEANING
#
# Reads the raw advanced-stats player CSV for SEASON, tidies the [Player], [Pos]
# and [Tm] columns, keeps one row per player, drops "Unnamed" columns, and
# writes the result to the season's clean/ directory.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_Advanced.csv')

# Add column [PlayerID] -- captured from [Player] using the project regex constant
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
# Use the keyword form: passing the axis positionally (`drop('Rk', 1)`) was
# deprecated in pandas 1.3 and raises a TypeError on pandas 2.x.
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
# NOTE(review): grouping and de-duplication are keyed on the display name in [Player];
# two different players sharing a name in one season would be merged into one row.
# Consider keying on [PlayerID] instead -- confirm against the raw data.
df['Tm'] = df['Tm'].apply(use_unique_team_code) # some teams have multiple team codes
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop('Tm', axis=1) # [Tm] no longer needed

# remove unnamed columns (any column whose name starts with "Unnamed")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_Advanced.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
156,Aaron James,jamesaa01,NOJ,SF,24,52,1059,14.4,0.527,,0.235,5.2,11.7,8.5,8.0,0.8,0.2,,,1.3,0.8,2.1,0.095,-0.4,-1.3,-1.7,0.1
72,Adrian Dantley,dantlad01,BUF,SF,21,77,2816,18.3,0.601,,0.556,8.9,12.1,10.5,7.7,1.4,0.3,,,8.2,1.6,9.8,0.167,2.7,-1.1,1.6,2.5
91,Al Eberhard,eberhal01,DET,SF,24,68,1219,11.5,0.534,,0.363,6.5,12.3,9.4,5.1,1.6,0.6,,,1.0,0.8,1.8,0.069,-2.1,-0.6,-2.7,-0.2
281,Al Skinner,skinnal01,NYN,SG,24,79,2256,15.7,0.490,,0.329,4.8,11.9,8.2,20.8,2.1,1.3,,,0.7,2.6,3.3,0.069,-0.6,0.5,-0.1,1.1
94,Alex English,englial01,MIL,SF,23,60,648,14.8,0.511,,0.217,10.8,16.1,13.4,5.3,1.2,1.4,,,0.8,0.4,1.2,0.090,-0.1,-1.6,-1.7,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,Willie Norwood,norwowi01,SEA,SF,29,76,1647,12.0,0.528,,0.447,7.6,10.8,9.1,8.2,1.7,0.2,,,1.8,1.5,3.3,0.095,-1.1,-0.3,-1.3,0.3
288,Willie Smith,smithwi02,CHI,PG,23,2,11,-3.8,0.000,,0.000,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,-0.067,-4.7,-0.1,-4.9,0.0
352,Willie Wise,wisewi01,DEN,SF,29,75,1403,13.3,0.506,,0.425,5.7,12.6,9.2,13.7,1.8,0.7,,,0.7,1.9,2.5,0.087,-1.3,0.1,-1.2,0.3
105,World B. Free,freewo01,PHI,SG,23,78,2253,16.2,0.517,,0.454,4.6,5.9,5.3,17.4,1.5,0.6,,,2.9,1.8,4.6,0.099,0.3,-1.2,-0.9,0.7


In [20]:
# TEAM STANDINGS CLEANING
#
# Reads the raw team standings CSV for SEASON, gives the abbreviated record
# columns descriptive names, splits every "W-L" record column into separate
# <name>W / <name>L columns, and writes the result to the season's clean/
# directory.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Team_Standings.csv')

# Show every column when the frame is displayed (notebook-wide display option)
pd.set_option('display.max_columns', None)

# [Rk] drop -- unnecessary
# Use the keyword form: passing the axis positionally (`drop('Rk', 1)`) was
# deprecated in pandas 1.3 and raises a TypeError on pandas 2.x.
df = df.drop(columns='Rk')

# Conference columns renaming
df = df.rename(columns={'E': 'East','W': 'West'})

# Division columns renaming (keys absent from the frame are ignored by rename)
# NOTE(review): there is no entry for an 'M' column; the saved output for this
# season shows 'MW'/'ML' columns, so that column appears to pass through
# un-renamed -- confirm whether 'M': 'Midwest' should be added here.
df = df.rename(columns={'A': 'Atlantic', 'C': 'Central',
    'SE': 'Southeast', 'NW': 'Northwest',
    'P': 'Pacific', 'SW': 'Southwest'})

# All-Star columns renaming
df = df.rename(columns={'Pre': 'PreAllStar', 'Post': 'PostAllStar'})

# Margin columns renaming
df = df.rename(columns={'≤3': '≤3Margin', '≥10': '≥10Margin'})

# Month columns renaming
# None needed

# Separate W-L columns into 2 columns: W and L
# Every column except [Team] is treated as a "W-L" record; each is split at the
# first dash and the pieces are stored as text (no numeric conversion here).
WinLossColumns = df.columns.drop('Team')
for col in WinLossColumns:
    df[['{}W'.format(col), '{}L'.format(col)]] = df[col].str.split('-', n=1, expand=True)
df = df.drop(WinLossColumns, axis=1)

# Sort by team name, stylistic choice
df = df.sort_values('Team')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Team_Standings.csv', index=False)

df

Unnamed: 0,Team,OverallW,OverallL,HomeW,HomeL,RoadW,RoadL,EastW,EastL,WestW,WestL,AtlanticW,AtlanticL,CentralW,CentralL,MW,ML,PacificW,PacificL,PreAllStarW,PreAllStarL,PostAllStarW,PostAllStarL,≤3MarginW,≤3MarginL,≥10MarginW,≥10MarginL,OctW,OctL,NovW,NovL,DecW,DecL,JanW,JanL,FebW,FebL,MarW,MarL,AprW,AprL
18,Atlanta Hawks,31,51,19,22,12,29,18,22,13,29,9,11,9,11,7,16,6,13,22,34,9,17,11,8,9,24,2,4,5,9,5,12,6,7,7,5,5,10,1,4
7,Boston Celtics,44,38,28,13,16,25,22,18,22,20,9,7,13,11,14,9,8,11,26,27,18,11,9,4,13,22,4,0,6,8,7,7,6,11,8,4,9,6,4,2
19,Buffalo Braves,30,52,23,18,7,34,12,28,18,24,6,10,6,18,10,13,8,11,19,33,11,19,6,8,10,24,2,3,7,9,5,8,3,10,6,7,5,11,2,4
8,Chicago Bulls,44,38,31,10,13,28,25,17,19,21,11,8,14,9,10,10,9,11,23,32,21,6,7,6,19,17,2,3,0,10,9,5,9,10,7,6,12,3,5,1
11,Cleveland Cavaliers,43,39,29,12,14,27,19,21,24,18,11,9,8,12,13,10,11,8,28,24,15,15,5,6,17,16,5,0,10,4,5,9,6,7,5,7,9,8,3,4
1,Denver Nuggets,50,32,36,5,14,27,22,20,28,12,9,11,13,9,15,5,13,7,34,19,16,13,3,5,34,12,4,0,9,5,10,5,9,5,8,5,7,9,3,3
9,Detroit Pistons,44,38,30,11,14,27,23,19,21,19,9,10,14,9,12,8,9,11,32,23,12,15,11,6,16,19,2,4,9,6,9,5,8,6,9,4,5,9,2,4
6,Golden State Warriors,46,36,29,12,17,24,27,15,19,21,13,6,14,9,11,13,8,8,30,24,16,12,5,8,21,15,2,2,7,8,7,5,10,6,8,7,9,6,3,2
3,Houston Rockets,49,33,34,7,15,26,23,17,26,16,10,10,13,7,14,9,12,7,29,23,20,10,9,1,23,19,3,1,8,4,8,6,7,9,7,6,13,5,3,2
15,Indiana Pacers,36,46,25,16,11,30,22,20,14,26,13,6,9,14,9,11,5,15,25,30,11,16,6,5,16,19,3,3,6,8,7,9,6,7,5,7,5,11,4,1
