In [46]:
# Season label to process; it is interpolated into the raw and clean CSV paths
# (`../data/years/{SEASON}/...`) used by the cleaning cells in this notebook.
SEASON = '1992-93'

In [47]:
# collapse alternate team codes onto one canonical code per team
def use_unique_team_code(tm):
    """Return the canonical team code for ``tm``.

    The code 'CHH' is mapped to 'CHO'; any other value is handed back unchanged.
    """
    return 'CHO' if tm == 'CHH' else tm

In [48]:
# PER GAME CLEANING
# Reads the raw per-game player CSV for SEASON, cleans the [Player], [Pos] and [Tm]
# columns, keeps one row per player name, and writes the result to the clean folder.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_PerGame.csv')

# Add column [PlayerID] -- must be extracted before [Player] is stripped below
# (the pattern lives in myconstants; presumably it matches the ID after the backslash)
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
# Use the `columns=` keyword: passing the axis positionally (`df.drop('Rk', 1)`) was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
# NOTE(review): grouping and de-duplicating on [Player] treats two different players who
# share a display name as one player; [PlayerID] may be the safer key -- confirm.
df['Tm'] = df['Tm'].apply(use_unique_team_code) # some teams have multiple team codes
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(columns='Tm') # [Tm] no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_PerGame.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
152,A.C. Green,greenac01,LAL,PF,29,82,55,34.4,4.6,8.6,0.537,0.2,0.6,0.348,4.4,8.0,0.550,0.548,3.4,4.6,0.739,3.5,5.2,8.7,1.4,1.1,0.5,1.4,1.8,12.8
200,Adam Keefe,keefead01,ATL,PF,22,82,6,18.9,2.3,4.6,0.500,0.0,0.0,0.000,2.3,4.6,0.501,0.500,2.0,2.9,0.700,2.1,3.2,5.3,1.0,0.7,0.2,1.2,2.4,6.6
0,Alaa Abdelnaby,abdelal01,"MIL,BOS",PF,24,75,52,17.5,3.3,6.3,0.518,0.0,0.0,0.000,3.3,6.3,0.519,0.518,1.2,1.5,0.759,1.7,2.8,4.5,0.4,0.3,0.3,1.3,2.5,7.7
284,Alan Ogg,oggal01,"MIL,WSB",C,25,6,0,4.8,0.8,2.2,0.385,0.0,0.0,,0.8,2.2,0.385,0.385,0.5,0.7,0.750,0.5,1.2,1.7,0.7,0.2,0.5,0.5,1.0,2.2
207,Alec Kessler,kesslal01,MIA,PF,26,40,2,10.4,1.4,3.1,0.467,0.1,0.3,0.455,1.3,2.8,0.468,0.488,0.9,1.2,0.766,0.6,1.7,2.3,0.4,0.1,0.3,0.5,1.6,3.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16,Willie Anderson,anderwi01,SAS,SG,26,38,7,14.7,2.1,4.9,0.430,0.0,0.2,0.125,2.1,4.7,0.444,0.433,0.6,0.7,0.786,0.2,1.3,1.5,2.1,0.4,0.2,1.2,1.4,4.8
63,Willie Burton,burtowi01,MIA,SF,24,26,8,17.3,2.1,5.4,0.383,0.2,0.6,0.333,1.9,4.8,0.389,0.401,3.5,4.9,0.717,0.8,1.8,2.7,0.6,0.5,0.6,1.9,2.2,7.8
135,Winston Garland,garlawi01,HOU,PG,28,66,4,15.2,2.3,5.2,0.443,0.1,0.2,0.462,2.2,5.0,0.442,0.452,1.2,1.3,0.910,0.5,1.2,1.6,2.1,0.6,0.1,1.0,1.8,5.9
254,Xavier McDaniel,mcdanxa01,BOS,PF,29,82,27,27.0,5.6,11.3,0.495,0.1,0.3,0.273,5.5,11.0,0.500,0.498,2.3,2.9,0.793,2.0,3.9,6.0,2.0,0.9,0.6,2.1,3.0,13.5


In [49]:
# ADVANCED PLAYER CLEANING
# Reads the raw advanced-stats player CSV for SEASON, cleans the [Player], [Pos] and [Tm]
# columns, keeps one row per player name, drops unnamed columns, and writes the result
# to the clean folder.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_Advanced.csv')

# Add column [PlayerID] -- must be extracted before [Player] is stripped below
# (the pattern lives in myconstants; presumably it matches the ID after the backslash)
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
# Use the `columns=` keyword: passing the axis positionally (`df.drop('Rk', 1)`) was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
# NOTE(review): grouping and de-duplicating on [Player] treats two different players who
# share a display name as one player; [PlayerID] may be the safer key -- confirm.
df['Tm'] = df['Tm'].apply(use_unique_team_code) # some teams have multiple team codes
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(columns='Tm') # [Tm] no longer needed

# remove unnamed columns (any column whose header starts with "Unnamed")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_Advanced.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
152,A.C. Green,greenac01,LAL,PF,29,82,2819,16.3,0.603,0.065,0.531,11.5,17.3,14.4,5.9,1.6,0.9,11.8,15.0,6.1,2.6,8.6,0.147,1.6,-0.9,0.7,1.9
200,Adam Keefe,keefead01,ATL,PF,22,82,1549,13.6,0.564,0.003,0.630,11.8,19.5,15.5,7.0,1.8,0.7,17.2,15.5,1.9,1.4,3.3,0.102,-1.6,-0.6,-2.2,-0.1
0,Alaa Abdelnaby,abdelal01,"MIL,BOS",PF,24,75,1311,13.2,0.551,0.002,0.245,11.0,18.1,14.6,3.0,1.0,1.2,15.6,20.5,0.7,1.3,2.0,0.074,-2.4,-1.5,-3.9,-0.6
284,Alan Ogg,oggal01,"MIL,WSB",C,25,6,29,15.7,0.440,0.000,0.308,11.6,28.0,19.7,19.4,1.7,6.7,16.9,26.2,0.0,0.0,0.0,0.020,-3.9,2.1,-1.8,0.0
207,Alec Kessler,kesslal01,MIA,PF,26,40,415,11.5,0.543,0.090,0.385,6.8,18.6,12.6,5.2,0.5,1.9,12.8,17.1,0.4,0.4,0.8,0.088,-2.3,-0.3,-2.6,-0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16,Willie Anderson,anderwi01,SAS,SG,26,38,560,8.7,0.461,0.043,0.151,1.5,9.8,5.8,20.4,1.3,0.7,18.2,19.0,-0.6,0.5,-0.1,-0.007,-3.9,-0.9,-4.7,-0.4
63,Willie Burton,burtowi01,MIA,SF,24,26,451,10.3,0.518,0.106,0.901,5.5,12.5,8.9,5.4,1.5,2.4,20.3,23.7,-0.3,0.4,0.1,0.012,-4.0,-0.9,-4.9,-0.3
135,Winston Garland,garlawi01,HOU,PG,28,66,1004,12.2,0.512,0.038,0.259,3.8,8.1,6.1,20.3,2.0,0.2,14.9,19.8,0.7,1.1,1.8,0.084,-2.4,0.0,-2.5,-0.1
254,Xavier McDaniel,mcdanxa01,BOS,PF,29,82,2215,16.3,0.539,0.024,0.261,8.7,16.2,12.5,11.0,1.6,1.4,14.2,23.6,1.9,2.5,4.4,0.096,-0.1,-0.1,-0.1,1.1


In [50]:
# TEAM STANDINGS CLEANING
# Reads the raw team standings CSV for SEASON, gives the split columns descriptive names,
# breaks every "W-L" record column into separate W and L columns, and writes the result
# to the clean folder.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Team_Standings.csv')

# show every column when the frame is displayed (this is a global pandas display setting)
pd.set_option('display.max_columns', None)

# [Rk] drop -- unnecessary
# Use the `columns=` keyword: passing the axis positionally (`df.drop('Rk', 1)`) was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# Conference columns renaming
df = df.rename(columns={'E': 'East','W': 'West'})

# Division columns renaming
# NOTE(review): there is no entry for 'M' here, so a column named 'M' (if present) keeps
# its name and produces 'MW'/'ML' below -- confirm whether 'Midwest' was intended.
df = df.rename(columns={'A': 'Atlantic', 'C': 'Central',
    'SE': 'Southeast', 'NW': 'Northwest',
    'P': 'Pacific', 'SW': 'Southwest'})

# All-Star columns renaming
df = df.rename(columns={'Pre': 'PreAllStar', 'Post': 'PostAllStar'})

# Margin columns renaming
df = df.rename(columns={'≤3': '≤3Margin', '≥10': '≥10Margin'})

# Month columns renaming
# None needed

# Separate W-L columns into 2 columns: W and L
# Every column except [Team] is treated as a "W-L" string; the column list is captured
# before the loop so the newly added <col>W / <col>L columns are not revisited.
WinLossColumns = df.columns.drop('Team')
for col in WinLossColumns:
    df[[f'{col}W', f'{col}L']] = df[col].str.split('-', n=1, expand=True)
df = df.drop(columns=WinLossColumns) # combined "W-L" columns no longer needed

# Sort by team name, stylistic choice
df = df.sort_values('Team')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Team_Standings.csv', index=False)

df

Unnamed: 0,Team,OverallW,OverallL,HomeW,HomeL,RoadW,RoadL,EastW,EastL,WestW,WestL,AtlanticW,AtlanticL,CentralW,CentralL,MW,ML,PacificW,PacificL,PreAllStarW,PreAllStarL,PostAllStarW,PostAllStarL,≤3MarginW,≤3MarginL,≥10MarginW,≥10MarginL,NovW,NovL,DecW,DecL,JanW,JanL,FebW,FebL,MarW,MarL,AprW,AprL
11,Atlanta Hawks,43,39,25,16,18,23,26,30,17,9,14,14,12,16,10,2,7,7,24,27,19,12,2,6,21,26,6,6,6,9,9,6,5,8,12,3,5,7
8,Boston Celtics,48,34,28,13,20,21,33,23,15,11,19,9,14,14,8,4,7,7,26,24,22,10,6,2,21,18,5,8,7,9,10,4,7,4,13,3,6,6
10,Charlotte Hornets,44,38,22,19,22,19,29,27,15,11,17,11,12,16,7,5,8,6,26,23,18,15,10,5,14,19,7,6,7,7,6,7,8,5,7,10,9,3
2,Chicago Bulls,57,25,31,10,26,15,39,17,18,8,20,8,19,9,7,5,11,3,35,17,22,8,6,7,31,5,9,3,12,4,7,8,10,2,10,4,9,4
5,Cleveland Cavaliers,54,28,35,6,19,22,37,19,17,9,15,13,22,6,8,4,9,5,34,19,20,9,4,7,34,5,6,7,11,5,8,6,12,1,6,6,11,3
26,Dallas Mavericks,11,71,7,34,4,37,6,22,5,49,3,11,3,11,3,23,2,26,4,45,7,26,5,1,2,58,1,9,1,13,1,14,1,12,3,14,4,9
18,Denver Nuggets,36,46,28,13,8,33,11,17,25,29,5,9,6,8,13,13,12,16,20,30,16,16,5,6,23,23,5,7,2,12,8,7,7,6,7,9,7,5
16,Detroit Pistons,40,42,28,13,12,29,31,25,9,17,19,9,12,16,4,8,5,9,21,29,19,13,9,8,13,18,3,9,11,3,5,11,4,7,9,7,8,5
20,Golden State Warriors,34,48,19,22,15,26,9,19,25,29,6,8,3,11,16,8,9,21,23,30,11,18,6,10,19,25,5,8,10,5,5,9,4,10,4,9,6,7
3,Houston Rockets,55,27,31,10,24,17,19,9,36,18,9,5,10,4,19,7,17,11,30,21,25,6,6,4,30,13,7,4,7,7,10,7,9,3,11,4,11,2
