In [32]:
# Season to process; interpolated into the raw/clean data directory and CSV file names used by the cleaning cells.
SEASON = '1973-74'
# League code; interpolated into the raw/clean CSV file names together with SEASON.
LEAGUE = 'ABA'

In [33]:
# set single team name for teams that have multiple codes
# Alternate team codes mapped to the one code this notebook keeps for that team.
_TEAM_CODE_ALIASES = {
    'CHH': 'CHO',  # Charlotte Hornets
    'DNA': 'DEN',  # Denver Nuggets
    'INA': 'IND',  # Indiana Pacers
    'NYA': 'NYN',  # New York Nets
    'SAA': 'SAS',  # San Antonio Spurs
}


def use_unique_team_code(tm):
    """Return the single code used for a team that appears under several codes.

    Args:
        tm: A team code as found in the [Tm] column.

    Returns:
        The replacement code when `tm` is one of the known alternate codes,
        otherwise `tm` itself, unchanged.
    """
    return _TEAM_CODE_ALIASES.get(tm, tm)

In [34]:
# PER GAME CLEANING

import pandas as pd
import myconstants as c

# Load the raw per-game player table for the configured season and league.
df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_{LEAGUE}_Player_PerGame.csv')

# Add column [PlayerID] -- captured from [Player] using the pattern defined in myconstants
# (presumably the ID that follows the backslash; confirm against myconstants).
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)  # regex for one literal backslash
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
# Use the `columns=` keyword: the positional-axis form `drop('Rk', 1)` was deprecated in
# pandas 1.3 and raises TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
df['Tm'] = df['Tm'].apply(use_unique_team_code) # some teams have multiple team codes
# NOTE(review): grouping and de-duplicating are keyed on the cleaned display name, so two
# different players who share a name in one season would be merged into a single row.
# Keying on [PlayerID] would avoid that -- confirm every raw row yields an ID before switching.
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop('Tm', axis=1) # [Tm] no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_{LEAGUE}_Player_PerGame.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
147,Al Smith,smithal01,DNR,PG,27,76,,32.0,4.1,10.3,0.399,0.3,0.9,0.306,3.8,9.3,0.409,0.413,2.5,3.2,0.773,0.7,2.4,3.2,8.1,1.3,0.1,3.6,3.4,10.9
47,Artis Gilmore,gilmoar01,KEN,C,24,84,,41.7,7.4,15.0,0.493,0.0,0.0,0.000,7.4,15.0,0.494,0.493,3.9,5.8,0.667,5.7,12.6,18.3,3.9,0.7,3.4,3.8,3.6,18.7
120,Barry Parkhill,parkhba01,VIR,SG,22,60,,14.5,1.9,5.2,0.371,0.1,0.3,0.188,1.9,4.9,0.381,0.376,0.8,1.0,0.820,0.2,0.9,1.1,1.6,0.5,0.2,1.3,2.5,4.7
164,Bernie Williams,willibe01,VIR,SG,28,6,,8.5,1.0,3.2,0.316,0.2,0.3,0.500,0.8,2.8,0.294,0.342,0.3,0.3,1.000,0.0,0.7,0.7,1.2,0.2,0.0,0.8,0.5,2.5
75,Bill Keller,kellebi01,IND,PG,26,75,,19.0,3.7,8.2,0.454,0.7,1.7,0.382,3.1,6.5,0.473,0.494,1.4,1.6,0.870,0.6,1.1,1.7,2.3,0.5,0.0,1.3,1.1,9.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,Wilbert Robinson,robinwi01,MMT,SG,24,45,,21.2,3.7,8.9,0.413,0.0,0.1,0.000,3.7,8.8,0.419,0.413,1.3,1.5,0.851,0.6,1.1,1.8,2.9,1.1,0.2,2.0,2.8,8.6
91,Willie Long,longwi01,DNR,SF,23,82,,25.1,4.7,11.3,0.414,0.0,0.0,0.000,4.7,11.3,0.415,0.414,3.3,4.0,0.831,2.4,3.3,5.7,1.2,0.7,0.2,1.9,3.0,12.6
148,Willie Sojourner,sojouwi01,NYN,C,25,82,,16.0,2.5,5.1,0.482,0.0,0.0,0.000,2.5,5.1,0.486,0.482,0.7,0.8,0.844,1.3,2.7,4.1,0.7,0.3,1.1,1.0,2.5,5.6
169,Willie Wise,wisewi01,UTS,SF,26,82,,40.1,8.7,17.8,0.490,0.0,0.2,0.125,8.7,17.6,0.494,0.490,4.8,6.1,0.790,2.1,5.5,7.6,3.7,1.4,0.5,2.5,3.0,22.3


In [35]:
# ADVANCED PLAYER CLEANING

import pandas as pd
import myconstants as c

# Load the raw advanced-stats player table for the configured season and league.
df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_{LEAGUE}_Player_Advanced.csv')

# Add column [PlayerID] -- captured from [Player] using the pattern defined in myconstants
# (presumably the ID that follows the backslash; confirm against myconstants).
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)  # regex for one literal backslash
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
# Use the `columns=` keyword: the positional-axis form `drop('Rk', 1)` was deprecated in
# pandas 1.3 and raises TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
df['Tm'] = df['Tm'].apply(use_unique_team_code) # some teams have multiple team codes
# NOTE(review): grouping and de-duplicating are keyed on the cleaned display name, so two
# different players who share a name in one season would be merged into a single row.
# Keying on [PlayerID] would avoid that -- confirm every raw row yields an ID before switching.
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop('Tm', axis=1) # [Tm] no longer needed

# remove unnamed columns (any column whose name starts with "Unnamed")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_{LEAGUE}_Player_Advanced.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
147,Al Smith,smithal01,DNR,PG,27,76,2435,12.0,0.469,0.092,0.311,2.2,7.8,5.0,34.0,1.9,0.2,23.6,18.4,0.0,1.8,1.8,0.036,-1.1,-0.8,-1.9,0.1
47,Artis Gilmore,gilmoar01,KEN,C,24,84,3502,20.7,0.531,0.002,0.388,12.9,28.2,20.6,12.8,0.8,4.3,17.8,19.8,4.8,7.9,12.7,0.174,1.8,2.2,4.0,5.4
120,Barry Parkhill,parkhba01,VIR,SG,22,60,869,5.6,0.420,0.052,0.197,1.5,6.0,3.7,15.5,1.5,0.7,19.2,18.6,-1.1,0.3,-0.8,-0.046,-5.4,-0.9,-6.3,-1.0
164,Bernie Williams,willibe01,VIR,SG,28,6,51,4.8,0.377,0.105,0.105,0.0,7.8,3.9,18.9,0.9,0.0,20.1,18.9,-0.1,0.0,-0.1,-0.088,-4.0,-0.7,-4.7,-0.1
75,Bill Keller,kellebi01,IND,PG,26,75,1428,15.4,0.534,0.213,0.200,2.9,5.5,4.2,17.8,1.2,0.1,13.0,21.0,2.6,1.0,3.6,0.120,1.5,-0.9,0.6,0.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,Wilbert Robinson,robinwi01,MMT,SG,24,45,956,10.8,0.451,0.015,0.167,3.0,5.6,4.3,20.4,2.6,0.5,17.6,22.4,-0.5,0.3,-0.1,-0.006,-3.4,-0.6,-3.9,-0.5
91,Willie Long,longwi01,DNR,SF,23,82,2058,13.7,0.485,0.002,0.351,9.3,13.5,11.3,7.0,1.2,0.3,12.8,23.0,1.8,1.7,3.5,0.082,-0.9,-1.1,-2.0,0.0
148,Willie Sojourner,sojouwi01,NYN,C,25,82,1316,12.3,0.512,0.007,0.153,8.4,16.6,12.6,5.2,0.8,3.7,16.0,15.7,0.8,2.2,3.0,0.109,-2.4,1.2,-1.3,0.2
169,Willie Wise,wisewi01,UTS,SF,26,82,3292,20.1,0.544,0.011,0.344,5.6,13.7,9.8,13.8,1.7,0.7,10.7,23.9,8.0,2.5,10.5,0.153,2.8,-0.5,2.3,3.6


In [36]:
# TEAM STANDINGS CLEANING

import pandas as pd
import myconstants as c

# Load the raw team standings table for the configured season and league.
df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_{LEAGUE}_Team_Standings.csv')

# Show every column when a frame is displayed (this table is wide).
pd.set_option('display.max_columns', None)

# [Rk] drop -- unnecessary
# Use the `columns=` keyword: the positional-axis form `drop('Rk', 1)` was deprecated in
# pandas 1.3 and raises TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# Conference columns renaming
df = df.rename(columns={'E': 'East','W': 'West'})

# Division columns renaming
df = df.rename(columns={'A': 'Atlantic', 'C': 'Central',
    'SE': 'Southeast', 'NW': 'Northwest',
    'P': 'Pacific', 'SW': 'Southwest'})

# All-Star columns renaming
df = df.rename(columns={'Pre': 'PreAllStar', 'Post': 'PostAllStar'})

# Margin columns renaming
df = df.rename(columns={'≤3': '≤3Margin', '≥10': '≥10Margin'})

# Month columns renaming
# None needed

# Separate W-L columns into 2 columns: W and L
# Every column except [Team] is treated as a "W-L" string and split on the first dash.
WinLossColumns = df.columns.drop('Team')
for col in WinLossColumns:
    df[[f'{col}W', f'{col}L']] = df[col].str.split('-', n=1, expand=True)
# The combined "W-L" columns are no longer needed once split.
df = df.drop(WinLossColumns, axis=1)

# Sort by team name, stylistic choice
df = df.sort_values('Team')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_{LEAGUE}_Team_Standings.csv', index=False)

df

Unnamed: 0,Team,OverallW,OverallL,HomeW,HomeL,RoadW,RoadL,EastW,EastL,WestW,WestL,PreAllStarW,PreAllStarL,PostAllStarW,PostAllStarL,≤3MarginW,≤3MarginL,≥10MarginW,≥10MarginL,OctW,OctL,NovW,NovL,DecW,DecL,JanW,JanL,FebW,FebL,MarW,MarL
3,Carolina Cougars,47,37,29,13,18,24,29,15,18,22,33,23,14,14,8,11,24,16,8,4,11,4,6,7,8,8,9,7,5,7
6,Denver Rockets,37,47,24,18,13,29,17,23,20,24,23,26,14,21,5,8,20,19,5,4,7,7,5,7,6,8,8,10,6,11
4,Indiana Pacers,46,38,31,11,15,27,22,18,24,20,28,25,18,13,9,9,17,11,4,4,8,9,9,5,7,7,8,9,10,4
1,Kentucky Colonels,53,31,30,12,23,19,30,14,23,17,31,19,22,12,14,9,26,7,8,1,8,6,7,7,8,5,9,6,13,6
9,Memphis Tams,21,63,14,28,7,35,7,37,14,26,13,41,8,22,4,14,7,32,4,5,5,11,1,13,3,12,4,9,4,13
0,New York Nets,55,29,31,11,24,18,29,15,26,14,34,20,21,9,8,7,33,13,4,5,8,7,11,4,11,4,8,5,13,4
5,San Antonio Spurs,45,39,27,15,18,24,21,19,24,20,28,26,17,13,11,8,16,19,6,8,7,5,7,6,8,7,6,7,11,6
7,San Diego Conquistadors,37,47,20,22,17,25,23,17,14,30,22,35,15,12,9,10,15,21,3,6,6,9,9,8,4,12,8,3,7,9
2,Utah Stars,51,33,33,9,18,24,23,17,28,16,35,19,16,14,11,4,23,20,5,6,8,6,9,3,13,4,10,5,6,9
8,Virginia Squires,28,56,22,20,6,36,15,29,13,27,17,30,11,26,6,5,8,31,2,6,5,9,4,8,6,7,5,14,6,12
