In [22]:
# Season being cleaned; interpolated into the raw/clean data directory and file names below.
SEASON = '1974-75'
# League code; interpolated into the raw/clean CSV file names below.
LEAGUE = 'ABA'

In [23]:
# Alternate team codes mapped to the single code used for that team.
_TEAM_CODE_ALIASES = {
    'CHH': 'CHO',  # Charlotte Hornets
    'DNA': 'DEN',  # Denver Nuggets
    'INA': 'IND',  # Indiana Pacers
    'NYA': 'NYN',  # New York Nets
    'SAA': 'SAS',  # San Antonio Spurs
}


def use_unique_team_code(tm):
    """Return the single team code to use for a team that has multiple codes.

    Args:
        tm: a team code.

    Returns:
        The replacement code when `tm` is one of the known alternate codes,
        otherwise `tm` unchanged.
    """
    return _TEAM_CODE_ALIASES.get(tm, tm)

In [24]:
# PER GAME CLEANING
# Reads the raw per-game player CSV for SEASON/LEAGUE, cleans it, and writes the result
# to the season's clean/ directory.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_{LEAGUE}_Player_PerGame.csv')

# Add column [PlayerID] -- must run before [Player] is cleaned, because the ID is taken from [Player]
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
# axis is passed by keyword: the positional form drop('Rk', 1) raises TypeError on pandas >= 2.0
df = df.drop('Rk', axis=1)

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
df['Tm'] = df['Tm'].apply(use_unique_team_code) # some teams have multiple team codes
# Identify a player by [PlayerID] rather than by name: names are not guaranteed to be unique,
# and grouping/de-duplicating on the name would merge two different players who share one.
# Fall back to the name for any row where no ID was extracted.
df['_PlayerKey'] = df['PlayerID'].fillna(df['Player'])
df['Tms'] = df.groupby('_PlayerKey')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('_PlayerKey', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(['Tm', '_PlayerKey'], axis=1) # [Tm] and the helper key are no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_{LEAGUE}_Player_PerGame.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
122,Al Skinner,skinnal01,NYN,SG,22,51,,15.2,2.5,5.2,0.489,0.0,0.1,0.333,2.5,5.2,0.490,0.491,1.4,1.8,0.766,0.8,1.5,2.4,2.4,0.6,0.3,1.3,2.2,6.5
123,Al Smith,smithal01,UTS,PG,28,80,,25.5,2.8,7.3,0.387,0.4,1.2,0.362,2.4,6.1,0.391,0.416,2.0,2.4,0.813,0.5,1.4,1.8,4.7,0.7,0.0,2.5,2.9,8.0
136,Art Williams,williar01,SDA,PG,35,7,,12.7,1.1,1.7,0.667,0.0,0.0,,1.1,1.7,0.667,0.667,0.0,0.0,,0.4,1.3,1.7,2.9,1.0,0.0,1.4,2.1,2.3
41,Artis Gilmore,gilmoar01,KEN,C,25,84,,41.6,9.3,16.1,0.580,0.0,0.0,0.500,9.3,16.1,0.580,0.581,4.9,7.0,0.696,5.1,11.1,16.2,2.5,0.8,3.1,4.1,3.8,23.6
105,Aulcie Perry,perryau01,VIR,C,24,21,,19.8,3.9,8.9,0.435,0.0,0.0,0.000,3.9,8.8,0.438,0.435,0.9,1.4,0.633,1.9,3.1,5.0,1.0,0.6,0.8,1.6,2.8,8.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Wendell Ladner,ladnewe01,NYN,SF,26,25,,17.4,1.8,6.9,0.260,0.3,1.4,0.194,1.5,5.5,0.277,0.280,0.2,0.4,0.600,0.8,1.9,2.7,1.6,1.3,0.0,0.9,2.7,4.1
64,Wil Jones,joneswi02,KEN,SF,27,84,,32.0,5.5,11.3,0.483,0.0,0.1,0.000,5.5,11.2,0.486,0.483,1.7,2.3,0.735,2.4,4.9,7.2,3.0,1.3,0.8,2.4,4.2,12.6
35,William Franklin,frankwi01,SAS,PF,25,24,,7.5,1.3,3.5,0.376,0.0,0.0,0.000,1.3,3.5,0.381,0.376,0.6,1.0,0.652,1.6,1.8,3.4,0.4,0.1,0.1,0.8,1.5,3.3
124,Willie Sojourner,sojouwi01,NYN,C,26,79,,12.9,2.0,4.1,0.478,0.0,0.0,0.333,1.9,4.1,0.480,0.480,0.6,0.9,0.700,1.2,2.3,3.5,0.5,0.2,0.8,0.8,2.4,4.6


In [25]:
# ADVANCED PLAYER CLEANING
# Reads the raw advanced-stats player CSV for SEASON/LEAGUE, cleans it, and writes the result
# to the season's clean/ directory.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_{LEAGUE}_Player_Advanced.csv')

# Add column [PlayerID] -- must run before [Player] is cleaned, because the ID is taken from [Player]
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
# axis is passed by keyword: the positional form drop('Rk', 1) raises TypeError on pandas >= 2.0
df = df.drop('Rk', axis=1)

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
df['Tm'] = df['Tm'].apply(use_unique_team_code) # some teams have multiple team codes
# Identify a player by [PlayerID] rather than by name: names are not guaranteed to be unique,
# and grouping/de-duplicating on the name would merge two different players who share one.
# Fall back to the name for any row where no ID was extracted.
df['_PlayerKey'] = df['PlayerID'].fillna(df['Player'])
df['Tms'] = df.groupby('_PlayerKey')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('_PlayerKey', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(['Tm', '_PlayerKey'], axis=1) # [Tm] and the helper key are no longer needed

# remove unnamed columns
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_{LEAGUE}_Player_Advanced.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
122,Al Skinner,skinnal01,NYN,SG,22,51,773,14.7,0.542,0.011,0.353,5.6,10.0,7.9,20.0,1.7,0.9,18.1,19.4,0.9,1.1,2.1,0.130,-0.4,0.7,0.3,0.5
123,Al Smith,smithal01,UTS,PG,28,80,2037,9.2,0.481,0.162,0.332,2.1,5.3,3.8,25.5,1.4,0.1,23.2,17.7,-0.3,1.3,0.9,0.022,-2.5,-1.1,-3.6,-0.8
136,Art Williams,williar01,SDA,PG,35,7,89,10.3,0.667,0.000,0.000,3.2,9.9,6.5,26.3,3.6,0.0,45.5,9.5,0.0,0.1,0.1,0.043,-3.7,1.0,-2.7,0.0
41,Artis Gilmore,gilmoar01,KEN,C,25,84,3493,22.6,0.615,0.001,0.438,12.4,26.1,19.4,8.6,0.8,4.1,17.6,22.0,7.9,8.3,16.2,0.222,3.1,1.8,4.9,6.1
105,Aulcie Perry,perryau01,VIR,C,24,21,415,12.2,0.454,0.005,0.161,9.8,16.7,13.2,7.7,1.4,2.1,14.6,22.7,-0.3,0.4,0.1,0.008,-2.7,-0.6,-3.3,-0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Wendell Ladner,ladnewe01,NYN,SF,26,25,436,4.5,0.290,0.208,0.058,5.0,10.7,7.9,10.5,3.4,0.1,11.5,18.4,-1.2,0.8,-0.4,-0.039,-7.1,1.0,-6.1,-0.5
64,Wil Jones,joneswi02,KEN,SF,27,84,2689,13.6,0.512,0.005,0.199,7.5,14.9,11.3,12.8,1.9,1.4,16.2,18.0,1.8,5.1,6.9,0.124,-0.7,1.9,1.1,2.1
35,William Franklin,frankwi01,SAS,PF,25,24,179,12.8,0.415,0.012,0.271,21.9,23.7,22.8,7.4,0.8,0.6,15.9,24.7,-0.1,0.2,0.1,0.039,-3.4,-2.1,-5.6,-0.2
124,Willie Sojourner,sojouwi01,NYN,C,26,79,1020,11.9,0.507,0.009,0.216,9.5,17.6,13.7,5.2,0.7,3.5,15.1,16.4,0.5,1.8,2.4,0.111,-3.3,0.7,-2.6,-0.2


In [26]:
# TEAM STANDINGS CLEANING
# Reads the raw team standings CSV for SEASON/LEAGUE, gives the record columns descriptive names,
# splits each "W-L" record into separate W and L columns, and writes the result to the
# season's clean/ directory.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_{LEAGUE}_Team_Standings.csv')

# show every column when the frame is displayed (note: this changes the option for the whole session)
pd.set_option('display.max_columns', None)

# [Rk] drop -- unnecessary
# axis is passed by keyword: the positional form drop('Rk', 1) raises TypeError on pandas >= 2.0
df = df.drop('Rk', axis=1)

# Column renaming -- labels that are not present in this season's file are simply ignored by rename
df = df.rename(columns={
    # Conference columns
    'E': 'East', 'W': 'West',
    # Division columns
    'A': 'Atlantic', 'C': 'Central',
    'SE': 'Southeast', 'NW': 'Northwest',
    'P': 'Pacific', 'SW': 'Southwest',
    # All-Star columns
    'Pre': 'PreAllStar', 'Post': 'PostAllStar',
    # Margin columns
    '≤3': '≤3Margin', '≥10': '≥10Margin',
})

# Month columns renaming
# None needed

# Separate W-L columns into 2 columns: W and L
# every column except [Team] is treated as a "W-L" record string
win_loss_columns = df.columns.drop('Team')
for record_col in win_loss_columns:
    df[[f'{record_col}W', f'{record_col}L']] = df[record_col].str.split('-', n=1, expand=True)
df = df.drop(win_loss_columns, axis=1) # the combined "W-L" columns are no longer needed

# Sort by team name, stylistic choice
df = df.sort_values('Team')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_{LEAGUE}_Team_Standings.csv', index=False)

df

Unnamed: 0,Team,OverallW,OverallL,HomeW,HomeL,RoadW,RoadL,EastW,EastL,WestW,WestL,PreAllStarW,PreAllStarL,PostAllStarW,PostAllStarL,≤3MarginW,≤3MarginL,≥10MarginW,≥10MarginL,OctW,OctL,NovW,NovL,DecW,DecL,JanW,JanL,FebW,FebL,MarW,MarL,AprW,AprL
0,Denver Nuggets,65,19,40,2,25,17,31,9,34,10,40,10,25,9,10,1,42,12,5,1,15,3,11,1,10,6,8,5,14,3,2,0
4,Indiana Pacers,45,39,31,11,14,28,25,15,20,24,23,25,22,14,5,9,25,18,1,4,7,8,6,6,10,7,8,5,13,6,0,3
1,Kentucky Colonels,58,26,40,3,18,23,33,11,25,15,33,14,25,12,6,10,38,7,6,1,9,4,8,5,11,4,10,5,12,7,2,0
8,Memphis Sounds,27,57,17,25,10,32,16,28,11,29,14,35,13,22,5,7,7,31,2,5,4,12,3,9,5,10,6,9,6,10,1,2
2,New York Nets,58,26,35,7,23,19,37,7,21,19,34,13,24,13,4,5,36,8,6,3,9,6,10,2,10,2,10,5,11,8,2,0
3,San Antonio Spurs,51,33,32,9,19,24,25,15,26,18,30,24,21,9,6,6,34,17,5,2,9,6,7,10,11,6,7,3,10,6,2,0
7,San Diego Conquistadors,31,53,22,20,9,33,19,21,12,32,21,29,10,24,8,5,14,37,3,5,5,7,5,9,8,9,6,10,4,10,0,3
6,Spirits of St. Louis,32,52,23,19,9,33,17,27,15,25,20,30,12,22,4,2,13,31,3,6,6,10,5,8,6,8,4,11,6,8,2,1
5,Utah Stars,38,46,29,13,9,33,20,20,18,26,22,27,16,19,3,3,16,26,2,6,9,5,8,9,3,9,7,6,7,10,2,1
9,Virginia Squires,15,69,11,31,4,38,7,37,8,32,9,39,6,30,5,8,5,43,3,3,2,14,4,8,1,14,4,11,1,16,0,3
