In [1]:
# Notebook parameters: both values are interpolated into the raw-input and clean-output
# CSV paths used by the cleaning cells below, so changing them retargets the whole notebook.
SEASON = '1975-76'  # season label; used as the directory name and as the file-name prefix
LEAGUE = 'ABA'  # league label; second component of every CSV file name

In [2]:
# Collapse the alternate codes of a franchise onto one canonical team code.
def use_unique_team_code(tm):
    """Return the canonical team code for ``tm``.

    Codes listed in the alias table below are translated to their canonical
    counterpart; every other value is handed back unchanged.
    """
    aliases = (
        ('CHH', 'CHO'),  # Charlotte Hornets
        ('DNA', 'DEN'),  # Denver Nuggets
        ('INA', 'IND'),  # Indiana Pacers
        ('NYA', 'NYN'),  # New York Nets
        ('SAA', 'SAS'),  # San Antonio Spurs
    )
    # Plain equality scan (rather than a hash lookup) so that any value type is accepted.
    for alternate, canonical in aliases:
        if tm == alternate:
            return canonical
    return tm

In [3]:
# PER GAME CLEANING
# Loads the raw per-game player CSV for SEASON/LEAGUE, tidies the player, position and team
# columns, collapses every player to a single row, and writes the result to the clean folder.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_{LEAGUE}_Player_PerGame.csv')

# Add column [PlayerID]
# (captures whatever c.EVERYTHING_AFTER_BACKSLASH matches inside [Player]; the pattern is defined in myconstants)
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
# axis is passed by keyword: the positional form drop('Rk', 1) was deprecated in pandas 1.3
# and raises TypeError from pandas 2.0 onwards.
df = df.drop('Rk', axis=1)

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
# NOTE(review): grouping and de-duplicating are keyed on the cleaned [Player] name, so two
# different players sharing a name in one season would be merged into one row -- keying on
# [PlayerID] may be safer; confirm against the data.
df['Tm'] = df['Tm'].apply(use_unique_team_code) # some teams have multiple team codes
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
# (presumably c.ALPHABETICALLY_LAST_STRING sorts after every real team code -- verify in myconstants)
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop('Tm', axis=1) # [Tm] no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_{LEAGUE}_Player_PerGame.csv', index=False)

# last expression of the cell: display the cleaned frame
df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
130,Al Skinner,skinnal01,NYN,SG,23,83,,25.1,4.0,8.5,...,0.842,1.2,2.5,3.7,3.4,1.1,0.6,2.0,3.0,10.4
131,Al Smith,smithal01,UTS,PG,29,15,,26.1,2.8,7.0,...,0.814,0.9,1.6,2.5,4.9,0.7,0.1,1.9,3.0,9.2
13,Allan Bristow,bristal01,SAS,SF,24,47,,18.8,2.7,5.8,...,0.848,1.4,2.3,3.7,2.6,0.5,0.0,1.3,1.7,7.0
98,Allen Murphy,murphal01,KEN,SG,23,29,,8.6,1.5,3.9,...,0.730,0.8,0.8,1.6,0.4,0.3,0.3,0.9,1.8,3.9
47,Artis Gilmore,gilmoar01,KEN,C,26,84,,39.1,9.2,16.7,...,0.682,4.8,10.7,15.5,2.5,0.7,2.4,3.5,4.1,24.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,Tom Owens,owensto01,"KEN,IND,SAS",C,26,74,,15.0,2.4,5.0,...,0.713,1.6,2.7,4.3,0.9,0.1,0.6,1.0,2.7,6.1
49,Travis Grant,granttr01,"KEN,IND",SF,26,56,,14.8,3.5,7.1,...,0.754,1.1,1.4,2.5,0.8,0.3,0.3,1.0,1.8,8.0
73,Wil Jones,joneswi02,KEN,PF,28,83,,31.7,5.8,12.2,...,0.775,2.9,4.6,7.5,2.5,1.0,0.7,2.4,3.9,13.6
40,William Franklin,frankwi01,SAS,PF,26,10,,9.5,1.2,2.2,...,0.563,1.2,1.7,2.9,0.5,0.3,0.3,0.5,1.6,3.3


In [4]:
# ADVANCED PLAYER CLEANING
# Loads the raw advanced-stats player CSV for SEASON/LEAGUE, tidies the player, position and
# team columns, collapses every player to a single row, drops the "Unnamed" columns, and
# writes the result to the clean folder.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_{LEAGUE}_Player_Advanced.csv')

# Add column [PlayerID]
# (captures whatever c.EVERYTHING_AFTER_BACKSLASH matches inside [Player]; the pattern is defined in myconstants)
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID + star (for HOF)
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)
df['Player'] = df['Player'].replace(c.STAR_AT_END, '', regex=True)

# [Rk] drop -- unnecessary
# axis is passed by keyword: the positional form drop('Rk', 1) was deprecated in pandas 1.3
# and raises TypeError from pandas 2.0 onwards.
df = df.drop('Rk', axis=1)

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
# NOTE(review): grouping and de-duplicating are keyed on the cleaned [Player] name, so two
# different players sharing a name in one season would be merged into one row -- keying on
# [PlayerID] may be safer; confirm against the data.
df['Tm'] = df['Tm'].apply(use_unique_team_code) # some teams have multiple team codes
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
# (presumably c.ALPHABETICALLY_LAST_STRING sorts after every real team code -- verify in myconstants)
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop('Tm', axis=1) # [Tm] no longer needed

# remove unnamed columns (any column whose label starts with "Unnamed")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_{LEAGUE}_Player_Advanced.csv', index=False)

# last expression of the cell: display the cleaned frame
df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,MP,PER,TS%,3PAr,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
130,Al Skinner,skinnal01,NYN,SG,23,83,2082,14.3,0.535,0.011,...,17.3,17.9,2.5,2.4,4.9,0.114,-0.3,0.9,0.6,1.4
131,Al Smith,smithal01,UTS,PG,29,15,392,12.1,0.527,0.162,...,18.1,16.0,0.6,0.0,0.6,0.071,0.4,-1.4,-1.0,0.1
13,Allan Bristow,bristal01,SAS,SF,24,47,882,13.5,0.527,0.004,...,16.8,16.3,1.2,0.7,1.9,0.104,0.4,-0.7,-0.3,0.4
98,Allen Murphy,murphal01,KEN,SG,23,29,248,8.4,0.434,0.009,...,17.2,24.2,-0.3,0.3,0.0,-0.003,-4.7,-0.5,-5.2,-0.2
47,Artis Gilmore,gilmoar01,KEN,C,26,84,3286,23.5,0.595,0.000,...,14.5,23.6,9.2,5.8,15.1,0.220,3.9,0.8,4.7,5.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,Tom Owens,owensto01,"KEN,IND,SAS",C,26,74,1107,12.8,0.526,0.000,...,14.8,17.2,1.3,1.2,2.5,0.107,-1.7,-0.4,-2.1,-0.1
49,Travis Grant,granttr01,"KEN,IND",SF,26,56,828,14.2,0.523,0.000,...,11.4,21.8,0.9,0.6,1.5,0.085,-0.2,-1.1,-1.3,0.1
73,Wil Jones,joneswi02,KEN,PF,28,83,2635,13.6,0.510,0.006,...,15.3,18.9,2.2,3.1,5.4,0.098,-0.3,0.2,-0.2,1.2
40,William Franklin,frankwi01,SAS,PF,26,10,95,14.0,0.568,0.000,...,14.7,13.8,0.2,0.1,0.3,0.145,-2.1,0.0,-2.0,0.0


In [5]:
# TEAM STANDINGS CLEANING
# Loads the raw team standings CSV for SEASON/LEAGUE, gives the abbreviated columns longer
# names, splits every "W-L" record column into separate W and L columns, and writes the
# result to the clean folder.

import pandas as pd
import myconstants as c  # not referenced in this cell

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_{LEAGUE}_Team_Standings.csv')

# show every column when the frame is displayed (this is a wide table)
pd.set_option('display.max_columns', None)

# [Rk] drop -- unnecessary
# axis is passed by keyword: the positional form drop('Rk', 1) was deprecated in pandas 1.3
# and raises TypeError from pandas 2.0 onwards.
df = df.drop('Rk', axis=1)

# The renames below use rename's default errors='ignore', so labels that are absent from
# this season's file are simply skipped.

# Conference columns renaming
df = df.rename(columns={'E': 'East','W': 'West'})

# Division columns renaming
df = df.rename(columns={'A': 'Atlantic', 'C': 'Central',
    'SE': 'Southeast', 'NW': 'Northwest',
    'P': 'Pacific', 'SW': 'Southwest'})

# All-Star columns renaming
df = df.rename(columns={'Pre': 'PreAllStar', 'Post': 'PostAllStar'})

# Margin columns renaming
df = df.rename(columns={'≤3': '≤3Margin', '≥10': '≥10Margin'})

# Month columns renaming
# None needed

# Separate W-L columns into 2 columns: W and L
# Every column except [Team] is treated as a "W-L" record; it is split on the first dash
# into <col>W and <col>L, after which the original combined columns are dropped.
WinLossColumns = df.columns.drop('Team')
for col in WinLossColumns:
    df[[f'{col}W', f'{col}L']] = df[col].str.split('-', n=1, expand=True)
df = df.drop(WinLossColumns, axis=1)

# Sort by team name, stylistic choice
df = df.sort_values('Team')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_{LEAGUE}_Team_Standings.csv', index=False)

# last expression of the cell: display the cleaned frame
df

Unnamed: 0,Team,OverallW,OverallL,HomeW,HomeL,RoadW,RoadL,NeutralW,NeutralL,PreAllStarW,PreAllStarL,PostAllStarW,PostAllStarL,≤3MarginW,≤3MarginL,≥10MarginW,≥10MarginL,OctW,OctL,NovW,NovL,DecW,DecL,JanW,JanL,FebW,FebL,MarW,MarL,AprW,AprL
0,Denver Nuggets,60,24,39,3,21,21,,,32,11,28.0,13.0,6,5,31,10,3,1,9,4,11.0,3.0,11.0,3.0,11.0,5.0,13.0,7.0,2.0,1.0
4,Indiana Pacers,39,45,23,19,16,26,,,25,21,14.0,24.0,6,8,19,20,4,1,7,5,8.0,7.0,7.0,9.0,5.0,12.0,8.0,7.0,0.0,4.0
3,Kentucky Colonels,46,38,32,10,14,28,,,26,20,20.0,18.0,9,9,19,17,2,1,9,5,5.0,9.0,11.0,6.0,7.0,9.0,10.0,6.0,2.0,2.0
1,New York Nets,55,29,35,6,19,23,1.0,0.0,29,15,26.0,14.0,8,2,27,15,3,2,8,3,8.0,5.0,10.0,7.0,11.0,6.0,11.0,6.0,4.0,0.0
2,San Antonio Spurs,50,34,30,12,20,20,0.0,2.0,24,19,26.0,15.0,3,8,31,13,3,2,7,5,9.0,4.0,7.0,8.0,10.0,5.0,11.0,9.0,3.0,1.0
8,San Diego Sails,3,8,1,2,2,6,,,3,8,,,2,2,1,3,1,4,2,4,,,,,,,,,,
5,Spirits of St. Louis,35,49,21,21,13,28,1.0,0.0,20,27,15.0,22.0,9,6,12,27,3,2,7,8,6.0,10.0,4.0,8.0,9.0,8.0,5.0,10.0,1.0,3.0
7,Utah Stars,4,12,4,5,0,7,,,4,12,,,1,2,2,3,1,2,3,10,,,,,,,,,,
6,Virginia Squires,15,68,13,29,2,39,,,7,37,8.0,31.0,2,4,4,38,0,5,3,11,2.0,11.0,3.0,12.0,4.0,12.0,1.0,14.0,2.0,3.0
