In [1]:
# Season label interpolated into every raw/clean CSV path read and written by the cells below.
SEASON = '2019-20'

In [2]:
# PER GAME CLEANING
#
# Reads the raw per-game player CSV for SEASON, cleans the [Player], [Pos] and [Tm]
# columns, keeps a single row per player, and writes the result to the clean/ folder.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_PerGame.csv')

# Add column [PlayerID]
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)

# [Rk] drop -- unnecessary
# axis is passed by keyword (as for [Tm] below): pandas 2.0+ no longer accepts it positionally
df = df.drop('Rk', axis=1)

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them.
# Rows are keyed by [PlayerID] rather than by display name so that two different players
# who share a name are not merged into one; the name is only used as a fallback key for
# rows where no ID was extracted.
df['_PlayerKey'] = df['PlayerID'].fillna(df['Player'])
df['Tms'] = df.groupby('_PlayerKey')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('_PlayerKey', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(['Tm', '_PlayerKey'], axis=1) # [Tm] and the helper key are no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_PerGame.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
232,Aaron Gordon,gordoaa01,ORL,PF,24,62,62,32.5,5.4,12.4,...,0.674,1.7,5.9,7.7,3.7,0.8,0.6,1.6,2.0,14.4
290,Aaron Holiday,holidaa01,IND,PG,23,66,33,24.5,3.5,8.5,...,0.851,0.3,2.0,2.4,3.4,0.8,0.2,1.3,1.8,9.5
448,Abdel Nader,naderab01,OKC,SF,26,55,6,15.8,2.2,4.8,...,0.773,0.3,1.6,1.8,0.7,0.4,0.4,0.8,1.4,6.3
427,Adam Mokoka,mokokad01,CHI,SF,21,11,0,10.2,1.1,2.5,...,0.500,0.6,0.3,0.9,0.4,0.4,0.0,0.2,1.5,2.9
539,Admiral Schofield,schofad01,WAS,PF,22,33,2,11.2,1.1,2.8,...,0.667,0.2,1.2,1.4,0.5,0.2,0.1,0.2,1.5,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363,Zach LaVine,lavinza01,CHI,SF,24,60,60,34.8,9.0,20.0,...,0.802,0.7,4.1,4.8,4.2,1.5,0.5,3.4,2.2,25.5
458,Zach Norvell,norveza01,"LAL,GSW",SG,22,5,0,8.2,0.6,2.4,...,1.000,0.0,1.2,1.2,0.6,0.4,0.0,0.6,0.8,2.0
556,Zhaire Smith,smithzh01,PHI,SF,20,7,0,4.6,0.4,1.6,...,0.500,0.0,0.3,0.3,0.3,0.4,0.0,0.3,0.6,1.1
639,Zion Williamson,willizi01,NOP,PF,19,24,24,27.8,8.8,15.0,...,0.640,2.7,3.6,6.3,2.1,0.7,0.4,2.5,1.8,22.5


In [3]:
# ADVANCED PLAYER CLEANING
#
# Reads the raw advanced-stats player CSV for SEASON, cleans the [Player], [Pos] and [Tm]
# columns, keeps a single row per player, drops unnamed columns, and writes the result
# to the clean/ folder.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_Advanced.csv')

# Add column [PlayerID]
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)

# [Rk] drop -- unnecessary
# axis is passed by keyword (as for [Tm] below): pandas 2.0+ no longer accepts it positionally
df = df.drop('Rk', axis=1)

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them.
# Rows are keyed by [PlayerID] rather than by display name so that two different players
# who share a name are not merged into one; the name is only used as a fallback key for
# rows where no ID was extracted.
df['_PlayerKey'] = df['PlayerID'].fillna(df['Player'])
df['Tms'] = df.groupby('_PlayerKey')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('_PlayerKey', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(['Tm', '_PlayerKey'], axis=1) # [Tm] and the helper key are no longer needed

# remove unnamed columns
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_Advanced.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,MP,PER,TS%,3PAr,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
232,Aaron Gordon,gordoaa01,ORL,PF,24,62,2017,15.1,0.516,0.309,...,10.4,20.7,1.4,2.3,3.7,0.087,-0.1,0.1,0.0,1.0
290,Aaron Holiday,holidaa01,IND,PG,23,66,1617,11.6,0.521,0.393,...,12.8,18.7,0.4,1.7,2.1,0.063,-1.6,0.0,-1.6,0.2
448,Abdel Nader,naderab01,OKC,SF,26,55,867,11.3,0.591,0.487,...,12.8,17.0,0.5,0.9,1.3,0.074,-1.2,0.2,-1.0,0.2
427,Adam Mokoka,mokokad01,CHI,SF,21,11,112,8.2,0.538,0.536,...,6.3,12.1,0.1,0.1,0.2,0.073,-2.9,-1.2,-4.1,-0.1
539,Admiral Schofield,schofad01,WAS,PF,22,33,368,6.1,0.502,0.663,...,6.6,11.9,0.1,0.1,0.1,0.017,-4.3,-1.2,-5.5,-0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363,Zach LaVine,lavinza01,CHI,SF,24,60,2085,19.4,0.568,0.404,...,13.3,31.7,1.8,2.2,4.0,0.091,3.3,-0.8,2.5,2.4
458,Zach Norvell,norveza01,"LAL,GSW",SG,22,5,41,2.6,0.402,0.667,...,19.4,16.1,-0.1,0.0,-0.1,-0.084,-7.1,-1.2,-8.4,-0.1
556,Zhaire Smith,smithzh01,PHI,SF,20,7,32,1.1,0.313,0.273,...,13.6,19.9,-0.1,0.1,-0.1,-0.120,-10.2,0.4,-9.8,-0.1
639,Zion Williamson,willizi01,NOP,PF,19,24,668,24.1,0.616,0.039,...,11.9,30.5,1.5,0.4,2.0,0.141,3.5,-1.4,2.1,0.7


In [4]:
# TEAM STANDINGS CLEANING
#
# Reads the raw team standings CSV for SEASON, gives the abbreviated columns
# descriptive names, splits every W-L column into separate W and L columns,
# and writes the result to the clean/ folder.

import pandas as pd
import myconstants as c

df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Team_Standings.csv')

# show every column when a frame is displayed (this is a global pandas display option)
pd.set_option('display.max_columns', None)

# [Rk] drop -- unnecessary
# axis is passed by keyword: pandas 2.0+ no longer accepts it positionally
df = df.drop('Rk', axis=1)

# Conference columns renaming
df = df.rename(columns={'E': 'East','W': 'West'})

# Division columns renaming
df = df.rename(columns={'A': 'Atlantic', 'C': 'Central',
    'SE': 'Southeast', 'NW': 'Northwest',
    'P': 'Pacific', 'SW': 'Southwest'})

# All-Star columns renaming
df = df.rename(columns={'Pre': 'PreAllStar', 'Post': 'PostAllStar'})

# Margin columns renaming
df = df.rename(columns={'≤3': '≤3Margin', '≥10': '≥10Margin'})

# Month columns renaming
# None needed

# Separate W-L columns into 2 columns: W and L
# Every column except [Team] is split on its first '-' into <name>W and <name>L,
# then the original combined columns are dropped.
WinLossColumns = df.columns.drop('Team')
for col in WinLossColumns:
    df[[f'{col}W', f'{col}L']] = df[col].str.split('-', n=1, expand=True)
df = df.drop(WinLossColumns, axis=1)

# Sort by team name, stylistic choice
df = df.sort_values('Team')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Team_Standings.csv', index=False)

df

Unnamed: 0,Team,OverallW,OverallL,HomeW,HomeL,RoadW,RoadL,EastW,EastL,WestW,WestL,AtlanticW,AtlanticL,CentralW,CentralL,SoutheastW,SoutheastL,NorthwestW,NorthwestL,PacificW,PacificL,SouthwestW,SouthwestL,PreAllStarW,PreAllStarL,PostAllStarW,PostAllStarL,≤3MarginW,≤3MarginL,≥10MarginW,≥10MarginL,OctW,OctL,NovW,NovL,DecW,DecL,JanW,JanL,FebW,FebL,MarW,MarL,JulW,JulL,AugW,AugL
25,Atlanta Hawks,20,47,14,20,6,27,11,32,9,15,3,13,2,12,6,7,3,5,3,5,3,5,15,41,5,6,1,5,8,29,2,3,2,13,3,11,6,9,6,7,1,4,,,,
4,Boston Celtics,48,24,26,10,22,14,30,13,18,11,9,6,9,4,12,3,6,3,6,4,6,4,38,16,10,8,6,5,29,7,3,1,10,4,10,3,9,7,9,3,2,3,0.0,1.0,5.0,2.0
13,Brooklyn Nets,35,37,20,16,15,21,23,23,12,14,6,10,8,4,9,9,2,7,6,2,4,5,25,28,10,9,7,7,17,19,1,3,9,6,6,7,5,10,5,7,4,1,0.0,1.0,5.0,2.0
22,Charlotte Hornets,23,42,10,21,13,21,16,24,7,18,5,9,9,8,2,7,1,7,4,4,2,7,18,36,5,6,9,8,5,25,2,3,6,10,5,10,3,9,5,6,2,4,,,,
23,Chicago Bulls,22,43,14,20,8,23,15,28,7,15,1,12,7,9,7,7,1,6,2,5,4,4,19,36,3,7,6,6,7,19,1,4,5,10,7,7,6,11,1,8,2,3,,,,
27,Cleveland Cavaliers,19,46,11,25,8,21,12,32,7,14,2,14,4,10,6,8,4,4,0,4,3,6,14,40,5,6,5,5,6,28,2,2,3,12,5,9,3,13,4,7,2,3,,,,
11,Dallas Mavericks,43,32,20,18,23,14,16,12,27,20,4,5,7,2,5,5,9,6,8,10,10,4,33,22,10,10,2,11,23,9,3,1,9,5,9,6,8,7,7,5,4,3,0.0,1.0,3.0,4.0
5,Denver Nuggets,46,27,26,11,20,16,17,11,29,16,6,4,4,4,7,3,12,2,9,7,8,7,38,17,8,10,9,5,16,13,3,2,10,2,10,6,11,5,6,4,3,3,,,3.0,5.0
26,Detroit Pistons,20,46,11,22,9,24,12,31,8,15,3,11,5,10,4,10,1,7,4,3,3,5,19,38,1,8,4,6,9,26,2,3,4,10,6,9,5,11,3,8,0,5,,,,
29,Golden State Warriors,15,50,8,26,7,24,6,16,9,34,1,6,3,3,2,7,3,12,2,11,4,11,12,43,3,7,1,3,8,30,1,3,3,13,5,10,1,13,3,8,2,3,,,,
