In [9]:
# Season label; the cleaning cells interpolate it into the raw/clean data file paths.
SEASON = "2014-15"

In [10]:
# PER GAME CLEANING

import pandas as pd
import myconstants as c

# Load the raw per-game player table for the selected season.
df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_PerGame.csv')

# Add column [PlayerID] -- captured from the raw [Player] value before that column is cleaned.
# expand=False makes the single capture group come back as a Series rather than a 1-column frame.
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')', expand=False)

# [Player] cleaning -- remove backslash + player ID
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace(r'\\', '', regex=True)

# [Rk] drop -- unnecessary
# (use the `columns` keyword: passing the axis positionally was removed in pandas 2.0)
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
# NOTE(review): grouping and de-duplication key on the cleaned [Player] name, so two different
# players who share a name would be merged into one row -- consider keying on [PlayerID]; confirm
# against the raw data before changing.
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join)  # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(columns='Tm')  # [Tm] no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_PerGame.csv', index=False)

# Display the cleaned frame as the cell output.
df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
469,A.J. Price,priceaj01,"IND,CLE,PHO",PG,28,26,0,12.5,2.0,5.3,0.372,0.6,2.2,0.263,1.4,3.1,0.450,0.427,0.6,0.9,0.667,0.2,1.0,1.2,1.8,0.3,0.0,0.5,0.6,5.1
74,Aaron Brooks,brookaa01,CHI,PG,30,82,21,23.0,4.2,10.0,0.421,1.5,3.8,0.387,2.7,6.1,0.442,0.495,1.8,2.1,0.833,0.4,1.6,2.0,3.2,0.7,0.2,1.9,2.3,11.6
223,Aaron Gordon,gordoaa01,ORL,PF,19,47,8,17.0,2.0,4.4,0.447,0.3,1.0,0.271,1.7,3.4,0.500,0.478,0.9,1.3,0.721,1.0,2.6,3.6,0.7,0.4,0.5,0.8,1.8,5.2
448,Adreian Payne,paynead01,"ATL,MIN",PF,23,32,22,23.1,2.8,6.9,0.414,0.0,0.3,0.111,2.8,6.6,0.427,0.416,0.9,1.4,0.652,1.5,3.6,5.1,0.9,0.6,0.3,1.4,2.8,6.7
277,Al Horford,horfoal01,ATL,C,28,76,76,30.5,6.8,12.7,0.538,0.1,0.5,0.306,6.7,12.2,0.547,0.544,1.4,1.9,0.759,1.7,5.4,7.2,3.2,0.9,1.3,1.3,1.6,15.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,Zach LaVine,lavinza01,MIN,PG,19,77,40,24.7,3.7,8.8,0.422,0.7,2.2,0.341,3.0,6.6,0.449,0.465,1.9,2.3,0.842,0.4,2.4,2.8,3.6,0.7,0.1,2.5,2.1,10.1
486,Zach Randolph,randoza01,MEM,PF,33,71,71,32.5,6.4,13.1,0.487,0.1,0.3,0.350,6.3,12.8,0.490,0.491,3.2,4.2,0.765,3.2,7.4,10.5,2.2,1.0,0.2,2.2,2.5,16.1
440,Zaza Pachulia,pachuza01,MIL,C,30,73,45,23.7,3.3,7.2,0.454,0.0,0.0,0.000,3.3,7.2,0.456,0.454,1.7,2.2,0.788,2.7,4.2,6.8,2.4,1.1,0.3,1.8,2.3,8.3
167,Zoran Dragić,dragizo01,"PHO,MIA",SG,25,16,1,4.7,0.7,1.9,0.367,0.2,0.9,0.214,0.5,1.0,0.500,0.417,0.2,0.3,0.600,0.3,0.2,0.5,0.3,0.1,0.0,0.3,0.4,1.8


In [11]:
# ADVANCED PLAYER CLEANING

import pandas as pd
import myconstants as c

# Load the raw advanced-stats player table for the selected season.
df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Player_Advanced.csv')

# Add column [PlayerID] -- captured from the raw [Player] value before that column is cleaned.
# expand=False makes the single capture group come back as a Series rather than a 1-column frame.
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')', expand=False)

# [Player] cleaning -- remove backslash + player ID
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace(r'\\', '', regex=True)

# [Rk] drop -- unnecessary
# (use the `columns` keyword: passing the axis positionally was removed in pandas 2.0)
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
# NOTE(review): grouping and de-duplication key on the cleaned [Player] name, so two different
# players who share a name would be merged into one row -- consider keying on [PlayerID]; confirm
# against the raw data before changing.
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join)  # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and any commas from [Tms]
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(columns='Tm')  # [Tm] no longer needed

# remove unnamed columns (any column whose label starts with "Unnamed")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Player_Advanced.csv', index=False)

# Display the cleaned frame as the cell output.
df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
469,A.J. Price,priceaj01,"IND,CLE,PHO",PG,28,26,324,12.0,0.451,0.416,0.175,2.1,8.9,5.6,23.5,1.1,0.0,8.7,22.5,0.0,0.2,0.3,0.041,-0.3,-1.1,-1.4,0.0
74,Aaron Brooks,brookaa01,CHI,PG,30,82,1885,14.4,0.534,0.383,0.213,1.9,7.5,4.8,24.2,1.5,0.6,14.9,25.0,1.7,1.5,3.3,0.083,0.3,-1.1,-0.8,0.5
223,Aaron Gordon,gordoaa01,ORL,PF,19,47,797,11.4,0.517,0.231,0.293,6.4,17.8,12.0,6.3,1.3,2.2,13.9,15.5,0.3,0.7,1.0,0.060,-2.6,-0.1,-2.7,-0.2
448,Adreian Payne,paynead01,"ATL,MIN",PF,23,32,739,7.7,0.443,0.041,0.209,7.2,18.2,12.5,6.4,1.3,1.0,15.5,17.0,-0.7,0.2,-0.5,-0.033,-5.0,-2.0,-6.9,-0.9
277,Al Horford,horfoal01,ATL,C,28,76,2318,21.4,0.563,0.037,0.146,6.7,19.8,13.4,18.6,1.5,3.5,8.9,22.2,5.1,3.6,8.7,0.179,2.6,1.5,4.1,3.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,Zach LaVine,lavinza01,MIN,PG,19,77,1902,11.3,0.515,0.247,0.261,1.6,11.6,6.4,24.0,1.4,0.4,20.4,22.0,-0.7,0.0,-0.7,-0.018,-2.2,-2.5,-4.6,-1.3
486,Zach Randolph,randoza01,MEM,PF,33,71,2304,19.5,0.538,0.021,0.320,11.2,25.8,18.5,11.4,1.6,0.5,12.8,24.3,3.4,3.8,7.2,0.149,1.4,-0.1,1.3,1.9
440,Zaza Pachulia,pachuza01,MIL,C,30,73,1730,15.6,0.506,0.006,0.302,13.1,19.9,16.5,16.2,2.4,1.0,18.2,19.1,1.3,2.9,4.2,0.116,-0.4,0.8,0.4,1.0
167,Zoran Dragić,dragizo01,"PHO,MIA",SG,25,16,75,8.2,0.435,0.467,0.167,7.8,4.6,6.2,11.2,1.4,0.0,13.4,22.9,-0.1,0.0,-0.1,-0.042,-4.1,-3.6,-7.7,-0.1


In [12]:
# TEAM STANDINGS CLEANING

import pandas as pd
import myconstants as c

# Load the raw team standings table for the selected season.
df = pd.read_csv(f'../data/years/{SEASON}/raw/{SEASON}_Team_Standings.csv')

# Show every column when the frame is displayed (the cleaned table is wide).
pd.set_option('display.max_columns', None)

# [Rk] drop -- unnecessary
# (use the `columns` keyword: passing the axis positionally was removed in pandas 2.0)
df = df.drop(columns='Rk')

# Expand the abbreviated column headers. Labels missing from the frame are ignored by rename.
df = df.rename(columns={
    # Conference columns
    'E': 'East', 'W': 'West',
    # Division columns
    'A': 'Atlantic', 'C': 'Central',
    'SE': 'Southeast', 'NW': 'Northwest',
    'P': 'Pacific', 'SW': 'Southwest',
    # All-Star columns
    'Pre': 'PreAllStar', 'Post': 'PostAllStar',
    # Margin columns
    '≤3': '≤3Margin', '≥10': '≥10Margin',
})

# Month columns renaming
# None needed

# Separate W-L columns into 2 columns: W and L
# Every column except [Team] is treated as a "W-L" string and split on the first dash.
WinLossColumns = df.columns.drop('Team')
for col in WinLossColumns:
    df[[f'{col}W', f'{col}L']] = df[col].str.split('-', n=1, expand=True)
# The combined "W-L" columns are no longer needed once split.
df = df.drop(columns=WinLossColumns)

# Sort by team name, stylistic choice
df = df.sort_values('Team')

# write clean data to storage
df.to_csv(f'../data/years/{SEASON}/clean/{SEASON}_Team_Standings.csv', index=False)

# Display the cleaned frame as the cell output.
df

Unnamed: 0,Team,OverallW,OverallL,HomeW,HomeL,RoadW,RoadL,EastW,EastL,WestW,WestL,AtlanticW,AtlanticL,CentralW,CentralL,SoutheastW,SoutheastL,NorthwestW,NorthwestL,PacificW,PacificL,SouthwestW,SouthwestL,PreAllStarW,PreAllStarL,PostAllStarW,PostAllStarL,≤3MarginW,≤3MarginL,≥10MarginW,≥10MarginL,OctW,OctL,NovW,NovL,DecW,DecL,JanW,JanL,FebW,FebL,MarW,MarL,AprW,AprL
1,Atlanta Hawks,60,22,35,6,25,16,38,14,22,8,12,6,14,4,12,4,8,2,8,2,6,4,43,11,17,11,6,4,30,10,0,1,9,5,14,2,17,0,7,4,9,7,4,3
15,Boston Celtics,40,42,21,20,19,22,28,24,12,18,12,4,9,9,7,11,6,4,3,7,3,7,20,31,20,11,8,6,17,14,1,0,3,10,7,8,5,11,7,4,10,8,7,1
17,Brooklyn Nets,38,44,19,22,19,22,24,28,14,16,10,6,7,11,7,11,6,4,6,4,2,8,21,31,17,13,10,2,15,23,0,1,6,8,9,7,3,12,6,5,9,7,5,4
21,Charlotte Hornets,33,49,19,22,14,27,25,27,8,22,10,8,7,11,8,8,5,5,2,8,1,9,22,30,11,19,6,8,13,22,1,0,3,14,6,9,10,4,3,6,8,9,2,7
8,Chicago Bulls,50,32,27,14,23,18,33,19,17,13,16,2,8,8,9,9,6,4,5,5,6,4,34,20,16,12,7,2,20,16,1,1,10,5,11,4,8,9,7,3,8,7,5,3
6,Cleveland Cavaliers,53,29,31,10,22,19,35,17,18,12,12,6,11,5,12,6,6,4,7,3,5,5,33,22,20,7,6,4,30,14,1,1,7,6,10,7,11,6,8,3,11,4,5,2
9,Dallas Mavericks,50,32,27,14,23,18,21,9,29,23,9,1,5,5,7,3,13,5,9,9,7,9,36,19,14,13,5,4,22,18,1,1,12,4,10,5,9,7,7,5,6,7,5,3
23,Denver Nuggets,30,52,19,22,11,30,11,19,19,33,1,9,6,4,4,6,6,10,8,10,5,13,20,33,10,19,4,4,17,28,1,0,7,8,5,11,6,10,1,9,8,8,2,6
22,Detroit Pistons,32,50,18,23,14,27,23,29,9,21,8,10,6,10,9,9,2,8,3,7,4,6,21,33,11,17,7,6,19,25,0,2,3,12,5,9,10,7,5,6,6,9,3,5
0,Golden State Warriors,67,15,39,2,28,13,25,5,42,10,9,1,7,3,9,1,15,3,13,3,14,4,42,9,25,6,5,3,45,9,1,0,13,2,11,3,12,3,8,3,16,2,6,2
