In [26]:
# PER GAME CLEANING

import pandas as pd
import myconstants as c

df = pd.read_csv('../data/years/2020-21/raw/2020-21_Player_PerGame.csv')

# Add column [PlayerID] -- whatever c.EVERYTHING_AFTER_BACKSLASH matches inside the raw [Player] text
# (presumably "Name\playerid" -> "playerid"; confirm against myconstants)
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)  # the regex \\ matches one literal backslash

# [Rk] drop -- unnecessary
# Use the `columns=` keyword: passing the axis positionally (df.drop('Rk', 1)) was deprecated
# in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
# NOTE(review): grouping and de-duplication below key on the display name in [Player], so two
# different players sharing a name would be merged into one row -- consider keying on [PlayerID].
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join)  # every [Tm] value of the player, comma-joined (still includes TOT)
# To keep only the TOT row of a multi-team player: relabel TOT with a sentinel that sorts last,
# sort by [Tm], then keep the last row per player. Single-team players have one row and are unaffected.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and its separating comma from [Tms] (leading/middle "TOT," first, then a trailing ",TOT")
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop('Tm', axis=1)  # [Tm] no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
for anchor, moved_name in (('Player', 'PlayerID'), ('PlayerID', 'Tms')):
    col = df.pop(moved_name)
    df.insert(df.columns.get_loc(anchor) + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv('../data/years/2020-21/clean/2020-21_Player_PerGame.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
231,Aaron Gordon,gordoaa01,"ORL,DEN",PF,25,50,50,27.7,4.6,10.0,0.463,1.2,3.5,0.335,3.4,6.5,0.533,0.522,1.9,3.0,0.651,1.5,4.1,5.7,3.2,0.7,0.7,1.9,1.8,12.4
293,Aaron Holiday,holidaa01,IND,PG,24,66,8,17.8,2.6,6.6,0.390,1.0,2.8,0.368,1.6,3.8,0.406,0.467,1.0,1.3,0.819,0.2,1.1,1.3,1.9,0.7,0.2,1.0,1.4,7.2
471,Aaron Nesmith,nesmiaa01,BOS,SF,21,46,1,14.5,1.7,3.9,0.438,0.9,2.3,0.370,0.8,1.5,0.543,0.551,0.5,0.6,0.786,0.6,2.2,2.8,0.5,0.3,0.2,0.5,1.9,4.7
469,Abdel Nader,naderab01,PHO,SF,27,24,0,14.8,2.4,4.8,0.491,0.8,1.8,0.419,1.6,3.0,0.534,0.569,1.2,1.5,0.757,0.3,2.3,2.6,0.8,0.4,0.4,0.8,1.4,6.7
454,Adam Mokoka,mokokad01,CHI,SG,22,14,0,4.0,0.5,1.4,0.368,0.1,0.7,0.100,0.4,0.6,0.667,0.395,0.0,0.1,0.000,0.1,0.3,0.4,0.4,0.1,0.1,0.4,0.4,1.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,Yogi Ferrell,ferreyo01,"CLE,LAC",PG,27,10,0,13.6,2.0,5.7,0.351,0.9,2.8,0.321,1.1,2.9,0.379,0.430,0.7,0.9,0.778,0.5,1.4,1.9,2.2,0.7,0.3,0.3,1.1,5.6
671,Yuta Watanabe,watanyu01,TOR,SF,26,50,4,14.5,1.6,3.6,0.439,0.7,1.8,0.400,0.9,1.8,0.478,0.539,0.5,0.6,0.828,0.7,2.5,3.2,0.8,0.5,0.4,0.4,1.1,4.4
381,Zach LaVine,lavinza01,CHI,SG,25,58,58,35.1,9.8,19.4,0.507,3.4,8.2,0.419,6.4,11.1,0.571,0.596,4.4,5.1,0.849,0.6,4.4,5.0,4.9,0.8,0.5,3.5,2.4,27.4
474,Zeke Nnaji,nnajize01,DEN,PF,20,42,1,9.5,1.2,2.5,0.481,0.6,1.4,0.407,0.6,1.1,0.578,0.596,0.3,0.4,0.800,0.3,1.2,1.5,0.2,0.2,0.1,0.2,0.7,3.2


In [29]:
# ADVANCED PLAYER CLEANING

import pandas as pd
import myconstants as c

df = pd.read_csv('../data/years/2020-21/raw/2020-21_Player_Advanced.csv')

# Add column [PlayerID] -- whatever c.EVERYTHING_AFTER_BACKSLASH matches inside the raw [Player] text
# (presumably "Name\playerid" -> "playerid"; confirm against myconstants)
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)  # the regex \\ matches one literal backslash

# [Rk] drop -- unnecessary
# Use the `columns=` keyword: passing the axis positionally (df.drop('Rk', 1)) was deprecated
# in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
# NOTE(review): grouping and de-duplication below key on the display name in [Player], so two
# different players sharing a name would be merged into one row -- consider keying on [PlayerID].
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join)  # every [Tm] value of the player, comma-joined (still includes TOT)
# To keep only the TOT row of a multi-team player: relabel TOT with a sentinel that sorts last,
# sort by [Tm], then keep the last row per player. Single-team players have one row and are unaffected.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and its separating comma from [Tms] (leading/middle "TOT," first, then a trailing ",TOT")
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop('Tm', axis=1)  # [Tm] no longer needed

# remove unnamed columns (any column whose header starts with "Unnamed")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
for anchor, moved_name in (('Player', 'PlayerID'), ('PlayerID', 'Tms')):
    col = df.pop(moved_name)
    df.insert(df.columns.get_loc(anchor) + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# write clean data to storage
df.to_csv('../data/years/2020-21/clean/2020-21_Player_Advanced.csv', index=False)

df

Player      False
PlayerID    False
Tms         False
Pos         False
Age         False
G           False
MP          False
PER         False
TS%          True
3PAr         True
FTr          True
ORB%        False
DRB%        False
TRB%        False
AST%        False
STL%        False
BLK%        False
TOV%        False
USG%        False
OWS         False
DWS         False
WS          False
WS/48       False
OBPM        False
DBPM        False
BPM         False
VORP        False
dtype: bool

In [28]:
# TEAM STANDINGS CLEANING

import pandas as pd
import myconstants as c

df = pd.read_csv('../data/years/2020-21/raw/2020-21_Team_Standings.csv')

# Show every column when the frame is displayed at the end of the cell
pd.set_option('display.max_columns', None)

# [Rk] drop -- unnecessary
# Use the `columns=` keyword: passing the axis positionally (df.drop('Rk', 1)) was deprecated
# in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# Expand the abbreviated headers into descriptive names. One rename call is enough:
# no new name is also an old name, so the order of the entries does not matter.
df = df.rename(columns={
    # Conference columns
    'E': 'East', 'W': 'West',
    # Division columns
    'A': 'Atlantic', 'C': 'Central',
    'SE': 'Southeast', 'NW': 'Northwest',
    'P': 'Pacific', 'SW': 'Southwest',
    # All-Star columns
    'Pre': 'PreAllStar', 'Post': 'PostAllStar',
    # Margin columns
    '≤3': '≤3Margin', '≥10': '≥10Margin',
})

# Month columns renaming
# None needed

# Separate W-L columns into 2 columns: W and L
# Every column except [Team] is split on its first '-' into <name>W and <name>L;
# the split pieces are kept as strings (no numeric conversion is done here).
WinLossColumns = df.columns.drop('Team')
for col in WinLossColumns:
    df[['{}W'.format(col), '{}L'.format(col)]] = df[col].str.split('-', n=1, expand=True)
df = df.drop(WinLossColumns, axis=1)  # the combined "W-L" originals are no longer needed

# Sort by team name, stylistic choice
df = df.sort_values('Team')

# write clean data to storage
df.to_csv('../data/years/2020-21/clean/2020-21_Team_Standings.csv', index=False)

df

Unnamed: 0,Team,OverallW,OverallL,HomeW,HomeL,RoadW,RoadL,EastW,EastL,WestW,WestL,AtlanticW,AtlanticL,CentralW,CentralL,SoutheastW,SoutheastL,NorthwestW,NorthwestL,PacificW,PacificL,SouthwestW,SouthwestL,PreAllStarW,PreAllStarL,PostAllStarW,PostAllStarL,≤3MarginW,≤3MarginL,≥10MarginW,≥10MarginL,DecW,DecL,JanW,JanL,FebW,FebL,MarW,MarL,AprW,AprL,MayW,MayL
10,Atlanta Hawks,41,31,25,11,16,20,24,18,17,13,7,8,8,7,9,3,5,5,6,4,6,4,16,20,25,11,3,3,25,15,3,1,7,8,4,11,9,4,11,6,7,1
15,Boston Celtics,36,36,21,15,15,21,20,22,16,14,4,8,7,8,9,6,6,4,6,4,4,6,19,17,17,19,7,5,17,12,3,2,7,6,7,9,6,8,11,5,2,6
3,Brooklyn Nets,48,24,28,8,20,16,26,16,22,8,8,4,9,6,9,6,7,3,9,1,6,4,24,13,24,11,3,5,24,11,3,2,10,7,9,4,11,2,10,6,5,3
18,Charlotte Hornets,33,39,18,18,15,21,20,22,13,17,4,11,8,7,8,4,4,6,4,6,5,5,17,18,16,21,9,4,14,24,2,2,7,9,7,6,8,5,6,10,3,7
20,Chicago Bulls,31,41,15,21,16,20,21,21,10,20,7,8,7,5,7,8,3,7,1,9,6,4,16,18,15,23,3,5,16,20,2,3,5,8,8,6,4,10,7,10,5,4
25,Cleveland Cavaliers,22,50,13,23,9,27,16,26,6,24,8,7,4,8,4,11,2,8,0,10,4,6,14,22,8,28,4,2,11,37,3,2,6,9,4,10,4,9,4,12,1,8
7,Dallas Mavericks,42,30,21,15,21,15,21,9,21,21,6,4,6,4,9,1,8,7,6,9,7,5,18,16,24,14,5,4,26,17,1,3,7,9,8,4,9,5,10,6,7,3
4,Denver Nuggets,47,25,25,11,22,14,21,9,26,16,5,5,9,1,7,3,9,3,6,9,11,4,21,15,26,10,6,4,29,11,1,3,11,5,6,7,11,3,13,3,5,4
28,Detroit Pistons,20,52,13,23,7,29,12,30,8,22,7,8,1,11,4,11,2,8,3,7,3,7,10,26,10,26,1,8,14,31,0,4,5,11,4,10,4,9,6,10,1,8
13,Golden State Warriors,39,33,25,11,14,22,14,16,25,17,3,7,8,2,3,7,10,5,5,7,10,5,19,18,20,15,6,4,25,18,2,2,9,7,8,7,4,8,8,8,8,1
