In [2]:
# PER GAME CLEANING

import pandas as pd
import myconstants as c

# Load the raw per-game player table for the 2020-21 season.
df = pd.read_csv('../data/years/2020-21/raw/2020-21_Player_PerGame.csv')

# Add column [PlayerID] -- whatever c.EVERYTHING_AFTER_BACKSLASH matches in the raw [Player] value.
# expand=False asks str.extract for a Series (one capture group) instead of a one-column DataFrame.
df['PlayerID'] = df['Player'].str.extract('(' + c.EVERYTHING_AFTER_BACKSLASH + ')', expand=False)

# [Player] cleaning -- remove backslash + player ID
df['Player'] = df['Player'].replace(c.EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df['Player'] = df['Player'].replace('\\\\', '', regex=True)  # regex `\\` == one literal backslash

# [Rk] drop -- unnecessary
# Use the `columns=` keyword: the positional-axis form df.drop('Rk', 1) was deprecated in
# pandas 1.3 and raises TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df['Pos'] = df['Pos'].replace(c.EVERYTHING_AFTER_DASH, '', regex=True)
df['Pos'] = df['Pos'].replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them.
# NOTE(review): rows are grouped and de-duplicated by display name ([Player]); two different
# players who share a name would be merged into one row. Consider keying on [PlayerID] instead
# once it is confirmed to be populated for every row.
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join)  # every team code listed for the player (still includes TOT)
# To keep only the TOT row for multi-team players: rename TOT to a sentinel that sorts after
# every real team code, sort by [Tm], then keep the last row per player. Single-team players
# have just one row, so they are unaffected.
df.loc[df['Tm'] == 'TOT', 'Tm'] = c.ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# Remove TOT and its adjoining comma from [Tms], whether it appears before other teams or at the end.
df['Tms'] = df['Tms'].replace('TOT,', '', regex=True)
df['Tms'] = df['Tms'].replace(',TOT$', '', regex=True)
df = df.drop(columns='Tm')  # [Tm] no longer needed

# No other columns need to be cleaned

# Move [PlayerID] to the right of [Player] and [Tms] to the right of [PlayerID], stylistic choice
col = df.pop('PlayerID')
df.insert(df.columns.get_loc('Player') + 1, col.name, col)
col = df.pop('Tms')
df.insert(df.columns.get_loc('PlayerID') + 1, col.name, col)

# Sort by player name, stylistic choice
df = df.sort_values('Player')

# Write clean data to storage (index omitted so the CSV holds only the data columns)
df.to_csv('../data/years/2020-21/clean/2020-21_Player_PerGame.csv', index=False)

df

Unnamed: 0,Player,PlayerID,Tms,Pos,Age,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
231,Aaron Gordon,gordoaa01,"ORL,DEN",PF,25,50,50,27.7,4.6,10.0,...,0.651,1.5,4.1,5.7,3.2,0.7,0.7,1.9,1.8,12.4
293,Aaron Holiday,holidaa01,IND,PG,24,66,8,17.8,2.6,6.6,...,0.819,0.2,1.1,1.3,1.9,0.7,0.2,1.0,1.4,7.2
471,Aaron Nesmith,nesmiaa01,BOS,SF,21,46,1,14.5,1.7,3.9,...,0.786,0.6,2.2,2.8,0.5,0.3,0.2,0.5,1.9,4.7
469,Abdel Nader,naderab01,PHO,SF,27,24,0,14.8,2.4,4.8,...,0.757,0.3,2.3,2.6,0.8,0.4,0.4,0.8,1.4,6.7
454,Adam Mokoka,mokokad01,CHI,SG,22,14,0,4.0,0.5,1.4,...,0.000,0.1,0.3,0.4,0.4,0.1,0.1,0.4,0.4,1.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,Yogi Ferrell,ferreyo01,"CLE,LAC",PG,27,10,0,13.6,2.0,5.7,...,0.778,0.5,1.4,1.9,2.2,0.7,0.3,0.3,1.1,5.6
671,Yuta Watanabe,watanyu01,TOR,SF,26,50,4,14.5,1.6,3.6,...,0.828,0.7,2.5,3.2,0.8,0.5,0.4,0.4,1.1,4.4
381,Zach LaVine,lavinza01,CHI,SG,25,58,58,35.1,9.8,19.4,...,0.849,0.6,4.4,5.0,4.9,0.8,0.5,3.5,2.4,27.4
474,Zeke Nnaji,nnajize01,DEN,PF,20,42,1,9.5,1.2,2.5,...,0.800,0.3,1.2,1.5,0.2,0.2,0.1,0.2,0.7,3.2


In [3]:
# TEAM STANDINGS CLEANING

import pandas as pd

# Load the raw team standings table for the 2020-21 season.
df = pd.read_csv('../data/years/2020-21/raw/2020-21_Team_Standings.csv')

# [Rk] drop -- unnecessary
# Use the `columns=` keyword: the positional-axis form df.drop('Rk', 1) was deprecated in
# pandas 1.3 and raises TypeError from pandas 2.0 onwards.
df = df.drop(columns='Rk')

df

Unnamed: 0,Team,Overall,Home,Road,E,W,A,C,SE,NW,...,Pre,Post,≤3,≥10,Dec,Jan,Feb,Mar,Apr,May
0,Utah Jazz,52-20,31-5,21-15,24-6,28-14,7-3,10-0,7-3,7-5,...,27-9,25-11,3-2,39-7,2-2,13-3,12-2,9-4,9-7,7-2
1,Phoenix Suns,51-21,27-9,24-12,21-9,30-12,7-3,8-2,6-4,11-4,...,24-11,27-10,9-3,23-8,4-1,6-7,12-3,11-3,12-4,6-3
2,Philadelphia 76ers,49-23,29-7,20-16,31-11,18-12,10-2,8-7,13-2,5-5,...,24-12,25-11,7-3,27-12,4-1,11-5,7-6,10-3,10-6,7-2
3,Brooklyn Nets,48-24,28-8,20-16,26-16,22-8,8-4,9-6,9-6,7-3,...,24-13,24-11,3-5,24-11,3-2,10-7,9-4,11-2,10-6,5-3
4,Denver Nuggets,47-25,25-11,22-14,21-9,26-16,5-5,9-1,7-3,9-3,...,21-15,26-10,6-4,29-11,1-3,11-5,6-7,11-3,13-3,5-4
5,Los Angeles Clippers,47-25,26-10,21-15,20-10,27-15,4-6,9-1,7-3,10-5,...,24-14,23-11,3-4,31-7,4-1,12-4,8-7,8-5,11-4,4-4
6,Milwaukee Bucks,46-26,26-10,20-16,30-12,16-14,8-7,11-1,11-4,6-4,...,22-14,24-12,6-5,29-12,2-3,9-5,10-5,9-4,9-7,7-2
7,Dallas Mavericks,42-30,21-15,21-15,21-9,21-21,6-4,6-4,9-1,8-7,...,18-16,24-14,5-4,26-17,1-3,7-9,8-4,9-5,10-6,7-3
8,Los Angeles Lakers,42-30,21-15,21-15,17-13,25-17,4-6,8-2,5-5,10-5,...,24-13,18-17,7-5,22-13,3-2,12-4,9-5,6-7,6-9,6-3
9,Portland Trail Blazers,42-30,20-16,22-14,19-11,23-19,7-3,6-4,6-4,6-6,...,21-14,21-16,10-9,19-12,2-2,8-6,8-6,11-4,6-10,7-2
