In [114]:
import pandas as pd  # This is always assumed but is included here as an introduction.
import numpy as np
import matplotlib.pyplot as plt

# TODO: move these to a config file
EVERYTHING_AFTER_BACKSLASH = '(?<=\\\\).+' # regex for anything after a backslash
EVERYTHING_AFTER_DASH = '(?<=-).+' # regex for anything after a dash
ALPHABETICALLY_LAST_STRING = 'ZZZ' # string that comes last alphabetically in a sort

df = pd.read_csv('../data/years/2020-21/raw/2020-21_Player_PerGame.csv')

# Add column [PlayerID]
df['PlayerID'] = df['Player'].str.extract('(' + EVERYTHING_AFTER_BACKSLASH + ')')

# [Player] cleaning -- remove backslash + player ID
df = df.replace(EVERYTHING_AFTER_BACKSLASH, '', regex=True)
df = df.replace('\\\\', '', regex=True)

# [Rk] drop -- unnecessary
df = df.drop('Rk', 1)

# [Pos] cleaning -- for dual positions (ex. PG-C) take the first position, discard the second
df = df.replace(EVERYTHING_AFTER_DASH, '', regex=True)
df = df.replace('-', '', regex=True)

# [Age] cleaning
# None

# [Tm] cleaning -- for players that have a TOT (total) row, remove the player's other rows,
# also combine those players' teams on the season and replace TOT with them
df['Tms'] = df.groupby('Player')['Tm'].transform(','.join) # has all teams a player was on
# to remove the duplicate player rows and keep the TOT row, sort in a way that makes the TOT row the last in the df,
# then when dropping duplicate player rows, we can say "keep the last duplicate row in the df", thus keeping the TOT row.
df.loc[df['Tm'] == 'TOT', 'Tm'] = ALPHABETICALLY_LAST_STRING
df = df.sort_values('Tm', ascending=True)
df = df.drop_duplicates('Player', keep='last')
# remove TOT and any commas from [Tms]
df = df.replace('TOT,', '', regex=True)
df = df.replace(',TOT$', '', regex=True)
df = df.drop('Tm', axis=1) # [Tm] no longer needed

df

Unnamed: 0,Player,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,PlayerID,Tms
290,Solomon Hill,PF,29,71,16,21.3,1.5,4.2,0.359,1.0,3.0,0.321,0.5,1.2,0.453,0.473,0.5,0.6,0.761,0.6,2.4,3.0,1.1,0.7,0.2,0.6,1.6,4.5,hillso01,ATL
366,Nathan Knight,PF,23,33,0,8.5,1.1,3.0,0.370,0.2,1.0,0.182,0.9,2.0,0.463,0.400,1.3,1.7,0.800,0.8,1.4,2.2,0.2,0.3,0.3,0.5,0.7,3.8,knighna01,ATL
182,Kris Dunn,PG,26,4,0,11.3,0.3,3.0,0.083,0.0,0.5,0.000,0.3,2.5,0.100,0.083,0.8,1.0,0.750,0.0,1.5,1.5,0.5,0.5,0.5,0.8,2.5,1.3,dunnkr01,ATL
488,Onyeka Okongwu,C,20,50,4,12.0,1.9,3.0,0.644,0.0,0.1,0.000,1.9,2.9,0.658,0.644,0.7,1.1,0.632,1.2,2.0,3.3,0.4,0.5,0.7,0.6,2.0,4.6,okongon01,ATL
545,Cam Reddish,SF,21,26,21,28.8,3.7,10.1,0.365,1.3,4.8,0.262,2.4,5.3,0.460,0.428,2.6,3.2,0.817,0.8,3.2,4.0,1.3,1.3,0.3,1.3,2.6,11.2,reddica01,ATL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
624,Matt Thomas,SG,26,45,0,7.2,1.1,2.8,0.394,0.6,1.8,0.338,0.5,1.0,0.489,0.500,0.3,0.3,0.857,0.2,0.8,1.0,0.4,0.1,0.0,0.3,0.4,3.1,thomama02,"TOR,UTA"
551,Cameron Reynolds,SF,25,5,0,8.0,1.2,3.6,0.333,0.6,2.4,0.250,0.6,1.2,0.500,0.417,0.0,0.0,,0.4,0.6,1.0,0.4,0.0,0.0,0.0,0.4,3.0,reynoca01,"SAS,HOU"
60,Nemanja Bjelica,PF,32,37,3,16.1,2.4,5.3,0.454,0.7,2.3,0.318,1.7,3.0,0.559,0.523,1.0,1.4,0.725,0.8,2.6,3.4,1.9,0.4,0.1,0.9,1.8,6.5,bjeline01,"SAC,MIA"
370,Luke Kornet,C,25,31,2,11.2,1.4,3.3,0.436,0.5,1.9,0.254,0.9,1.4,0.690,0.510,0.1,0.2,0.500,0.5,1.7,2.2,0.8,0.1,1.0,0.2,0.6,3.4,kornelu01,"CHI,BOS"


In [94]:
#df.loc[df['Player'] == 'Victor Oladipo']
df = df.sort_values('PTS', ascending=False)
df.loc[df['Tm'] == ALPHABETICALLY_LAST_STRING]

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,PlayerID,Tms
258,James Harden,PG,31,ZZZ,44,43,36.6,7.8,16.7,0.466,...,7.1,7.9,10.8,1.2,0.8,4.0,2.3,24.6,hardeja01,"TOT,HOU,BRK"
655,Nikola Vučević,C,30,ZZZ,70,70,33.5,9.5,19.9,0.477,...,9.6,11.7,3.8,0.9,0.7,1.8,2.0,23.4,vucevni01,"TOT,ORL,CHI"
392,Caris LeVert,SG,26,ZZZ,47,39,31.6,7.6,17.3,0.441,...,3.9,4.6,5.2,1.4,0.6,2.2,2.2,20.2,leverca01,"TOT,BRK,IND"
491,Victor Oladipo,SG,28,ZZZ,33,33,32.7,7.1,17.5,0.408,...,4.5,4.8,4.6,1.4,0.4,2.5,2.5,19.8,oladivi01,"TOT,IND,HOU,MIA"
534,Norman Powell,SG,27,ZZZ,69,58,32.0,6.3,13.3,0.477,...,2.5,3.1,1.9,1.2,0.3,1.7,2.3,18.6,powelno01,"TOT,TOR,POR"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,Shaquille Harrison,SG,27,ZZZ,34,0,9.8,0.8,2.3,0.333,...,1.2,1.4,0.7,0.5,0.1,0.3,1.0,2.1,harrish01,"TOT,UTA,DEN"
589,Chris Silva,PF,24,ZZZ,15,0,6.1,0.7,1.1,0.625,...,1.3,1.8,0.4,0.1,0.4,0.7,1.6,2.1,silvach01,"TOT,MIA,SAC"
628,Sindarius Thornwell,SG,26,ZZZ,21,1,10.3,0.7,2.2,0.326,...,0.7,0.9,1.0,0.6,0.1,0.3,0.7,2.0,thornsi01,"TOT,NOP,ORL"
514,Norvel Pelle,C,27,ZZZ,13,0,6.5,0.6,1.2,0.533,...,1.0,1.5,0.2,0.1,0.7,0.2,1.1,1.5,pelleno01,"TOT,BRK,SAC,NYK"
