In [2]:
import pandas as pd

In [3]:
team_data_df : pd.DataFrame = pd.read_csv('../data/team_data_1991-2022.csv')

In [4]:
mvp_voting_df : pd.DataFrame= pd.read_csv('../data/mvp_voting_1991-2022.csv')
# Remove player stat data from the MVP voting data 
# since we already have this information in the player stats csv
voting_data_cols = ['Player', 'Year', 'Pts Won', 'Pts Max', 'Share']
mvp_voting_df = mvp_voting_df[voting_data_cols]


In [5]:
player_stats_df : pd.DataFrame = pd.read_csv('../data/player_stats_1991-2022.csv')
# Delete extraneous columns from the player data
del player_stats_df['Rk']
del player_stats_df['Unnamed: 0']

# Delete any asterisks on player names for successful combination
player_stats_df['Player'] = player_stats_df['Player'].str.replace('*','',regex=False)

# There are multiple entries for players that play for multiple teams into a single entry
player_stats_df.groupby(['Player','Year']).get_group(('Greg Anderson',1991)) 

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
8,Greg Anderson,PF,26,TOT,68,2,13.6,1.7,4.0,0.43,...,1.4,3.3,4.7,0.2,0.5,0.7,1.2,2.1,4.3,1991
9,Greg Anderson,PF,26,MIL,26,0,9.5,1.0,2.8,0.37,...,1.0,1.9,2.9,0.1,0.3,0.3,0.8,1.1,2.7,1991
10,Greg Anderson,PF,26,NJN,1,0,18.0,4.0,4.0,1.0,...,4.0,2.0,6.0,1.0,2.0,0.0,1.0,4.0,8.0,1991
11,Greg Anderson,PF,26,DEN,41,2,16.1,2.1,4.7,0.44,...,1.6,4.1,5.8,0.3,0.6,0.9,1.5,2.6,5.2,1991


In [6]:
# Combine the grouped data so that each player only has one entry per season 
def single_row(df : pd.DataFrame):
    if df.shape[0] == 1: # If there is only one entry, we are all set
        return df
    else: # If there are multiple entries, the entry for the team 'TOT' has the player's total stats
        row = df[df['Tm'] == 'TOT'] # Use the TOT entry for the total stats
        # Replace the team TOT with the player's most recent team 
        row['Tm'] = df.iloc[-1, :]['Tm'] 

player_stats_df = player_stats_df.groupby(['Player', 'Year']).apply(single_row)

In [7]:
player_stats_df.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
Player,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
A.C. Green,1991,164,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,2.5,3.8,6.3,0.9,0.7,0.3,1.2,1.4,9.1,1991
A.C. Green,1992,633,A.C. Green,PF,28,LAL,82,53,35.4,4.7,9.8,0.476,...,3.7,5.6,9.3,1.4,1.1,0.4,1.4,1.7,13.6,1992
A.C. Green,1993,1092,A.C. Green,PF,29,LAL,82,55,34.4,4.6,8.6,0.537,...,3.5,5.2,8.7,1.4,1.1,0.5,1.4,1.8,12.8,1993
A.C. Green,1994,1579,A.C. Green,PF,30,PHO,82,55,34.5,5.7,11.3,0.502,...,3.4,5.8,9.2,1.7,0.9,0.5,1.2,1.7,14.7,1994
A.C. Green,1995,2067,A.C. Green,SF,31,PHO,82,52,32.8,3.8,7.5,0.504,...,2.4,5.8,8.2,1.5,0.7,0.4,1.4,1.8,11.2,1995
A.C. Green,1996,2563,A.C. Green,SF,32,PHO,82,36,25.8,2.6,5.4,0.484,...,2.0,4.7,6.8,0.9,0.5,0.3,1.0,1.7,7.5,1996
A.C. Green,1998,3725,A.C. Green,PF,34,DAL,82,68,32.3,3.0,6.5,0.453,...,2.7,5.5,8.1,1.5,1.0,0.3,0.8,1.9,7.3,1998
A.C. Green,1999,4283,A.C. Green,PF,35,DAL,50,35,18.5,2.2,5.1,0.422,...,1.6,2.9,4.6,0.5,0.6,0.2,0.4,1.4,4.9,1999
A.C. Green,2000,4811,A.C. Green,PF,36,LAL,82,82,23.5,2.1,4.7,0.447,...,2.0,4.0,5.9,1.0,0.6,0.2,0.6,1.5,5.0,2000
A.C. Green,2001,5330,A.C. Green,PF,37,MIA,82,1,17.2,1.8,4.0,0.444,...,1.3,2.5,3.8,0.5,0.4,0.1,0.5,1.5,4.5,2001


In [8]:
# Drop the multi-indexes that were created with the groupby() function
player_stats_df.index = player_stats_df.index.droplevel()
player_stats_df.index = player_stats_df.index.droplevel()



In [35]:
player_stats_df[player_stats_df['Player'] == 'Greg Anderson']

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
467,Greg Anderson,PF,27,DEN,82,82,34.1,4.7,10.4,0.456,...,4.1,7.4,11.5,1.0,1.1,0.8,2.5,3.2,11.5,1992
1412,Greg Anderson,PF,29,DET,77,47,21.1,2.6,4.8,0.543,...,2.4,5.0,7.4,0.7,0.7,0.9,1.2,3.0,6.4,1994
1911,Greg Anderson,PF,30,ATL,51,0,12.2,1.1,2.0,0.548,...,1.2,2.5,3.7,0.3,0.5,0.6,0.6,2.0,2.9,1995
2381,Greg Anderson,PF,31,SAS,46,7,7.5,0.5,1.0,0.511,...,0.6,1.5,2.2,0.2,0.2,0.5,0.5,1.4,1.2,1996
2948,Greg Anderson,C,32,SAS,82,48,20.2,1.6,3.2,0.496,...,1.9,3.5,5.5,0.4,0.8,0.8,0.9,2.7,3.9,1997
3541,Greg Anderson,C,33,ATL,50,0,8.0,0.7,1.6,0.444,...,0.8,1.6,2.4,0.3,0.4,0.2,0.3,1.7,1.8,1998


In [9]:
# Merge the player stat and MVP voting data into a single dataframe
# Do an outer merge, since the player data has all player data (even those that got 0 mvp votes)
full_player_df = player_stats_df.merge(mvp_voting_df, how='outer',on=['Player','Year'])
full_player_df.head(20)  

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,AST,STL,BLK,TOV,PF,PTS,Year,Pts Won,Pts Max,Share
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,0.9,0.7,0.3,1.2,1.4,9.1,1991,,,
1,A.C. Green,PF,28,LAL,82,53,35.4,4.7,9.8,0.476,...,1.4,1.1,0.4,1.4,1.7,13.6,1992,,,
2,A.C. Green,PF,29,LAL,82,55,34.4,4.6,8.6,0.537,...,1.4,1.1,0.5,1.4,1.8,12.8,1993,,,
3,A.C. Green,PF,30,PHO,82,55,34.5,5.7,11.3,0.502,...,1.7,0.9,0.5,1.2,1.7,14.7,1994,,,
4,A.C. Green,SF,31,PHO,82,52,32.8,3.8,7.5,0.504,...,1.5,0.7,0.4,1.4,1.8,11.2,1995,,,
5,A.C. Green,SF,32,PHO,82,36,25.8,2.6,5.4,0.484,...,0.9,0.5,0.3,1.0,1.7,7.5,1996,,,
6,A.C. Green,PF,34,DAL,82,68,32.3,3.0,6.5,0.453,...,1.5,1.0,0.3,0.8,1.9,7.3,1998,,,
7,A.C. Green,PF,35,DAL,50,35,18.5,2.2,5.1,0.422,...,0.5,0.6,0.2,0.4,1.4,4.9,1999,,,
8,A.C. Green,PF,36,LAL,82,82,23.5,2.1,4.7,0.447,...,1.0,0.6,0.2,0.6,1.5,5.0,2000,,,
9,A.C. Green,PF,37,MIA,82,1,17.2,1.8,4.0,0.444,...,0.5,0.4,0.1,0.5,1.5,4.5,2001,,,
