In [1]:
import pandas as pd
import numpy as np

Let's explore the dataset a bit.

In [2]:
# Load the 2022 MVP voting table for a first look.
# header=1 takes the column names from the second line of the file.
mvp_2022 = (
    pd.read_csv("../Data/Raw/MVP2022.csv", header=1)
    # Remove whatever column comes last in the raw export.
    .pipe(lambda raw: raw.drop(columns=raw.columns[-1]))
    # Tag every row with the season so it stays identifiable later on.
    .assign(Year=2022)
)
mvp_2022.head()

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,Nikola Jokić,26,DEN,65.0,875.0,1000,0.875,74,33.5,...,13.8,7.9,1.5,0.9,0.583,0.337,0.81,15.2,0.296,2022
1,2,Joel Embiid,27,PHI,26.0,706.0,1000,0.706,68,33.8,...,11.7,4.2,1.1,1.5,0.499,0.371,0.814,12.0,0.252,2022
2,3,Giannis Antetokounmpo,27,MIL,9.0,595.0,1000,0.595,67,32.9,...,11.6,5.8,1.1,1.4,0.553,0.293,0.722,12.9,0.281,2022
3,4,Devin Booker,25,PHO,0.0,216.0,1000,0.216,68,34.5,...,5.0,4.8,1.1,0.4,0.466,0.383,0.868,7.6,0.156,2022
4,5,Luka Dončić,22,DAL,0.0,146.0,1000,0.146,65,35.4,...,9.1,8.7,1.2,0.6,0.457,0.353,0.744,7.6,0.159,2022


Now we will write some code to clean all the datasets, save each of them individually, and also concatenate them into a single dataset and save that as well.

First we create two empty lists to store keys and values, and an empty dictionary where we will link the previous lists.

Then we do a for loop to create key names and extract the datasets.

In [3]:
# Placeholder dictionary; it is populated once names and tables are paired up.
mvps = {}

# One label per season, "mvp_2018" through "mvp_2022".
df_name = ["mvp_" + str(2018 + offset) for offset in range(5)]

# The matching raw tables, read in the same season order as the labels.
# header=1 takes the column names from the second line of each file.
df_data = [
    pd.read_csv("../Data/Raw/MVP{}.csv".format(2018 + offset), header=1)
    for offset in range(5)
]

Now we link the keys to the values, and also use another for loop to clean the data:

- Drop a useless column
- Create columns with PTS, AST, TRB, STL and BLK per minute, in order to not give the edge to players with high minutes per game.
- A column with all stats combined per minute.
- Create another column with the year of the MVP vote, to distinguish it from the others once we concatenate them all.

In [4]:
# Pair every season label with its raw table.
mvps = dict(zip(df_name, df_data))

for season, label in zip(range(2018, 2023), mvps):
    # drop() returns a new frame, so the tables held in df_data stay untouched.
    season_df = mvps[label].drop(columns=mvps[label].columns[-1])
    mvps[label] = season_df

    # Per-minute rate for each box-score stat. The tuple order fixes both the
    # order of the new columns and the order in which the rates are summed.
    minutes = season_df["MP"]
    combined = None
    for stat, rate_column in (
        ("PTS", "PTS/M"),
        ("TRB", "TRB/M"),
        ("AST", "AST/M"),
        ("STL", "STL/M"),
        ("BLK", "BLK/M"),
    ):
        rate = season_df[stat] / minutes
        season_df[rate_column] = rate
        combined = rate if combined is None else combined + rate

    # All five per-minute rates added together.
    season_df["Stats/M"] = combined
    # Season marker, so rows remain identifiable after concatenation.
    season_df["Year"] = season
    season_df.to_csv("../Data/Cleaned/CleanMVP{}.csv".format(season))

In [5]:
# Stack the cleaned seasons in dictionary order and renumber the rows from 0.
data = pd.concat([mvps[label] for label in mvps], ignore_index=True)
# Persist the combined table next to the per-season files.
data.to_csv("../Data/Cleaned/CleanMVPs.csv")
data

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,FT%,WS,WS/48,PTS/M,TRB/M,AST/M,STL/M,BLK/M,Stats/M,Year
0,1,James Harden,28,HOU,86.0,965.0,1010,0.955,72,35.4,...,0.858,15.4,0.289,0.858757,0.152542,0.248588,0.050847,0.019774,1.330508,2018
1,2,LeBron James,33,CLE,15.0,738.0,1010,0.731,82,36.9,...,0.731,14.0,0.221,0.745257,0.233062,0.246612,0.037940,0.024390,1.287263,2018
2,3,Anthony Davis,24,NOP,0.0,445.0,1010,0.441,75,36.4,...,0.828,13.7,0.241,0.771978,0.304945,0.063187,0.041209,0.071429,1.252747,2018
3,4,Damian Lillard,27,POR,0.0,207.0,1010,0.205,73,36.6,...,0.916,12.6,0.227,0.734973,0.122951,0.180328,0.030055,0.010929,1.079235,2018
4,5,Russell Westbrook,29,OKC,0.0,76.0,1010,0.075,80,36.4,...,0.737,10.1,0.166,0.697802,0.277473,0.282967,0.049451,0.008242,1.315934,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,8,Stephen Curry,33,GSW,0.0,4.0,1000,0.004,64,34.5,...,0.923,8.0,0.173,0.739130,0.150725,0.182609,0.037681,0.011594,1.121739,2022
60,9,Chris Paul,36,PHO,0.0,2.0,1000,0.002,65,32.9,...,0.837,9.4,0.210,0.446809,0.133739,0.328267,0.057751,0.009119,0.975684,2022
61,10T,DeMar DeRozan,32,CHI,0.0,1.0,1000,0.001,76,36.1,...,0.877,8.8,0.154,0.772853,0.144044,0.135734,0.024931,0.008310,1.085873,2022
62,10T,Kevin Durant,33,BRK,0.0,1.0,1000,0.001,55,37.2,...,0.910,8.4,0.198,0.803763,0.198925,0.172043,0.024194,0.024194,1.223118,2022
