# National Basketball Association's Most Valuable Player Prediction

In this project I performed web scraping to obtain the last 20 years of data (2003–2022) for MVP prediction. After importing the data from basketball-reference.com, I cleaned it with pandas and prepared it for analysis. In the analysis part, I used a Random Forest Regressor to predict each player's share of the MVP vote and then ranked the players by predicted share. The model achieved an average precision of about 93% at predicting the top 5 MVP candidates for the year 2022.

### Part 1 : WEB SCRAPING

In [3]:
# For years 2003 - 2022
years = list(range(2003, 2023))
url_start = "https://www.basketball-reference.com/awards/awards_{}.html"

In [None]:
import pandas as pd
import requests

In [4]:
import os
import time

# Download one awards page per season and cache the raw HTML under mvps/.
# The cache directory is created up front so the first write cannot fail.
os.makedirs("mvps", exist_ok=True)

for year in years:
    url = url_start.format(year)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    data = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of saving the error page as data.
    data.raise_for_status()

    with open("mvps/{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(data.text)

    # Pause between requests to stay polite to the server.
    # NOTE(review): confirm the delay against the site's published rate limit.
    time.sleep(4)

In [5]:
from bs4 import BeautifulSoup

In [7]:
# Exploratory parse of a single cached season (2008).
with open("mvps/2008.html", encoding="utf-8") as handle:
    page = handle.read()

soup = BeautifulSoup(page, "html.parser")
# Remove the first <tr class="over_header"> row from the parsed document.
over_header_row = soup.find('tr', class_="over_header")
over_header_row.decompose()
# The MVP voting table is looked up by its element id.
mvp_table = soup.find(id='mvp')

In [11]:
from io import StringIO

# Build one DataFrame of MVP voting results per cached season page.
dfs = []
for year in years:
    with open("mvps/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")
    # Drop the first "over_header" row before parsing
    # (presumably so read_html yields flat column names - confirm).
    soup.find('tr', class_="over_header").decompose()
    mvp_table = soup.find(id='mvp')
    if mvp_table is None:
        raise ValueError("No element with id 'mvp' in mvps/{}.html".format(year))
    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated in recent pandas releases.
    mvp = pd.read_html(StringIO(str(mvp_table)))[0]
    # Tag every row with its season so the yearly frames can be concatenated.
    mvp["Year"] = year

    dfs.append(mvp)

In [12]:
mvp_data = pd.concat(dfs)
mvp_data.to_csv("mvps.csv")

In [14]:
# Importing Players Datasets
# One Excel workbook per season; each is read, tagged with its season and
# stacked into a single frame.
path_template = "C:\\Users\\dhruv\\OneDrive\\Desktop\\nba_data\\{}.xlsx"

players = []
for year in range(2003, 2023):
    season_frame = pd.read_excel(path_template.format(year))
    season_frame["Year"] = year
    players.append(season_frame)

player_data = pd.concat(players)

In [15]:
player_data.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Player-additional,Year
0,1,Tariq Abdul-Wahad,SG,28.0,DAL,14.0,0.0,14.6,1.9,4.1,...,1.9,2.9,1.5,0.4,0.2,0.5,1.9,4.1,abdulta01,2003
1,2,Shareef Abdur-Rahim,PF,26.0,ATL,81.0,81.0,38.1,7.0,14.6,...,6.2,8.4,3.0,1.1,0.5,2.6,3.0,19.9,abdursh01,2003
2,3,Courtney Alexander,PG,25.0,NOH,66.0,7.0,20.6,2.9,7.7,...,1.2,1.8,1.2,0.5,0.1,1.0,1.9,7.9,alexaco02,2003
3,4,Malik Allen,PF,24.0,MIA,80.0,73.0,29.0,4.2,9.9,...,3.6,5.3,0.7,0.5,1.0,1.6,2.9,9.6,allenma01,2003
4,5,Ray Allen*,SG,27.0,TOT,76.0,75.0,37.9,7.9,17.9,...,3.8,5.0,4.4,1.4,0.2,2.6,2.9,22.5,allenra02,2003


In [16]:
team_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

In [17]:
import os
import time

# Download one standings page per season and cache the raw HTML under team/.
# The cache directory is created up front so the first write cannot fail.
os.makedirs("team", exist_ok=True)

for year in years:
    url = team_stats_url.format(year)

    # A timeout keeps a stalled connection from hanging the notebook forever.
    data = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of saving the error page as data.
    data.raise_for_status()
    with open("team/{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(data.text)

    # Pause between requests to stay polite to the server.
    # NOTE(review): confirm the delay against the site's published rate limit.
    time.sleep(4)

In [18]:
from io import StringIO

# Build one standings DataFrame per conference per cached season page.
dfs = []
for year in years:
    with open("team/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

    # Both conference tables are handled identically apart from the table id
    # and the name of the column that holds the team name.
    for table_id, conference_col in (
        ("divs_standings_E", "Eastern Conference"),
        ("divs_standings_W", "Western Conference"),
    ):
        # Re-parse for every table so each pass starts from the unmodified page.
        soup = BeautifulSoup(page, "html.parser")
        # Only the first <tr class="thead"> found in the document is removed;
        # a later cleaning step filters out rows containing "Division".
        soup.find('tr', class_="thead").decompose()
        team_table = soup.find(id=table_id)
        # Wrap the markup in StringIO: passing literal HTML to read_html is
        # deprecated in recent pandas releases.
        team = pd.read_html(StringIO(str(team_table)))[0]
        team["Year"] = year
        # Give both conferences the same "Team" column name.
        team["Team"] = team[conference_col]
        del team[conference_col]
        dfs.append(team)

In [19]:
teams = pd.concat(dfs)
teams.to_csv("teams.csv")

### Part 2 : DATA CLEANING

In [21]:
mvps = pd.read_csv("mvps.csv")

In [22]:
# Taking necessary columns from "mvps" dataset.
mvps = mvps[["Player", "Year", "Pts Won", "Pts Max","Share"]]
mvps.head()

Unnamed: 0,Player,Year,Pts Won,Pts Max,Share
0,Tim Duncan,2003,962.0,1190,0.808
1,Kevin Garnett,2003,871.0,1190,0.732
2,Kobe Bryant,2003,496.0,1190,0.417
3,Tracy McGrady,2003,427.0,1190,0.359
4,Shaquille O'Neal,2003,126.0,1190,0.106


In [23]:
# Work on a copy: the column deletions performed on `players` afterwards would
# otherwise also mutate `player_data`, and re-running those cells would raise
# KeyError because the columns are already gone.
players = player_data.copy()
players

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Player-additional,Year
0,1,Tariq Abdul-Wahad,SG,28.0,DAL,14.0,0.0,14.6,1.9,4.1,...,1.9,2.9,1.5,0.4,0.2,0.5,1.9,4.1,abdulta01,2003
1,2,Shareef Abdur-Rahim,PF,26.0,ATL,81.0,81.0,38.1,7.0,14.6,...,6.2,8.4,3.0,1.1,0.5,2.6,3.0,19.9,abdursh01,2003
2,3,Courtney Alexander,PG,25.0,NOH,66.0,7.0,20.6,2.9,7.7,...,1.2,1.8,1.2,0.5,0.1,1.0,1.9,7.9,alexaco02,2003
3,4,Malik Allen,PF,24.0,MIA,80.0,73.0,29.0,4.2,9.9,...,3.6,5.3,0.7,0.5,1.0,1.6,2.9,9.6,allenma01,2003
4,5,Ray Allen*,SG,27.0,TOT,76.0,75.0,37.9,7.9,17.9,...,3.8,5.0,4.4,1.4,0.2,2.6,2.9,22.5,allenra02,2003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
807,601,Thaddeus Young,PF,33.0,TOR,26.0,0.0,18.3,2.6,5.5,...,2.9,4.4,1.7,1.2,0.4,0.8,1.7,6.3,youngth01,2022
808,602,Trae Young,PG,23.0,ATL,76.0,76.0,34.9,9.4,20.3,...,3.1,3.7,9.7,0.9,0.1,4.0,1.7,28.4,youngtr01,2022
809,603,Omer Yurtseven,C,23.0,MIA,56.0,12.0,12.6,2.3,4.4,...,3.7,5.3,0.9,0.3,0.4,0.7,1.5,5.3,yurtsom01,2022
810,604,Cody Zeller,C,29.0,POR,27.0,0.0,13.1,1.9,3.3,...,2.8,4.6,0.8,0.3,0.2,0.7,2.1,5.2,zelleco01,2022


In [24]:
del players["Rk"]
del players["Player-additional"]

In [25]:
players.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,Tariq Abdul-Wahad,SG,28.0,DAL,14.0,0.0,14.6,1.9,4.1,0.466,...,1.0,1.9,2.9,1.5,0.4,0.2,0.5,1.9,4.1,2003
1,Shareef Abdur-Rahim,PF,26.0,ATL,81.0,81.0,38.1,7.0,14.6,0.478,...,2.2,6.2,8.4,3.0,1.1,0.5,2.6,3.0,19.9,2003
2,Courtney Alexander,PG,25.0,NOH,66.0,7.0,20.6,2.9,7.7,0.382,...,0.6,1.2,1.8,1.2,0.5,0.1,1.0,1.9,7.9,2003
3,Malik Allen,PF,24.0,MIA,80.0,73.0,29.0,4.2,9.9,0.424,...,1.7,3.6,5.3,0.7,0.5,1.0,1.6,2.9,9.6,2003
4,Ray Allen*,SG,27.0,TOT,76.0,75.0,37.9,7.9,17.9,0.439,...,1.2,3.8,5.0,4.4,1.4,0.2,2.6,2.9,22.5,2003


In [26]:
players["Player"].head(50)

0         Tariq Abdul-Wahad
1       Shareef Abdur-Rahim
2        Courtney Alexander
3               Malik Allen
4                Ray Allen*
5                Ray Allen*
6                Ray Allen*
7              Rafer Alston
8              John Amaechi
9            Chris Andersen
10           Derek Anderson
11           Kenny Anderson
12           Kenny Anderson
13           Kenny Anderson
14         Shandon Anderson
15         Robert Archibald
16           Gilbert Arenas
17        Brandon Armstrong
18        Darrell Armstrong
19            Carlos Arroyo
20            Chucky Atkins
21            Stacey Augmon
22          Dalibor Bagarić
23                Vin Baker
24              Brent Barry
25                Jon Barry
26             Maceo Baston
27            Mengke Bateer
28             Mike Batiste
29              Tony Battie
30            Shane Battier
31             Lonny Baxter
32                Raja Bell
33          Jonathan Bender
34           Corey Benjamin
35              Trav

In [27]:
players["Player"] = players["Player"].str.replace("*", "", regex = False)

In [28]:
players["Player"].head(50)

0         Tariq Abdul-Wahad
1       Shareef Abdur-Rahim
2        Courtney Alexander
3               Malik Allen
4                 Ray Allen
5                 Ray Allen
6                 Ray Allen
7              Rafer Alston
8              John Amaechi
9            Chris Andersen
10           Derek Anderson
11           Kenny Anderson
12           Kenny Anderson
13           Kenny Anderson
14         Shandon Anderson
15         Robert Archibald
16           Gilbert Arenas
17        Brandon Armstrong
18        Darrell Armstrong
19            Carlos Arroyo
20            Chucky Atkins
21            Stacey Augmon
22          Dalibor Bagarić
23                Vin Baker
24              Brent Barry
25                Jon Barry
26             Maceo Baston
27            Mengke Bateer
28             Mike Batiste
29              Tony Battie
30            Shane Battier
31             Lonny Baxter
32                Raja Bell
33          Jonathan Bender
34           Corey Benjamin
35              Trav

In [29]:
players.groupby(["Player", "Year"])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000165B3A3EB10>

In [30]:
def single_row(df):
    """Collapse the rows of one (Player, Year) group into a single row.

    Parameters
    ----------
    df : pandas.DataFrame
        All rows of one group; must contain a "Tm" column.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when the group has exactly one row. Otherwise the
        row(s) whose "Tm" equals "TOT", with "Tm" overwritten by the "Tm"
        value of the last row of the group. If the group has several rows
        but none is marked "TOT", the result is empty.
    """
    if df.shape[0] == 1:
        return df

    # Take an explicit copy so the assignment below writes to an independent
    # frame rather than a filtered selection of `df` (this avoids pandas'
    # SettingWithCopyWarning and leaves the input untouched).
    row = df[df["Tm"] == "TOT"].copy()
    # Replace the "TOT" marker with the team listed on the group's last row.
    row["Tm"] = df["Tm"].iloc[-1]
    return row
    
players = players.groupby(["Player", "Year"]).apply(single_row)

In [31]:
players.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
Player,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
A.J. Guyton,2003,166,A.J. Guyton,PG,24.0,GSW,2.0,0.0,4.5,0.0,2.0,0.0,...,0.0,0.0,0.0,1.0,0.5,0.0,0.5,0.0,0.0,2003
A.J. Hammons,2017,209,A.J. Hammons,C,24.0,DAL,22.0,0.0,7.4,0.8,1.9,0.405,...,0.4,1.3,1.6,0.2,0.0,0.6,0.5,1.0,2.2,2017
A.J. Price,2010,435,A.J. Price,PG,23.0,IND,56.0,2.0,15.4,2.6,6.3,0.41,...,0.2,1.4,1.6,1.9,0.6,0.1,1.1,0.9,7.3,2010
A.J. Price,2011,465,A.J. Price,PG,24.0,IND,50.0,0.0,15.9,2.3,6.4,0.356,...,0.3,1.1,1.4,2.2,0.6,0.0,1.1,1.2,6.5,2011
A.J. Price,2012,402,A.J. Price,PG,25.0,IND,44.0,1.0,12.9,1.3,4.0,0.339,...,0.3,1.1,1.4,2.0,0.5,0.0,0.7,0.7,3.9,2012


In [32]:
players.index = players.index.droplevel()
players.index = players.index.droplevel()

In [34]:
# Combining MVP and Players data
merged = players.merge(mvps, how = "outer", on = ["Player", "Year"])

In [35]:
merged.head(10)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,AST,STL,BLK,TOV,PF,PTS,Year,Pts Won,Pts Max,Share
0,A.J. Guyton,PG,24.0,GSW,2.0,0.0,4.5,0.0,2.0,0.0,...,1.0,0.5,0.0,0.5,0.0,0.0,2003,,,
1,A.J. Hammons,C,24.0,DAL,22.0,0.0,7.4,0.8,1.9,0.405,...,0.2,0.0,0.6,0.5,1.0,2.2,2017,,,
2,A.J. Price,PG,23.0,IND,56.0,2.0,15.4,2.6,6.3,0.41,...,1.9,0.6,0.1,1.1,0.9,7.3,2010,,,
3,A.J. Price,PG,24.0,IND,50.0,0.0,15.9,2.3,6.4,0.356,...,2.2,0.6,0.0,1.1,1.2,6.5,2011,,,
4,A.J. Price,PG,25.0,IND,44.0,1.0,12.9,1.3,4.0,0.339,...,2.0,0.5,0.0,0.7,0.7,3.9,2012,,,
5,A.J. Price,PG,26.0,WAS,57.0,22.0,22.4,2.8,7.2,0.39,...,3.6,0.6,0.1,1.1,1.3,7.7,2013,,,
6,A.J. Price,SG,27.0,MIN,28.0,0.0,3.5,0.7,1.6,0.413,...,0.5,0.0,0.0,0.3,0.2,1.6,2014,,,
7,A.J. Price,PG,28.0,PHO,26.0,0.0,12.5,2.0,5.3,0.372,...,1.8,0.3,0.0,0.5,0.6,5.1,2015,,,
8,Aaron Brooks,PG,23.0,HOU,51.0,0.0,11.9,1.8,4.4,0.413,...,1.7,0.3,0.1,0.9,1.4,5.2,2008,,,
9,Aaron Brooks,PG,24.0,HOU,80.0,35.0,25.0,4.0,9.8,0.404,...,3.0,0.6,0.1,1.6,1.9,11.2,2009,,,


In [36]:
merged[["Pts Won","Pts Max","Share"]] = merged[["Pts Won","Pts Max","Share"]].fillna(0)

In [37]:
merged

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,AST,STL,BLK,TOV,PF,PTS,Year,Pts Won,Pts Max,Share
0,A.J. Guyton,PG,24.0,GSW,2.0,0.0,4.5,0.0,2.0,0.000,...,1.0,0.5,0.0,0.5,0.0,0.0,2003,0.0,0.0,0.0
1,A.J. Hammons,C,24.0,DAL,22.0,0.0,7.4,0.8,1.9,0.405,...,0.2,0.0,0.6,0.5,1.0,2.2,2017,0.0,0.0,0.0
2,A.J. Price,PG,23.0,IND,56.0,2.0,15.4,2.6,6.3,0.410,...,1.9,0.6,0.1,1.1,0.9,7.3,2010,0.0,0.0,0.0
3,A.J. Price,PG,24.0,IND,50.0,0.0,15.9,2.3,6.4,0.356,...,2.2,0.6,0.0,1.1,1.2,6.5,2011,0.0,0.0,0.0
4,A.J. Price,PG,25.0,IND,44.0,1.0,12.9,1.3,4.0,0.339,...,2.0,0.5,0.0,0.7,0.7,3.9,2012,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9655,Žarko Čabarkapa,PF,24.0,GSW,61.0,0.0,8.3,1.1,2.9,0.385,...,0.3,0.2,0.1,0.6,1.4,3.3,2006,0.0,0.0,0.0
9656,Željko Rebrača,C,30.0,DET,30.0,12.0,16.3,2.7,4.8,0.552,...,0.3,0.2,0.6,1.0,2.6,6.6,2003,0.0,0.0,0.0
9657,Željko Rebrača,C,31.0,ATL,24.0,2.0,11.4,1.4,3.2,0.442,...,0.3,0.2,0.5,0.7,2.2,3.8,2004,0.0,0.0,0.0
9658,Željko Rebrača,C,32.0,LAC,58.0,2.0,16.0,2.3,4.0,0.568,...,0.4,0.2,0.7,0.8,2.2,5.8,2005,0.0,0.0,0.0


In [38]:
teams = pd.read_csv("teams.csv")

In [39]:
teams = teams[~teams["W"].str.contains("Division")]

In [40]:
del teams["Unnamed: 0"]
teams.head(30)

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,49,33,0.598,—,95.4,90.1,4.42,2003,New Jersey Nets*
1,48,34,0.585,1.0,96.8,94.5,1.76,2003,Philadelphia 76ers*
2,44,38,0.537,5.0,92.7,93.1,-0.75,2003,Boston Celtics*
3,42,40,0.512,7.0,98.5,98.4,-0.39,2003,Orlando Magic*
4,37,45,0.451,12.0,91.5,92.5,-1.47,2003,Washington Wizards
5,37,45,0.451,12.0,95.9,97.2,-1.61,2003,New York Knicks
6,25,57,0.305,24.0,85.6,90.6,-5.13,2003,Miami Heat
8,50,32,0.61,—,91.4,87.7,2.97,2003,Detroit Pistons*
9,48,34,0.585,2.0,96.8,93.3,2.79,2003,Indiana Pacers*
10,47,35,0.573,3.0,93.9,91.8,1.52,2003,New Orleans Hornets*


In [41]:
teams["Team"] = teams["Team"].str.replace("*", "", regex = False)

In [42]:
teams.head(10)

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,49,33,0.598,—,95.4,90.1,4.42,2003,New Jersey Nets
1,48,34,0.585,1.0,96.8,94.5,1.76,2003,Philadelphia 76ers
2,44,38,0.537,5.0,92.7,93.1,-0.75,2003,Boston Celtics
3,42,40,0.512,7.0,98.5,98.4,-0.39,2003,Orlando Magic
4,37,45,0.451,12.0,91.5,92.5,-1.47,2003,Washington Wizards
5,37,45,0.451,12.0,95.9,97.2,-1.61,2003,New York Knicks
6,25,57,0.305,24.0,85.6,90.6,-5.13,2003,Miami Heat
8,50,32,0.61,—,91.4,87.7,2.97,2003,Detroit Pistons
9,48,34,0.585,2.0,96.8,93.3,2.79,2003,Indiana Pacers
10,47,35,0.573,3.0,93.9,91.8,1.52,2003,New Orleans Hornets


In [43]:
# Map team abbreviations to full team names, read from a comma-separated file.
nickn = {}

with open("C:\\Users\\dhruv\\Downloads\\nicknames.txt") as f :
    # The first line is skipped (presumably a header row - confirm).
    next(f, None)
    for line in f:
        # Strip the newline plus any stray "\r" or surrounding spaces so the
        # keys and values match exactly when used for lookups.
        line = line.strip()
        if not line:
            # Blank or trailing lines would otherwise break the unpacking below.
            continue
        # Split on the first comma only: abbreviation first, full name second.
        abbr, name = line.split(",", 1)
        nickn[abbr] = name

In [44]:
nickn

{'ATL': 'Atlanta Hawks',
 'BRK': 'Brooklyn Nets',
 'BKN': 'Brooklyn Nets',
 'BOS': 'Boston Celtics',
 'CHA': 'Charlotte Bobcats',
 'CHH': 'Charlotte Hornets',
 'CHO': 'Charlotte Hornets',
 'CHI': 'Chicago Bulls',
 'CLE': 'Cleveland Cavaliers',
 'DAL': 'Dallas Mavericks',
 'DEN': 'Denver Nuggets',
 'DET': 'Detroit Pistons',
 'GSW': 'Golden State Warriors',
 'HOU': 'Houston Rockets',
 'IND': 'Indiana Pacers',
 'LAC': 'Los Angeles Clippers',
 'LAL': 'Los Angeles Lakers',
 'MEM': 'Memphis Grizzlies',
 'MIA': 'Miami Heat',
 'MIL': 'Milwaukee Bucks',
 'MIN': 'Minnesota Timberwolves',
 'NJN': 'New Jersey Nets',
 'NOH': 'New Orleans Hornets',
 'NOP': 'New Orleans Pelicans',
 'NOK': 'New Orleans/Oklahoma City Hornets',
 'NYK': 'New York Knicks',
 'OKC': 'Oklahoma City Thunder',
 'ORL': 'Orlando Magic',
 'PHI': 'Philadelphia 76ers',
 'PHX': 'Phoenix Suns',
 'PHO': 'Phoenix Suns',
 'POR': 'Portland Trail Blazers',
 'SEA': 'Seattle SuperSonics',
 'SAC': 'Sacramento Kings',
 'SAS': 'San Antonio Spu

In [45]:
merged["Team"] = merged["Tm"].map(nickn)

In [46]:
merged.head(10)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,STL,BLK,TOV,PF,PTS,Year,Pts Won,Pts Max,Share,Team
0,A.J. Guyton,PG,24.0,GSW,2.0,0.0,4.5,0.0,2.0,0.0,...,0.5,0.0,0.5,0.0,0.0,2003,0.0,0.0,0.0,Golden State Warriors
1,A.J. Hammons,C,24.0,DAL,22.0,0.0,7.4,0.8,1.9,0.405,...,0.0,0.6,0.5,1.0,2.2,2017,0.0,0.0,0.0,Dallas Mavericks
2,A.J. Price,PG,23.0,IND,56.0,2.0,15.4,2.6,6.3,0.41,...,0.6,0.1,1.1,0.9,7.3,2010,0.0,0.0,0.0,Indiana Pacers
3,A.J. Price,PG,24.0,IND,50.0,0.0,15.9,2.3,6.4,0.356,...,0.6,0.0,1.1,1.2,6.5,2011,0.0,0.0,0.0,Indiana Pacers
4,A.J. Price,PG,25.0,IND,44.0,1.0,12.9,1.3,4.0,0.339,...,0.5,0.0,0.7,0.7,3.9,2012,0.0,0.0,0.0,Indiana Pacers
5,A.J. Price,PG,26.0,WAS,57.0,22.0,22.4,2.8,7.2,0.39,...,0.6,0.1,1.1,1.3,7.7,2013,0.0,0.0,0.0,Washington Wizards
6,A.J. Price,SG,27.0,MIN,28.0,0.0,3.5,0.7,1.6,0.413,...,0.0,0.0,0.3,0.2,1.6,2014,0.0,0.0,0.0,Minnesota Timberwolves
7,A.J. Price,PG,28.0,PHO,26.0,0.0,12.5,2.0,5.3,0.372,...,0.3,0.0,0.5,0.6,5.1,2015,0.0,0.0,0.0,Phoenix Suns
8,Aaron Brooks,PG,23.0,HOU,51.0,0.0,11.9,1.8,4.4,0.413,...,0.3,0.1,0.9,1.4,5.2,2008,0.0,0.0,0.0,Houston Rockets
9,Aaron Brooks,PG,24.0,HOU,80.0,35.0,25.0,4.0,9.8,0.404,...,0.6,0.1,1.6,1.9,11.2,2009,0.0,0.0,0.0,Houston Rockets


In [47]:
stats = merged.merge(teams, how = "outer", on = ["Team", "Year"])

In [48]:
stats

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,A.J. Guyton,PG,24.0,GSW,2.0,0.0,4.5,0.0,2.0,0.000,...,0.0,0.0,Golden State Warriors,38,44,.463,21.0,102.4,103.6,-0.60
1,Adonal Foyle,C,27.0,GSW,82.0,0.0,21.8,2.3,4.2,0.536,...,0.0,0.0,Golden State Warriors,38,44,.463,21.0,102.4,103.6,-0.60
2,Antawn Jamison,SF,26.0,GSW,82.0,82.0,39.3,8.4,17.9,0.470,...,0.0,0.0,Golden State Warriors,38,44,.463,21.0,102.4,103.6,-0.60
3,Bob Sura,SG,29.0,GSW,55.0,0.0,20.5,2.5,6.0,0.412,...,0.0,0.0,Golden State Warriors,38,44,.463,21.0,102.4,103.6,-0.60
4,Chris Mills,PF,33.0,GSW,21.0,0.0,12.5,1.9,5.0,0.368,...,0.0,0.0,Golden State Warriors,38,44,.463,21.0,102.4,103.6,-0.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9655,Spencer Hawes,PF,28.0,MIL,54.0,1.0,14.8,2.5,5.1,0.484,...,0.0,0.0,Milwaukee Bucks,42,40,.512,9.0,103.6,103.8,-0.45
9656,Steve Novak,PF,33.0,MIL,8.0,0.0,2.8,0.3,0.9,0.286,...,0.0,0.0,Milwaukee Bucks,42,40,.512,9.0,103.6,103.8,-0.45
9657,Terrence Jones,PF,25.0,MIL,54.0,12.0,23.5,4.3,9.1,0.470,...,0.0,0.0,Milwaukee Bucks,42,40,.512,9.0,103.6,103.8,-0.45
9658,Thon Maker,C,19.0,MIL,57.0,34.0,9.9,1.5,3.2,0.459,...,0.0,0.0,Milwaukee Bucks,42,40,.512,9.0,103.6,103.8,-0.45


In [49]:
def _to_numeric_where_possible(column):
    """Return `column` converted to a numeric dtype, or unchanged if it cannot be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Same outcome as the deprecated errors="ignore": keep the input as is.
        return column


# Convert every column that parses cleanly as numbers; leave the rest untouched.
stats = stats.apply(_to_numeric_where_possible)

In [50]:
stats.dtypes

Player      object
Pos         object
Age        float64
Tm          object
G          float64
GS         float64
MP         float64
FG         float64
FGA        float64
FG%        float64
3P         float64
3PA        float64
3P%        float64
2P         float64
2PA        float64
2P%        float64
eFG%       float64
FT         float64
FTA        float64
FT%        float64
ORB        float64
DRB        float64
TRB        float64
AST        float64
STL        float64
BLK        float64
TOV        float64
PF         float64
PTS        float64
Year         int64
Pts Won    float64
Pts Max    float64
Share      float64
Team        object
W            int64
L            int64
W/L%       float64
GB          object
PS/G       float64
PA/G       float64
SRS        float64
dtype: object

In [51]:
stats["GB"].unique()

array(['21.0', '28.0', '29.0', '25.0', '8.0', '37.0', '19.0', '1.0',
       '13.0', '17.0', '23.0', '3.0', '15.0', '9.0', '2.0', '35.0',
       '20.0', '—', '11.0', '5.0', '26.0', '18.0', '16.0', '41.0', '12.0',
       '14.0', '34.0', '4.0', '1.5', '43.0', '30.0', '18.5', '31.0',
       '7.0', '46.0', '10.0', '6.0', '22.0', '10.5', '32.0', '38.0',
       '27.0', '33.0', '50.0', '21.5', '45.0', '24.0', '22.5', '25.5',
       '36.0', '3.5', '40.0', '20.5', '48.0', '11.5', '56.0', '2.5',
       '12.5', '39.0', '32.5', '4.5'], dtype=object)

In [52]:
# Replace the "—" placeholder with "0" so the column can be converted to
# numbers. regex=False makes this a literal replacement, consistent with the
# other str.replace calls in this notebook.
stats["GB"] = stats["GB"].str.replace("—", "0", regex=False)

In [53]:
stats["GB"].unique()

array(['21.0', '28.0', '29.0', '25.0', '8.0', '37.0', '19.0', '1.0',
       '13.0', '17.0', '23.0', '3.0', '15.0', '9.0', '2.0', '35.0',
       '20.0', '0', '11.0', '5.0', '26.0', '18.0', '16.0', '41.0', '12.0',
       '14.0', '34.0', '4.0', '1.5', '43.0', '30.0', '18.5', '31.0',
       '7.0', '46.0', '10.0', '6.0', '22.0', '10.5', '32.0', '38.0',
       '27.0', '33.0', '50.0', '21.5', '45.0', '24.0', '22.5', '25.5',
       '36.0', '3.5', '40.0', '20.5', '48.0', '11.5', '56.0', '2.5',
       '12.5', '39.0', '32.5', '4.5'], dtype=object)

In [54]:
stats["GB"] = pd.to_numeric(stats["GB"])

In [55]:
stats.dtypes

Player      object
Pos         object
Age        float64
Tm          object
G          float64
GS         float64
MP         float64
FG         float64
FGA        float64
FG%        float64
3P         float64
3PA        float64
3P%        float64
2P         float64
2PA        float64
2P%        float64
eFG%       float64
FT         float64
FTA        float64
FT%        float64
ORB        float64
DRB        float64
TRB        float64
AST        float64
STL        float64
BLK        float64
TOV        float64
PF         float64
PTS        float64
Year         int64
Pts Won    float64
Pts Max    float64
Share      float64
Team        object
W            int64
L            int64
W/L%       float64
GB         float64
PS/G       float64
PA/G       float64
SRS        float64
dtype: object

In [56]:
stats.to_csv("player_mvp_stats.csv")

### Part 3 : Analysis and Prediction

In [59]:
import pandas as pd
stats = pd.read_csv("player_mvp_stats.csv")
del stats['Unnamed: 0']

In [60]:
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,A.J. Guyton,PG,24.0,GSW,2.0,0.0,4.5,0.0,2.0,0.0,...,0.0,0.0,Golden State Warriors,38,44,0.463,21.0,102.4,103.6,-0.6
1,Adonal Foyle,C,27.0,GSW,82.0,0.0,21.8,2.3,4.2,0.536,...,0.0,0.0,Golden State Warriors,38,44,0.463,21.0,102.4,103.6,-0.6
2,Antawn Jamison,SF,26.0,GSW,82.0,82.0,39.3,8.4,17.9,0.47,...,0.0,0.0,Golden State Warriors,38,44,0.463,21.0,102.4,103.6,-0.6
3,Bob Sura,SG,29.0,GSW,55.0,0.0,20.5,2.5,6.0,0.412,...,0.0,0.0,Golden State Warriors,38,44,0.463,21.0,102.4,103.6,-0.6
4,Chris Mills,PF,33.0,GSW,21.0,0.0,12.5,1.9,5.0,0.368,...,0.0,0.0,Golden State Warriors,38,44,0.463,21.0,102.4,103.6,-0.6


In [61]:
# Null values per column
pd.isnull(stats).sum()

Player        0
Pos           0
Age           0
Tm            0
G             0
GS            0
MP            0
FG            0
FGA           0
FG%          39
3P            0
3PA           0
3P%        1229
2P            0
2PA           0
2P%          77
eFG%         39
FT            0
FTA           0
FT%         377
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Year          0
Pts Won       0
Pts Max       0
Share         0
Team          0
W             0
L             0
W/L%          0
GB            0
PS/G          0
PA/G          0
SRS           0
dtype: int64

In [72]:
stats = stats.fillna(0)

In [73]:
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [74]:
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS']

In [75]:
# Splitting data into training and test Set
train = stats[stats['Year'] < 2022]
test = stats[stats['Year'] == 2022]

In [325]:
from sklearn.ensemble import RandomForestRegressor
reg_model = RandomForestRegressor(n_estimators = 11, random_state = 1, min_samples_split = 5)

In [326]:
reg_model.fit(train[predictors], train['Share'])

In [327]:
reg_pred = reg_model.predict(test[predictors])
reg_pred = pd.DataFrame(reg_pred, columns = ['reg_pred'], index = test.index)

In [330]:
combination = pd.concat([test[['Player', 'Share']], reg_pred], axis = 1 )
combination.sort_values("Share", ascending = False).head(10)

Unnamed: 0,Player,Share,reg_pred
409,Nikola Jokić,0.875,0.440462
583,Joel Embiid,0.706,0.533421
7527,Giannis Antetokounmpo,0.595,0.559008
653,Devin Booker,0.216,0.601574
7417,Luka Dončić,0.146,0.356455
802,Jayson Tatum,0.043,0.096195
8014,Ja Morant,0.01,0.504277
4554,Stephen Curry,0.004,0.045808
651,Chris Paul,0.002,0.236846
5732,LeBron James,0.001,0.021327


In [331]:
from sklearn.metrics import mean_squared_error
mean_squared_error(combination['Share'], combination['reg_pred'])

0.001427149843544567

In [332]:
combination = combination.sort_values("Share", ascending = False)
combination["Rk"] = list(range(1, combination.shape[0] + 1))

In [333]:
combination.head(10)

Unnamed: 0,Player,Share,reg_pred,Rk
409,Nikola Jokić,0.875,0.440462,1
583,Joel Embiid,0.706,0.533421,2
7527,Giannis Antetokounmpo,0.595,0.559008,3
653,Devin Booker,0.216,0.601574,4
7417,Luka Dončić,0.146,0.356455,5
802,Jayson Tatum,0.043,0.096195,6
8014,Ja Morant,0.01,0.504277,7
4554,Stephen Curry,0.004,0.045808,8
651,Chris Paul,0.002,0.236846,9
5732,LeBron James,0.001,0.021327,10


In [334]:
combination = combination.sort_values("reg_pred", ascending = False)
combination["Prediction Rk"] = list(range(1, combination.shape[0] + 1))

In [335]:
combination.head(10)

Unnamed: 0,Player,Share,reg_pred,Rk,Prediction Rk
653,Devin Booker,0.216,0.601574,4,1
7527,Giannis Antetokounmpo,0.595,0.559008,3,2
583,Joel Embiid,0.706,0.533421,2,3
8014,Ja Morant,0.01,0.504277,7,4
409,Nikola Jokić,0.875,0.440462,1,5
7417,Luka Dončić,0.146,0.356455,5,6
4341,Kevin Durant,0.001,0.299198,12,7
651,Chris Paul,0.002,0.236846,9,8
7655,Trae Young,0.0,0.131381,289,9
2713,DeMar DeRozan,0.001,0.121386,11,10


In [336]:
combination.sort_values("Share", ascending = False).head(10)

Unnamed: 0,Player,Share,reg_pred,Rk,Prediction Rk
409,Nikola Jokić,0.875,0.440462,1,5
583,Joel Embiid,0.706,0.533421,2,3
7527,Giannis Antetokounmpo,0.595,0.559008,3,2
653,Devin Booker,0.216,0.601574,4,1
7417,Luka Dončić,0.146,0.356455,5,6
802,Jayson Tatum,0.043,0.096195,6,12
8014,Ja Morant,0.01,0.504277,7,4
4554,Stephen Curry,0.004,0.045808,8,14
651,Chris Paul,0.002,0.236846,9,8
2713,DeMar DeRozan,0.001,0.121386,11,10


In [337]:
def ap(combination, k=5):
    """Average precision of the predicted ranking against the actual top ``k``.

    Parameters
    ----------
    combination : pandas.DataFrame
        Must contain the columns "Player", "Share" (actual value) and
        "reg_pred" (predicted value).
    k : int, optional
        Number of top players by "Share" treated as relevant. Expected to be
        non-negative. Defaults to 5.

    Returns
    -------
    float
        Players are walked in descending "reg_pred" order; each time one of
        the actual top ``k`` is met, the precision so far (found / seen) is
        recorded. The mean of those precisions is returned, or 0.0 when none
        of them is met (for example, an empty frame).
    """
    ranked_by_share = combination.sort_values("Share", ascending=False)
    # A set gives constant-time membership tests inside the loop below.
    relevant = set(ranked_by_share.head(k)["Player"])

    ranked_by_pred = combination.sort_values("reg_pred", ascending=False)

    precisions = []
    found = 0
    # Iterating the name column directly avoids the per-row cost of iterrows().
    for seen, player in enumerate(ranked_by_pred["Player"], start=1):
        if player in relevant:
            found += 1
            precisions.append(found / seen)

    # Guard against division by zero when nothing relevant was met.
    if not precisions:
        return 0.0
    return sum(precisions) / len(precisions)

In [338]:
ap(combination)

0.9266666666666665

In [None]:
# The model achieved an average precision of about 93% (0.927) at predicting the top 5 MVP candidates for 2022.