# National Basketball Association's Most Valuable Player Prediction

In this project I performed data wrangling to obtain the data for the last 20 years for MVP prediction. After importing data from basketballreference.com, I cleaned it using pandas and made it ready for analysis. In the analysis part, I used Random Forest Regressor to predict the percentage of vote shares and hence predicted the ranks. I got an average precision of 93% for predicting the top 5 MVP candidates for the year 2022.

### Part 1 : WEB SCRAPING

In [1]:
# For years 2003 - 2022
years = list(range(2003, 2023))
url_start = "https://www.basketball-reference.com/awards/awards_{}.html"

In [2]:
import pandas as pd
import requests

In [3]:
for year in years : 
    url = url_start.format(year)
    data = requests.get(url)
    
    with open("mvps/{}.html".format(year), "w+", encoding = "utf-8") as f:
        f.write(data.text)        

In [4]:
from bs4 import BeautifulSoup

In [5]:
with open("mvps/2008.html", encoding = "utf-8") as f:
    page = f.read()

soup = BeautifulSoup(page, "html.parser")
soup.find('tr', class_ = "over_header").decompose()
mvp_table = soup.find(id = 'mvp')

In [6]:
dfs = []
for year in years:
    with open("mvps/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")
    soup.find('tr', class_ = "over_header").decompose()
    mvp_table = soup.find(id = 'mvp')
    mvp = pd.read_html(str(mvp_table))[0]
    mvp["Year"] = year
    
    dfs.append(mvp)

In [7]:
mvp_data = pd.concat(dfs)
mvp_data.to_csv("mvps.csv")

In [8]:
# Importing Players Datasets
players = []
for year in range(2003, 2023):
    path = "C:\\Users\\dhruv\\OneDrive\\Desktop\\nba_data\\{}.xlsx".format(year)
    df = pd.read_excel(path)
    df["Year"] = year
    players.append(df)

player_data = pd.concat(players)

In [9]:
player_data.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Player-additional,Year
0,1,Tariq Abdul-Wahad,SG,28.0,DAL,14.0,0.0,14.6,1.9,4.1,...,1.9,2.9,1.5,0.4,0.2,0.5,1.9,4.1,abdulta01,2003
1,2,Shareef Abdur-Rahim,PF,26.0,ATL,81.0,81.0,38.1,7.0,14.6,...,6.2,8.4,3.0,1.1,0.5,2.6,3.0,19.9,abdursh01,2003
2,3,Courtney Alexander,PG,25.0,NOH,66.0,7.0,20.6,2.9,7.7,...,1.2,1.8,1.2,0.5,0.1,1.0,1.9,7.9,alexaco02,2003
3,4,Malik Allen,PF,24.0,MIA,80.0,73.0,29.0,4.2,9.9,...,3.6,5.3,0.7,0.5,1.0,1.6,2.9,9.6,allenma01,2003
4,5,Ray Allen*,SG,27.0,TOT,76.0,75.0,37.9,7.9,17.9,...,3.8,5.0,4.4,1.4,0.2,2.6,2.9,22.5,allenra02,2003


In [10]:
team_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

In [11]:
for year in years :
    url = team_stats_url.format(year)

    data = requests.get(url)
    with open("team/{}.html".format(year), "w+", encoding = "utf-8") as f :
        f.write(data.text)

In [12]:
dfs = []
for year in years :
    with open("team/{}.html".format(year), encoding = "utf-8") as f :
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    soup.find('tr', class_ = "thead").decompose()
    team_table = soup.find(id = 'divs_standings_E')
    team = pd.read_html(str(team_table))[0]
    team["Year"] = year
    team["Team"] = team["Eastern Conference"]
    del team["Eastern Conference"]
    dfs.append(team)

    soup = BeautifulSoup(page, "html.parser")
    soup.find('tr', class_ = "thead").decompose()
    team_table = soup.find(id = 'divs_standings_W')
    team = pd.read_html(str(team_table))[0]
    team["Year"] = year
    team["Team"] = team["Western Conference"]
    del team["Western Conference"]
    dfs.append(team)

AttributeError: 'NoneType' object has no attribute 'decompose'

In [None]:
teams = pd.concat(dfs)
teams.to_csv("teams.csv")

### Part 2 : DATA CLEANING

In [None]:
mvps = pd.read_csv("mvps.csv")

In [None]:
# Taking necessary columns from "mvps" dataset.
mvps = mvps[["Player", "Year", "Pts Won", "Pts Max","Share"]]
mvps.head()

In [None]:
players = player_data
players

In [None]:
del players["Rk"]
del players["Player-additional"]

In [None]:
players.head()

In [None]:
players["Player"].head(50)

In [None]:
players["Player"] = players["Player"].str.replace("*", "", regex = False)

In [None]:
players["Player"].head(50)

In [None]:
players.groupby(["Player", "Year"])

In [None]:
# Players who played for multiple teams in a single season
def single_row(df) :
    if df.shape[0] == 1 :
        return df
    else :
        row = df[df["Tm"] == "TOT"]
        row["Tm"] = df.iloc[-1, :]["Tm"]
        return row
    
players = players.groupby(["Player", "Year"]).apply(single_row)

In [None]:
players.head()

In [None]:
players.index = players.index.droplevel()
players.index = players.index.droplevel()

In [None]:
# Combining MVP and Players data
merged = players.merge(mvps, how = "outer", on = ["Player", "Year"])

In [None]:
merged.head(10)

In [None]:
merged[["Pts Won","Pts Max","Share"]] = merged[["Pts Won","Pts Max","Share"]].fillna(0)

In [None]:
merged

In [None]:
teams = pd.read_csv("teams.csv")

In [None]:
teams = teams[~teams["W"].str.contains("Division")]

In [None]:
del teams["Unnamed: 0"]
teams.head(30)

In [None]:
teams["Team"] = teams["Team"].str.replace("*", "", regex = False)

In [None]:
teams.head(10)

In [None]:
nickn = {}

with open("C:\\Users\\dhruv\\Downloads\\nicknames.txt") as f :
    lines = f.readlines()
    for line in lines[1:] :
        abbr, name = line.replace("\n", "").split(",")
        nickn[abbr] = name

In [None]:
nickn

In [None]:
merged["Team"] = merged["Tm"].map(nickn)

In [None]:
merged.head(10)

In [None]:
stats = merged.merge(teams, how = "outer", on = ["Team", "Year"])

In [None]:
stats

In [None]:
stats = stats.apply(pd.to_numeric, errors = "ignore")

In [None]:
stats.dtypes

In [None]:
stats["GB"].unique()

In [None]:
stats["GB"] = stats["GB"].str.replace("—", "0")

In [None]:
stats["GB"].unique()

In [None]:
stats["GB"] = pd.to_numeric(stats["GB"])

In [None]:
stats.dtypes

In [None]:
stats.to_csv("player_mvp_stats.csv")

### Part 3 : Analysis and Prediction

In [None]:
import pandas as pd
stats = pd.read_csv("player_mvp_stats.csv")
del stats['Unnamed: 0']

In [None]:
stats.head()

In [None]:
# Null values per column
pd.isnull(stats).sum()

In [None]:
stats = stats.fillna(0)

In [None]:
stats.columns

In [None]:
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS']

In [None]:
# Splitting data into training and test Set
train = stats[stats['Year'] < 2022]
test = stats[stats['Year'] == 2022]

In [None]:
from sklearn.ensemble import RandomForestRegressor
reg_model = RandomForestRegressor(n_estimators = 11, random_state = 1, min_samples_split = 5)

In [None]:
reg_model.fit(train[predictors], train['Share'])

In [None]:
reg_pred = reg_model.predict(test[predictors])
reg_pred = pd.DataFrame(reg_pred, columns = ['reg_pred'], index = test.index)

In [None]:
combination = pd.concat([test[['Player', 'Share']], reg_pred], axis = 1 )
combination.sort_values("Share", ascending = False).head(10)

In [None]:
from sklearn.metrics import mean_squared_error
mean_squared_error(combination['Share'], combination['reg_pred'])

In [None]:
combination = combination.sort_values("Share", ascending = False)
combination["Rk"] = list(range(1, combination.shape[0] + 1))

In [None]:
combination.head(10)

In [None]:
combination = combination.sort_values("reg_pred", ascending = False)
combination["Prediction Rk"] = list(range(1, combination.shape[0] + 1))

In [None]:
combination.head(10)

In [None]:
combination.sort_values("Share", ascending = False).head(10)

In [None]:
def ap(combination) :
    actual = combination.sort_values("Share", ascending = False).head()
    pred = combination.sort_values("reg_pred", ascending = False)
    ps = []
    found = 0
    seen = 1
    for index, row in pred.iterrows() : 
        if row["Player"] in actual["Player"].values :
            found += 1
            ps.append(found/seen)
        seen += 1
    return sum(ps)/len(ps)

In [None]:
ap(combination)

In [None]:
# We got average precision of 93% to predict the top 5 MVP candidates in 2022.

In [None]:
import numpy as np
feature_importances = np.sort(reg_model.feature_importances_)
feature_names = predictors  # Assuming your features are in a DataFrame
sns.barplot(x=feature_importances, y=feature_names)
plt.xlabel("Feature Importance")
plt.ylabel("Features")
plt.title("Feature Importance")
plt.show()