In [None]:
import csv
import os
import shutil
import time
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Seasons to scrape: 1985 through 2021 inclusive (the range end is exclusive).
years = [*range(1985, 2022)]

In [None]:
# URL template for a season's awards page; '{}' is replaced with the year via str.format.
url_start = "https://www.basketball-reference.com/awards/awards_{}.html"

In [None]:
# Download each season's awards page and cache the raw HTML as mvp/<year>.html.
os.makedirs('mvp', exist_ok=True)  # open() below does not create the folder itself

for year in years:
    url = url_start.format(year)
    data = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx responses (e.g. a rate-limit page) instead of
    # caching an error page as if it were data.
    data.raise_for_status()
    with open('mvp/{}.html'.format(year), 'w+', encoding = 'utf-8') as f:
        f.write(data.text)
    # Pause between requests so the scrape is less likely to be throttled.
    # NOTE(review): the site is understood to allow roughly 20 requests per
    # minute -- confirm against its current policy and tune this delay.
    time.sleep(4)

In [None]:
# Parse the MVP voting table out of every cached awards page (one DataFrame per season).
dfs = []

for year in years:
    with open("mvp/{}.html".format(year), encoding ="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')
    # Remove the first 'over_header' row before pandas reads the table
    # (presumably so the columns come out single-level; later cells select
    # columns such as 'Pts Won' by plain name). Guarded because find() returns
    # None when the row is absent.
    over_header = soup.find('tr', class_ = "over_header")
    if over_header is not None:
        over_header.decompose()

    mvp_table = soup.find(id='mvp')
    if mvp_table is None:
        raise ValueError("No element with id='mvp' found in mvp/{}.html".format(year))

    # Wrap the markup in StringIO: passing literal HTML strings to read_html is deprecated.
    mvp_df = pd.read_html(StringIO(str(mvp_table)))[0]
    mvp_df['Year'] = year

    dfs.append(mvp_df)

In [None]:
# Stack the per-season tables collected in `dfs` into a single DataFrame.
mvps = pd.concat(dfs)

In [None]:
# Preview the first rows of the combined MVP table.
mvps.head()

In [None]:
# Save the combined MVP table so later cells can reload it from mvps.csv.
mvps.to_csv('mvps.csv')

Let's try doing this with player stats now.

1. The Basketball Reference player stats table is rendered with JavaScript, so we will have to account for that.

In [None]:
# URL template for a season's per-game player stats page; '{}' is replaced with the year.
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

In [None]:
 for year in years:
    url = player_stats_url.format(year)
    data = requests.get(url)
    with open('player/{}.html'.format(year), 'w+', encoding = 'utf-8') as f:
        f.write(data.text)

In [None]:
# Parse the per-game stats table out of every cached player page (one DataFrame per season).
df = []
for year in years:

    with open('player/{}.html'.format(year), encoding = 'utf-8') as f:
        page = f.read()

    # Parsing happens after the file is closed; only the read needs the handle.
    soup = BeautifulSoup(page, 'html.parser')
    player_table = soup.find(id='per_game_stats')
    if player_table is None:
        raise ValueError("No element with id='per_game_stats' found in player/{}.html".format(year))

    # Remove every 'thead' row inside the table body, not just the first one,
    # so no header row is read as if it were a player
    # (presumably these are the header rows the site repeats within long tables -- confirm).
    # Restricting the search to <tbody> leaves the real column header untouched.
    for body in player_table.find_all('tbody'):
        for header_row in body.find_all('tr', class_ = "thead"):
            header_row.decompose()

    # Wrap the markup in StringIO: passing literal HTML strings to read_html is deprecated.
    player_df = pd.read_html(StringIO(str(player_table)))[0]
    player_df['Year'] = year

    df.append(player_df)

In [None]:
# NOTE(review): at this point `player_df` is still the last season parsed by the
# loop above; the per-season frames are only concatenated in the following cell.
player_df.head()

In [None]:
# Combine the per-season frames in `df` into one table (rebinding `player_df`).
player_df = pd.concat(df)

In [None]:
# Save the combined player stats so later cells can reload them from players.csv.
player_df.to_csv('players.csv')

Scrape team records per year

In [None]:
# URL template for a season's standings page; '{}' is replaced with the year.
team_stats_url = 'https://www.basketball-reference.com/leagues/NBA_{}_standings.html'

In [None]:
# Download each season's standings page and cache the raw HTML as team/<year>.html.
os.makedirs('team', exist_ok=True)  # open() below does not create the folder itself

for year in years:
    url = team_stats_url.format(year)
    data = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx responses (e.g. a rate-limit page) instead of
    # caching an error page as if it were data.
    data.raise_for_status()

    with open('team/{}.html'.format(year), 'w+', encoding = 'utf-8') as f:
        f.write(data.text)
    # Pause between requests so the scrape is less likely to be throttled.
    # NOTE(review): the site is understood to allow roughly 20 requests per
    # minute -- confirm against its current policy and tune this delay.
    time.sleep(4)

In [None]:
# Parse the Eastern and Western standings tables out of every cached page.
# Each conference/season pair contributes one DataFrame to `dfs`.
dfs = []

for year in years:
    with open('team/{}.html'.format(year), encoding = 'utf-8') as f:
        page = f.read()

    # Both conferences are handled by the same code; only the table id suffix
    # ('E' / 'W') and the name of the team column differ.
    for conference in ('Eastern Conference', 'Western Conference'):
        # Re-parse the page for each conference so each pass starts from the
        # unmodified markup, as before.
        soup = BeautifulSoup(page, 'html.parser')

        # Only the first 'thead' row on the page is removed here. Other such rows
        # stay in the data and are filtered out later, when rows containing
        # 'Division' are dropped.
        first_header = soup.find('tr', class_='thead')
        if first_header is not None:
            first_header.decompose()

        table_id = 'divs_standings_{}'.format(conference[0])
        team_table = soup.find(id = table_id)
        if team_table is None:
            raise ValueError("No element with id='{}' found in team/{}.html".format(table_id, year))

        # Wrap the markup in StringIO: passing literal HTML strings to read_html is deprecated.
        team = pd.read_html(StringIO(str(team_table)))[0]
        team['Year'] = year
        # Store the team name under a common 'Team' column for both conferences.
        team['Team'] = team[conference]
        del team[conference]
        dfs.append(team)

In [None]:
# Combine all conference tables, then replace the '—' placeholder in 'GB' with "0"
# (presumably the dash marks the team leading its division -- confirm).
teams = pd.concat(dfs)
teams['GB'] = teams['GB'].str.replace('—',"0",regex=False)

In [None]:
# Keep only rows whose 'W' value does not contain 'Division'
# (presumably division-name separator rows carried over from the HTML tables).
teams = teams[~teams['W'].str.contains('Division')]

In [None]:
# Strip the '*' marker from team names.
teams['Team'] = teams['Team'].str.replace('*',"", regex=False)

In [None]:
# Save the cleaned standings to teams.csv (the index is written as the first, unnamed column).
teams.to_csv('teams.csv')

Data Cleaning

In [None]:
# Reload the MVP voting table saved earlier.
mvps = pd.read_csv('mvps.csv')

In [None]:
# Keep only the identifying and voting columns; the players table already contains the game stats.
mvps = mvps[['Player','Year','Pts Won','Pts Max', 'Share',]]

Clean player data and combine with MVP table


In [None]:
# Reload the per-game player stats saved earlier and display them.
players = pd.read_csv('players.csv')
players

In [None]:
# Discard the two bookkeeping columns that are not needed for the analysis.
players = players.drop(columns=['Unnamed: 0', 'Rk'])

In [None]:
# Remove the '*' marker next to player names so they match the names in the MVP table when joining.
players['Player'] = players['Player'].str.replace('*','', regex=False)

In [None]:
# A player can have several rows in one season (for example after a trade),
# so rows are grouped by player and season.
# NOTE(review): this groupby result is not assigned to anything; the actual
# de-duplication happens in the following cell.
players.groupby(['Player','Year'])

In [None]:
def single_row(df):
    """Collapse the rows of one player-season group into a single record.

    Parameters
    ----------
    df : pandas.DataFrame
        All rows belonging to one (Player, Year) group. Must contain a 'Tm' column.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when it has at most one row. Otherwise a copy of the
        row(s) whose 'Tm' equals 'TOT', with 'Tm' overwritten by the 'Tm' value
        of the group's last row. The result is empty when a multi-row group
        contains no 'TOT' row.
    """
    # Nothing to collapse for single-row groups; an empty group is returned
    # as-is instead of failing on the positional lookup below.
    if df.shape[0] <= 1:
        return df

    # Copy the filtered rows so the assignment below writes to an independent
    # frame rather than to a slice of `df` (avoids chained-assignment warnings).
    row = df[df['Tm'] == 'TOT'].copy()
    row['Tm'] = df['Tm'].iloc[-1]
    return row
# Reduce every (Player, Year) group with single_row and stitch the results back together.
players = players.groupby(['Player','Year']).apply(single_row)

In [None]:
# Drop the outermost index level (presumably the 'Player' group key added by the groupby/apply).
players.index = players.index.droplevel()

In [None]:
# Drop the next outermost index level as well (presumably the 'Year' group key).
players.index = players.index.droplevel()

In [None]:
# Spot-check the rows of one player.
players[players['Player'] == 'LeBron James']

In [None]:
# Outer-join player stats with MVP voting on player name and season.
combined = players.merge(mvps, how ='outer', on = ['Player','Year'])

In [None]:
# Save the merged player/MVP table.
combined.to_csv('combined_plmp.csv')

In [None]:
# Display the players table.
players

In [None]:
# Show only the rows with 'Pts Won' greater than 0.
combined[combined['Pts Won'] > 0]

Address NaN values

In [None]:
# Replace every missing value with 0 (the outer merge leaves NaN wherever one side had no match).
combined = combined.fillna(0)

In [None]:
# Convert every column that parses cleanly as numbers and leave the others untouched.
# Written as an explicit try/except because pd.to_numeric(errors='ignore') is deprecated.
combined = combined.copy()
for column in combined.columns:
    try:
        combined[column] = pd.to_numeric(combined[column])
    except (ValueError, TypeError):
        # Not a numeric column: keep it exactly as it is.
        continue

In [None]:
# Inspect the resulting column dtypes.
combined.dtypes

In [None]:
# Reload the team standings saved earlier.
teams = pd.read_csv('teams.csv')

Team names are not consistent across tables, so let's address that.


In [None]:
# List the distinct team names in the standings table.
teams['Team'].unique()

In [None]:
# Rewrite 'CHA' as 'CHB' in 'Tm' (presumably to match the keys used in team_abbr.csv -- confirm).
combined['Tm'] = combined['Tm'].str.replace('CHA','CHB',regex=False)

In [None]:
# List the distinct team abbreviations in the combined table.
combined['Tm'].unique()

In [None]:
# Build the abbreviation -> team name lookup from team_abbr.csv.
abbr = {}

# csv.reader handles quoted fields (e.g. a name containing a comma), which a
# plain str.split(",") does not. newline='' is what the csv module expects.
with open('team_abbr.csv', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row
    for fields in reader:
        if not fields:
            continue  # tolerate blank lines instead of failing on them
        # Unpacking still raises ValueError for rows that do not have exactly two fields.
        abbrev, name = fields
        abbr[abbrev] = name

In [None]:
# Inspect the abbreviation -> team name mapping.
abbr

In [None]:
# Translate each 'Tm' abbreviation through the `abbr` mapping; values without an entry become NaN.
combined['Team'] = combined['Tm'].map(abbr)

In [None]:
# Left-join the team standings onto every row of `combined` by team name and season.
stats = combined.merge(teams, how = 'left', on = ['Team','Year'])

In [None]:
# Display the merged table.
stats

In [None]:
# Drop the leftover 'Unnamed: 0' column (presumably the index saved into teams.csv).
del stats['Unnamed: 0']

In [None]:
# Inspect the column dtypes before numeric conversion.
stats.dtypes

In [None]:
# Convert every column that parses cleanly as numbers and leave the others untouched.
# Written as an explicit try/except because pd.to_numeric(errors='ignore') is deprecated.
stats = stats.copy()
for column in stats.columns:
    try:
        stats[column] = pd.to_numeric(stats[column])
    except (ValueError, TypeError):
        # Not a numeric column: keep it exactly as it is.
        continue

In [None]:
# Inspect the column dtypes after numeric conversion.
stats.dtypes

In [None]:
# Save the full player/MVP/team table.
stats.to_csv('player_mvp_stats.csv')

In [None]:
# Top 10 rows by 'PTS' among rows with 'G' greater than 70.
scoring_leaders = stats[stats['G'] > 70].sort_values('PTS', ascending = False).head(10)

In [None]:
# Bar chart of 'PTS' per player for the rows selected above.
scoring_leaders.plot.bar('Player', 'PTS')

In [None]:
# For each season, keep the single row with the highest 'PTS'.
high_scores = stats.groupby('Year').apply(lambda x: x.sort_values('PTS',ascending=False).head(1))

In [None]:
# Bar chart of each season's highest 'PTS' value.
high_scores.plot.bar('Year','PTS')

Check correlation

In [None]:
# Correlation of every numeric column with 'Share'.
# Text columns are excluded explicitly: recent pandas versions raise in corr()
# when non-numeric columns are present instead of silently skipping them.
stats.select_dtypes(include='number').corr()['Share']

In [None]:
# Bar chart of each numeric column's correlation with 'Share'.
# Text columns are excluded explicitly: recent pandas versions raise in corr()
# when non-numeric columns are present instead of silently skipping them.
stats.select_dtypes(include='number').corr()['Share'].plot.bar()

Check for nulls

In [None]:
# Number of missing values in each column.
stats.isnull().sum()

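We will leave Player, Pts Won, Pts Max, and Share out of the predictors, because these values are (or directly describe) what we are trying to predict, and leaving them in could lead to correlation issues. They stay in the table for now and are simply excluded from the predictor list defined later. We will also remove 'Team' and 'Tm', as these columns were only used to join the win/loss data.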

In [None]:
# Drop the two columns that were only needed for joining the standings.
# errors='ignore' keeps this cell re-runnable: without it a second run fails
# with KeyError because the columns are already gone.
stats = stats.drop(columns=['Team', 'Tm'], errors='ignore')

Before we actually start evaluating our model, we will have to account for our categorical variable, 'Pos' (player position). We can use One-Hot Encoding to create dummy variables and represent each position as a binary vector. To simplify our model, I will replace any multi-positional indicator such as 'PF-C' (Power Forward - Center) with just the first position indicated.

In [None]:
# List the distinct position labels.
stats['Pos'].unique()

In [None]:
# Remove every '-XX' suffix so a multi-position label such as 'PF-C' becomes 'PF'.
stats['Pos'] = stats['Pos'].replace(r'-\w+', '', regex=True)

In [None]:
# One-hot encode 'Pos' and pass every other column through unchanged.
transformer = make_column_transformer((OneHotEncoder(), ['Pos']), remainder = 'passthrough')

In [None]:
# Fit and apply the transformer, then rebuild a DataFrame using the transformer's output feature names.
transformed = transformer.fit_transform(stats)
enc_stats = pd.DataFrame(transformed, columns = transformer.get_feature_names_out())

In [None]:
# Strip the leading '<name>__' prefix (word characters followed by a double underscore) from the column names.
enc_stats.columns = enc_stats.columns.str.replace(r'^\w+__','',regex=True)

In [None]:
# Convert every column that parses cleanly as numbers and leave the others untouched.
# Written as an explicit try/except because pd.to_numeric(errors='ignore') is deprecated.
enc_stats = enc_stats.copy()
for column in enc_stats.columns:
    try:
        enc_stats[column] = pd.to_numeric(enc_stats[column])
    except (ValueError, TypeError):
        # Not a numeric column: keep it exactly as it is.
        continue

In [None]:
# Inspect the dtypes of the encoded table.
enc_stats.dtypes

In [None]:
# List the column names of the encoded table.
enc_stats.columns

Select/define features for cross validation and modelling

In [None]:
# Feature columns used for modelling. 'Player', 'Pts Won', 'Pts Max' and 'Share' are not listed.
predictors = ['Pos_C', 'Pos_PF', 'Pos_PG', 'Pos_SF', 'Pos_SG', 'Age', 'G',
       'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%',
       'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK',
       'TOV', 'PF', 'PTS', 'Year', 'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS']

In [None]:
# Feature matrix: the predictor columns only.
X_mvp = enc_stats[predictors]

In [None]:
# Target: the 'Share' column, kept as a single-column DataFrame.
y_mvp = enc_stats[['Share']]

Let's use a random forest model since we have a lot of categorical variables. The forest will have 400 trees, and an internal node will only be split if it contains at least 5 samples.

In [None]:
# Scoring functions used during cross-validation.
def _rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Computed as the square root of the MSE rather than through the ``squared``
    keyword of ``mean_squared_error``, which recent scikit-learn releases have
    deprecated.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


# 'MSE' uses the built-in negated scorer, while 'RMSE' is reported as a
# positive value (make_scorer defaults to greater_is_better=True), as before.
regression_score = {'R^2 score' : 'r2',
    'Explained Variance' : 'explained_variance',
    'MSE' : 'neg_mean_squared_error',
    'RMSE' : make_scorer(_rmse)}

def cross_validation_score(model, X, y, scoring, CV):
    """Cross-validate ``model`` and summarise each reported metric.

    ``y`` is flattened with ``y.values.ravel()`` before being handed to
    ``cross_validate``. The result maps every key returned by
    ``cross_validate`` to a tuple ``(mean, [std])``, both rounded to three
    decimals.
    """
    cv_results = cross_validate(model, X, y.values.ravel(), scoring=scoring, cv=CV)

    summary = {}
    for metric, values in cv_results.items():
        average = round(np.mean(values), 3)
        spread = round(np.std(values), 3)
        summary[metric] = (average, [spread])
    return summary

In [None]:
# Random forest regressor: 400 trees, fixed random_state for repeatable runs,
# and a node must hold at least 5 samples before it may be split.
rf = RandomForestRegressor(n_estimators=400, random_state = 1, min_samples_split = 5, criterion = 'squared_error')

In [None]:
# Cross-validate the forest with cv=10 and display the (mean, [std]) summary for each metric.
rf_scores = cross_validation_score(rf, X_mvp, y_mvp, regression_score, 10)
rf_scores