In [1]:
from fastai.imports import *
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from IPython.display import display
from sklearn import metrics
# Season CSVs, given relative to the notebook's working directory.
# `path` is loaded into the frame the model is fitted on; `valid_path` into the frame it is scored on.
path = '2019-2020 NBA Player Stats.csv'
valid_path = '2018-2019 NBA Player Stats.csv'

In [2]:
# Read the training season and the validation season with the same cp1252 decoding,
# then preview the first rows of the training frame.
df, valid_df = (pd.read_csv(csv_file, encoding='cp1252') for csv_file in (path, valid_path))
df.head()

Unnamed: 0,FULL NAME,TEAM,POS,AGE,GP,MPG,Minutes Percentage,Usage Rate,Turnover Rate,FTA,...,Rebounds per game.,Total Rebound Percentage,Assists per game.,Assist Percentage,Steals per game.,Blocks per game.,Turnovers per game.,Versatility index,Offensive Rating,Defensive Rating
0,Steven Adams,Okc,C,27.07,63,26.7,55.6,17.3,14.4,201,...,9.3,19.2,2.3,13.2,0.81,1.06,1.51,9.2,122.0,101.9
1,Bam Adebayo,Mia,C-F,23.08,72,33.6,69.9,21.2,17.5,382,...,10.2,17.0,5.1,24.2,1.14,1.29,2.82,11.2,116.2,102.2
2,LaMarcus Aldridge,San,F-C,35.07,53,33.1,68.9,23.4,7.8,191,...,7.4,12.0,2.4,11.4,0.68,1.64,1.4,8.4,114.7,109.1
3,Kyle Alexander,Mia,F-C,23.81,2,6.7,14.0,9.9,33.3,0,...,1.5,12.5,0.0,0.0,0.0,0.0,0.5,0.0,96.0,108.8
4,Nickeil Alexander-Walker,Nor,G,21.95,47,12.6,26.2,23.3,16.1,37,...,1.8,7.5,1.9,21.1,0.36,0.17,1.15,8.5,93.2,106.4


In [3]:
def parse_position(df):
    """Add boolean position-flag columns to ``df`` in place.

    Reads the ``POS`` column and writes four new columns:

    * ``C``, ``G``, ``F`` -- True when that letter occurs in the row's ``POS`` string.
    * ``Multiple Positions`` -- True when at least two of the three flags are set.

    A missing ``POS`` value produces False for every flag instead of NaN, so the
    flag columns stay strictly boolean and the combination below is well defined.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``POS`` column. Modified in place.

    Returns
    -------
    None
    """
    position_letters = ("C", "G", "F")
    for letter in position_letters:
        # regex=False: the letter is a literal substring, not a pattern.
        # na=False: missing POS -> False rather than NaN (which would turn the column into object dtype).
        df[letter] = df["POS"].str.contains(letter, regex=False, na=False)
    # "Any two of the three flags" is the same as the pairwise (C&G)|(C&F)|(F&G) test.
    df["Multiple Positions"] = df[list(position_letters)].sum(axis=1) >= 2

def clean_data(df, min_games = 5):
    """Return the rows of ``df`` that are complete and have enough games played.

    A row is kept when it has no missing value in any column and its ``GP``
    value is strictly greater than ``min_games``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``GP`` column. Not modified.
    min_games : int or float, default 5
        Exclusive lower bound on ``GP``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows (original index labels kept),
        so later column assignments on the result neither warn nor touch ``df``.
    """
    has_no_missing = df.notna().all(axis = 1)
    played_enough = df["GP"] > min_games
    # .copy() detaches the result from the source frame; a bare boolean-mask slice
    # triggers SettingWithCopyWarning when new columns are assigned to it afterwards.
    return df[has_no_missing & played_enough].copy()

# Training season: add the position flags, then keep only rows that pass clean_data.
parse_position(df)
df = clean_data(df)

# Validation season: identical treatment.
parse_position(valid_df)
valid_df = clean_data(valid_df)

df.head()

Unnamed: 0,FULL NAME,TEAM,POS,AGE,GP,MPG,Minutes Percentage,Usage Rate,Turnover Rate,FTA,...,Steals per game.,Blocks per game.,Turnovers per game.,Versatility index,Offensive Rating,Defensive Rating,C,G,F,Multiple Positions
0,Steven Adams,Okc,C,27.07,63,26.7,55.6,17.3,14.4,201,...,0.81,1.06,1.51,9.2,122.0,101.9,True,False,False,False
1,Bam Adebayo,Mia,C-F,23.08,72,33.6,69.9,21.2,17.5,382,...,1.14,1.29,2.82,11.2,116.2,102.2,True,False,True,True
2,LaMarcus Aldridge,San,F-C,35.07,53,33.1,68.9,23.4,7.8,191,...,0.68,1.64,1.4,8.4,114.7,109.1,True,False,True,True
4,Nickeil Alexander-Walker,Nor,G,21.95,47,12.6,26.2,23.3,16.1,37,...,0.36,0.17,1.15,8.5,93.2,106.4,False,True,False,False
5,Grayson Allen,Mem,G,24.85,38,18.9,39.4,17.6,10.9,45,...,0.32,0.05,0.87,6.3,113.6,108.9,False,True,False,False


In [4]:
def convert_rate(df):
    """Return a copy of ``df`` with volume stats replaced by values divided by ``MPG``.

    Seven new ``"... per minute"`` columns are appended (each is the source
    column divided element-wise by ``MPG``), after which the source columns and
    several other columns are dropped. ``MPG`` itself is kept.

    The input frame is left untouched: the new columns are built with
    ``DataFrame.assign`` instead of being written into ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``MPG`` plus every column named below.

    Returns
    -------
    pandas.DataFrame
        New frame; the remaining original columns keep their order and the
        rate columns follow in the order listed in ``rate_sources``.

    Raises
    ------
    KeyError
        If a source column or a column scheduled for dropping is missing.
    """
    # new column name -> column that is divided by MPG (insertion order = output order)
    rate_sources = {
        "Steals per minute": "Steals per game.",
        "Blocks per minute": "Blocks per game.",
        # NOTE(review): the FTA values in the preview look like season totals rather than
        # per-game figures; if so FTA/MPG (and possibly 2PA/3PA) is not a true per-minute
        # rate -- confirm against the data dictionary.
        "FTA per minute": "FTA",
        "2PA per minute": "2PA",
        "3PA per minute": "3PA",
        "Points per minute": "Points per game.",
        "Assists per minute": "Assists per game.",
    }
    minutes = df["MPG"]
    rate_columns = {new_name: df[source] / minutes for new_name, source in rate_sources.items()}

    dropped = ["POS", "TEAM", "Steals per game.", "Blocks per game.",
               "FTA", "2PA", "Points per game.", "Rebounds per game.",
               "3PA", "Turnovers per game.", "Assists per game.",
               "Minutes Percentage", "GP", "Effective Shooting Percentage",
               "True Shooting Percentage"]
    return df.assign(**rate_columns).drop(columns = dropped)
# Apply the same rate conversion to the training and the validation frame.
df, valid_df = convert_rate(df), convert_rate(valid_df)

In [5]:
# Model inputs: every column except the player name (an identifier) and MPG (the target).
train_features = df.drop(columns = ["FULL NAME", "MPG"])
valid_features = valid_df.drop(columns = ["FULL NAME", "MPG"])

# Extra-trees draws random split thresholds; fixing random_state makes the fitted
# model, and therefore both scores below, reproducible across kernel restarts.
m = ExtraTreesRegressor(n_jobs = -1, min_samples_leaf = 2, max_features = 0.4,
                        random_state = 42)
m.fit(train_features, df["MPG"])

# Score on the training season (printed) and on the validation season (cell output).
print(m.score(train_features, df["MPG"]))
m.score(valid_features, valid_df["MPG"])

0.9537740616218002


0.5048113283305615

In [6]:
# Predicted vs. actual MPG for the validation season, side by side.
# 'Actual' is a Series, so the table keeps valid_df's row labels; the prediction
# array is placed positionally against them.
comp = pd.DataFrame(data =
                    {'Prediction': m.predict(valid_df.drop(columns = ["FULL NAME", "MPG"])),
                     'Actual': valid_df["MPG"]})
# Display the table so a fresh run reproduces the output recorded under this cell.
comp

Unnamed: 0,Prediction,Actual
0,16.780733,19.0
1,13.206833,12.3
2,17.192667,12.6
3,24.147167,33.4
4,20.278667,23.3
...,...,...
615,32.276000,30.9
616,20.549917,25.4
619,22.893150,18.3
620,21.883833,15.6


In [7]:
# Pair each model input column with its importance from the fitted model.
# The column list is rebuilt with the same drop used for fitting, so names and
# importances line up by position.
imp = pd.DataFrame(data = {'Feature': df.drop(columns = ["FULL NAME", "MPG"]).columns,
                           'Importance': m.feature_importances_})
# Display the table so a fresh run reproduces the output recorded under this cell.
imp

Unnamed: 0,Feature,Importance
0,AGE,0.032713
1,Usage Rate,0.068751
2,Turnover Rate,0.025497
3,FT%,0.064431
4,2P%,0.03111
5,3P%,0.043523
6,Total Rebound Percentage,0.020329
7,Assist Percentage,0.033696
8,Versatility index,0.027456
9,Offensive Rating,0.055691
