In [1]:
### Basic Imports
import pandas as pd
import numpy as np

### Graphic libraries
import matplotlib.pyplot as plt
import seaborn as sns 

import statistics 


### Importing the relevant ML libraries 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import mean_squared_error

from sklearn.metrics import accuracy_score,roc_curve,auc,recall_score,f1_score,precision_score,classification_report,confusion_matrix,auc

import statsmodels.api as sm

### Some cosmetics add-ons
plt.style.use('fivethirtyeight')
%matplotlib inline

# Load dataset

In [6]:
df = pd.read_csv('nba_2020_clean.csv')

# Get significant features

In [13]:
features_list = ['DIS_ELO_x', 'HOME_COURT_x', 'DIS_OFFRATE_x', 'DIS_DEFRATE_x', 'DIS_PTS_x', 'DIS_AST_x', 'DIS_OREB_x', 'DIS_DREB_x']
target = 'WL_x'

train_data = df[(df['GAME_PLAYED_x'] <= 41) | (df['GAME_PLAYED_y'] <= 41 )]
test_data = df[(df['GAME_PLAYED_x'] > 41) & (df['GAME_PLAYED_y'] > 41 )]

# Creating our independent and dependent variables
x = train_data[features_list]
y = train_data['PLUS_MINUS_x']

model = sm.OLS(y,x)
results = model.fit()

features_list = []
for i in range(len(x.keys())):
    if results.pvalues[i] <= 0.05:
        features_list.append(model.exog_names[i])

features_list


['DIS_ELO_x', 'DIS_OFFRATE_x']

In [8]:
results.summary()

0,1,2,3
Dep. Variable:,PLUS_MINUS_x,R-squared (uncentered):,0.086
Model:,OLS,Adj. R-squared (uncentered):,0.073
Method:,Least Squares,F-statistic:,6.999
Date:,"Tue, 30 Mar 2021",Prob (F-statistic):,7.59e-09
Time:,23:46:13,Log-Likelihood:,-2462.4
No. Observations:,605,AIC:,4941.0
Df Residuals:,597,BIC:,4976.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
DIS_ELO_x,0.0337,0.012,2.855,0.004,0.011,0.057
HOME_COURT_x,1.4695,0.830,1.770,0.077,-0.161,3.100
DIS_OFFRATE_x,40.0128,12.594,3.177,0.002,15.279,64.746
DIS_DEFRATE_x,-3.8933,2.549,-1.528,0.127,-8.898,1.112
DIS_PTS_x,-0.0845,0.096,-0.882,0.378,-0.273,0.104
DIS_AST_x,0.1900,0.213,0.892,0.373,-0.229,0.609
DIS_OREB_x,-0.2326,0.271,-0.859,0.391,-0.765,0.299
DIS_DREB_x,0.0383,0.220,0.174,0.862,-0.394,0.471

0,1,2,3
Omnibus:,4.919,Durbin-Watson:,1.946
Prob(Omnibus):,0.085,Jarque-Bera (JB):,5.287
Skew:,-0.132,Prob(JB):,0.0711
Kurtosis:,3.374,Cond. No.,1410.0


# Get performance of each model

In [14]:
models_dict = {
    'Linear Regression': LinearRegression(),
    'Logistic Regression':LogisticRegression(),
    'Naive Bayes':GaussianNB(),
    'Decision Trees':DecisionTreeClassifier(),
    'SVM linear': svm.SVC(kernel='linear'),
    'SVM rbf': svm.SVC(kernel='rbf'),
    'Random Forest': RandomForestClassifier(n_estimators = 100),
    # 'XGBoost': xgb.XGBClassifier(use_label_encoder=False)
}

performance_table = {'f1':[],'acc':[]}

for model_name in models_dict:
    X_train = train_data[features_list]
    X_test = test_data[features_list]
    y_train = train_data['WL_x']
    y_test = test_data['WL_x']
    
    m = models_dict[model_name]

    if model_name == 'Linear Regression':
        y_train = train_data['PLUS_MINUS_x']
    
    m.fit(X_train, y_train)
    predictions = m.predict(X_test)

    if model_name == 'Linear Regression':
        for i, v in enumerate(predictions):
            if v > 0:
                predictions[i] = 1
            else:
                predictions[i] = 0
    
    f1 = round(f1_score(y_test,predictions),4)
    acc = round(accuracy_score(y_test,predictions),4)
    
    performance_table['f1'].append(f1)
    performance_table['acc'].append(acc)

# for k, v in performance_table.items():
#     performance_df = pd.DataFrame(v,index=list(models_dict.keys()))
#     print(performance_df)

performance_df = pd.DataFrame(performance_table, index=list(models_dict.keys()))
performance_df

Unnamed: 0,f1,acc
Linear Regression,0.7733,0.7424
Logistic Regression,0.7297,0.697
Naive Bayes,0.7733,0.7424
Decision Trees,0.5867,0.5303
SVM linear,0.7297,0.697
SVM rbf,0.7561,0.697
Random Forest,0.6486,0.6061


# Add ensemble model prediction column for each row in test data

In [15]:
models_dict = {
    'Linear Regression': LinearRegression(),
    'Logistic Regression':LogisticRegression(),
    'Naive Bayes':GaussianNB(),
    # 'Decision Trees':DecisionTreeClassifier(),
    'SVM linear': svm.SVC(kernel='linear'),
    'SVM rbf': svm.SVC(kernel='rbf'),
    # 'Random Forest': RandomForestClassifier(n_estimators = 100),
    # 'XGBoost': xgb.XGBClassifier(use_label_encoder=False)
}
test_data['Prediction'] = 0
for index, row in test_data.iterrows():
    test_df = row.to_frame().T
    prediction_data = {}
    for model_name in models_dict:
        X_train = train_data[features_list]
        X_test = test_df[features_list]
        y_train = train_data['WL_x']
        
        m = models_dict[model_name]

        if model_name == 'Linear Regression':
            y_train = train_data['PLUS_MINUS_x']
        
        m.fit(X_train, y_train)
        prediction = m.predict(X_test)

        if model_name == 'Linear Regression':
            if prediction[0] > 0:
                prediction[0] = 1
            else:
                prediction[0] = 0
                        
        prediction_data[model_name] = prediction[0]
        
    final_prediction = 0
    for k, v in prediction_data.items():
        final_prediction += v

    final_prediction = round(final_prediction / 5)
    test_data['Prediction'][index] = final_prediction


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Prediction'][index] = final_prediction
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predict

# Get performance of ensemble model

In [17]:
y_test = test_data['WL_x']
f1 = f1_score(y_test,test_data['Prediction'])
acc = accuracy_score(y_test,test_data['Prediction'])
print("f1 score: {}\nacc score: {}".format(round(f1,4),round(acc,4)))

f1 score: 0.7632
acc score: 0.7273


# Ensemble performance - prediction for every game based on all previous games

In [3]:
features_list = ['DIS_ELO_x', 'HOME_COURT_x', 'DIS_OFFRATE_x', 'DIS_DEFRATE_x', 'DIS_PTS_x', 'DIS_AST_x', 'DIS_OREB_x', 'DIS_DREB_x']
target = 'WL_x'

# predict and test 2nd half of season
test_data = df[(df['GAME_PLAYED_x'] > 41) & (df['GAME_PLAYED_y'] > 41 )]
test_data['Prediction'] = 0

models_dict = {
    'Linear Regression': LinearRegression(),
    'Logistic Regression':LogisticRegression(),
    'Naive Bayes':GaussianNB(),
    # 'Decision Trees':DecisionTreeClassifier(),
    'SVM linear': svm.SVC(kernel='linear'),
    'SVM rbf': svm.SVC(kernel='rbf'),
    # 'Random Forest': RandomForestClassifier(n_estimators = 100),
    # 'XGBoost': xgb.XGBClassifier(use_label_encoder=False)
}

for index, row in test_data.iterrows():
    # Creating training dataset for all previous games
    idx = df.index[df['GAME_ID'] == row['GAME_ID']]
    train_df = df.loc[:idx[0]-1]
    test_df = row.to_frame().T

    # Creating our independent and dependent variables
    x = train_df[features_list]
    y = train_df['PLUS_MINUS_x']

    model = sm.OLS(y,x)
    results = model.fit()

    # Extracting significant features
    features_list = []
    for i in range(len(x.keys())):
        if results.pvalues[i] <= 0.05:
            features_list.append(model.exog_names[i])

    # Predicting game outcome
    prediction_data = {}
    for model_name in models_dict:
        X_train = train_df[features_list]
        y_train = train_df['WL_x']
        X_test = test_df[features_list]

        m = models_dict[model_name]

        if model_name == 'Linear Regression':
            y_train = train_df['PLUS_MINUS_x']
        
        m.fit(X_train, y_train)
        prediction = m.predict(X_test)

        if model_name == 'Linear Regression':
            if prediction[0] > 0:
                prediction[0] = 1
            else:
                prediction[0] = 0
                        
        prediction_data[model_name] = prediction[0]
        
    final_prediction = 0
    for k, v in prediction_data.items():
        final_prediction += v

    final_prediction = round(final_prediction / 5)

    # Appending predicted outcome to test_data
    test_data['Prediction'][index] = final_prediction


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Prediction'][index] = final_prediction
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predict

In [16]:
y_test = test_data['WL_x']

f1 = f1_score(y_test,test_data['Prediction'])
accuracy = accuracy_score(y_test,test_data['Prediction'])
precision = precision_score(y_test,test_data['Prediction'])
recall = recall_score(y_test,test_data['Prediction'])

print("Ensemble model")
print("-"*20)
print("accuracy: {}\nprecision: {}\nrecall: {}\nf1 score: {}".format(round(accuracy,4),round(precision,4),round(recall,4),round(f1,4)))

Ensemble model
--------------------
accuracy: 0.7273
precision: 0.8286
recall: 0.7073
f1 score: 0.7632


In [5]:
test_data

Unnamed: 0,SEASON_ID_x,TEAM_ID_x,TEAM_ABBREVIATION_x,TEAM_NAME_x,GAME_ID,GAME_DATE_x,MATCHUP_x,WL_x,MIN_x,PTS_x,...,DEFRATE_y,ELO_y,DIS_PTS_y,DIS_AST_y,DIS_OREB_y,DIS_DREB_y,DIS_OFFRATE_y,DIS_DEFRATE_y,DIS_ELO_y,Prediction
592,22020,1610612758,SAC,Sacramento Kings,22000641,2021-03-20,SAC @ PHI,0,240,105,...,1.137078,1595.038811,0.365854,-2.121951,-0.024390,3.707317,-0.009096,0.035032,140.548588,0
596,22020,1610612739,CLE,Cleveland Cavaliers,22000648,2021-03-21,CLE vs. TOR,1,241,116,...,1.161803,1440.647729,8.682927,2.219512,-2.073171,-0.121951,0.071374,0.129877,17.220144,0
598,22020,1610612752,NYK,New York Knicks,22000650,2021-03-21,NYK vs. PHI,0,266,100,...,1.118449,1601.200616,10.619048,2.761905,0.071429,0.476190,0.044262,0.092025,101.449105,0
602,22020,1610612740,NOP,New Orleans Pelicans,22000646,2021-03-21,NOP @ DEN,1,240,113,...,1.118614,1573.571673,0.682927,0.658537,-1.609756,-0.829268,0.012037,0.000385,118.329024,0
603,22020,1610612738,BOS,Boston Celtics,22000167,2021-03-21,BOS vs. ORL,1,241,112,...,1.206567,1409.834103,-6.975610,0.121951,-0.707317,3.390244,-0.073447,0.051740,-69.109142,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
666,22020,1610612739,CLE,Cleveland Cavaliers,22000715,2021-03-29,CLE @ UTA,0,240,75,...,1.067771,1633.419678,13.500483,0.471498,-0.020290,5.045411,0.127929,-0.090327,213.101879,0
667,22020,1610612738,BOS,Boston Celtics,22000708,2021-03-29,BOS vs. NOP,0,239,109,...,1.095866,1484.942081,2.985990,3.221256,0.752174,1.558454,0.027477,0.053153,-9.129655,1
668,22020,1610612744,GSW,Golden State Warriors,22000716,2021-03-29,GSW vs. CHI,1,241,116,...,1.045788,1458.176323,-0.047980,-1.395960,1.855051,0.589899,0.015085,-0.040992,-19.078889,1
669,22020,1610612758,SAC,Sacramento Kings,22000714,2021-03-29,SAC @ SAS,1,239,132,...,1.149306,1510.697397,-5.364004,-1.277048,-1.432760,2.731041,-0.047194,0.124470,21.820636,1
