In [6]:
### Basic Imports
import pandas as pd
import numpy as np

### Graphic libraries
import matplotlib.pyplot as plt
import seaborn as sns 

import statistics 


### Importing the relevant ML libraries 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import mean_squared_error

from sklearn.metrics import accuracy_score,roc_curve,auc,recall_score,f1_score,precision_score,classification_report,confusion_matrix,auc

import statsmodels.api as sm

### Some cosmetics add-ons
plt.style.use('fivethirtyeight')
%matplotlib inline

# Load dataset

In [2]:
df = pd.read_csv('nba_2020_clean.csv')

# Get significant features

In [27]:
features_list = ['DIS_ELO_x', 'HOME_COURT_x', 'DIS_OFFRATE_x', 'DIS_DEFRATE_x', 'DIS_PTS_x', 'DIS_AST_x', 'DIS_OREB_x', 'DIS_DREB_x']
target = 'WL_x'

train_data = df[(df['GAME_PLAYED_x'] <= 41) | (df['GAME_PLAYED_y'] <= 41 )]
test_data = df[(df['GAME_PLAYED_x'] > 41) & (df['GAME_PLAYED_y'] > 41 )]

# Creating our independent and dependent variables
x = train_data[features_list]
y = train_data['PLUS_MINUS_x']

model = sm.OLS(y,x)
results = model.fit()

features_list = []
for i in range(len(x.keys())):
    if results.pvalues[i] <= 0.1:
        features_list.append(model.exog_names[i])

features_list


['DIS_ELO_x', 'HOME_COURT_x', 'DIS_OFFRATE_x']

In [28]:
results.summary()

0,1,2,3
Dep. Variable:,PLUS_MINUS_x,R-squared (uncentered):,0.086
Model:,OLS,Adj. R-squared (uncentered):,0.073
Method:,Least Squares,F-statistic:,6.999
Date:,"Wed, 31 Mar 2021",Prob (F-statistic):,7.59e-09
Time:,00:29:01,Log-Likelihood:,-2462.4
No. Observations:,605,AIC:,4941.0
Df Residuals:,597,BIC:,4976.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
DIS_ELO_x,0.0337,0.012,2.855,0.004,0.011,0.057
HOME_COURT_x,1.4695,0.830,1.770,0.077,-0.161,3.100
DIS_OFFRATE_x,40.0128,12.594,3.177,0.002,15.279,64.746
DIS_DEFRATE_x,-3.8933,2.549,-1.528,0.127,-8.898,1.112
DIS_PTS_x,-0.0845,0.096,-0.882,0.378,-0.273,0.104
DIS_AST_x,0.1900,0.213,0.892,0.373,-0.229,0.609
DIS_OREB_x,-0.2326,0.271,-0.859,0.391,-0.765,0.299
DIS_DREB_x,0.0383,0.220,0.174,0.862,-0.394,0.471

0,1,2,3
Omnibus:,4.919,Durbin-Watson:,1.946
Prob(Omnibus):,0.085,Jarque-Bera (JB):,5.287
Skew:,-0.132,Prob(JB):,0.0711
Kurtosis:,3.374,Cond. No.,1410.0


# Get performance of each model

In [29]:
models_dict = {
    'Linear Regression': LinearRegression(),
    'Logistic Regression':LogisticRegression(),
    'Naive Bayes':GaussianNB(),
    'Decision Trees':DecisionTreeClassifier(),
    'SVM linear': svm.SVC(kernel='linear'),
    'SVM rbf': svm.SVC(kernel='rbf'),
    'Random Forest': RandomForestClassifier(n_estimators = 100),
    # 'XGBoost': xgb.XGBClassifier(use_label_encoder=False)
}

performance_table = {'f1':[],'acc':[]}

for model_name in models_dict:
    X_train = train_data[features_list]
    X_test = test_data[features_list]
    y_train = train_data['WL_x']
    y_test = test_data['WL_x']
    
    m = models_dict[model_name]

    if model_name == 'Linear Regression':
        y_train = train_data['PLUS_MINUS_x']
    
    m.fit(X_train, y_train)
    predictions = m.predict(X_test)

    if model_name == 'Linear Regression':
        for i, v in enumerate(predictions):
            if v > 0:
                predictions[i] = 1
            else:
                predictions[i] = 0
    
    f1 = round(f1_score(y_test,predictions),4)
    acc = round(accuracy_score(y_test,predictions),4)
    
    performance_table['f1'].append(f1)
    performance_table['acc'].append(acc)

# for k, v in performance_table.items():
#     performance_df = pd.DataFrame(v,index=list(models_dict.keys()))
#     print(performance_df)

performance_df = pd.DataFrame(performance_table, index=list(models_dict.keys()))
performance_df

Unnamed: 0,f1,acc
Linear Regression,0.7792,0.7424
Logistic Regression,0.7532,0.7121
Naive Bayes,0.7792,0.7424
Decision Trees,0.6494,0.5909
SVM linear,0.7467,0.7121
SVM rbf,0.7561,0.697
Random Forest,0.6923,0.6364


# Add ensemble model prediction column for each row in test data

In [30]:
models_dict = {
    'Linear Regression': LinearRegression(),
    'Logistic Regression':LogisticRegression(),
    'Naive Bayes':GaussianNB(),
    # 'Decision Trees':DecisionTreeClassifier(),
    'SVM linear': svm.SVC(kernel='linear'),
    'SVM rbf': svm.SVC(kernel='rbf'),
    # 'Random Forest': RandomForestClassifier(n_estimators = 100),
    # 'XGBoost': xgb.XGBClassifier(use_label_encoder=False)
}
test_data['Prediction'] = 0
for index, row in test_data.iterrows():
    test_df = row.to_frame().T
    prediction_data = {}
    for model_name in models_dict:
        X_train = train_data[features_list]
        X_test = test_df[features_list]
        y_train = train_data['WL_x']
        
        m = models_dict[model_name]

        if model_name == 'Linear Regression':
            y_train = train_data['PLUS_MINUS_x']
        
        m.fit(X_train, y_train)
        prediction = m.predict(X_test)

        if model_name == 'Linear Regression':
            if prediction[0] > 0:
                prediction[0] = 1
            else:
                prediction[0] = 0
                        
        prediction_data[model_name] = prediction[0]
        
    final_prediction = 0
    for k, v in prediction_data.items():
        final_prediction += v

    final_prediction = round(final_prediction / 5)
    test_data['Prediction'][index] = final_prediction


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Prediction'][index] = final_prediction
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predict

# Get performance of ensemble model

In [31]:
y_test = test_data['WL_x']
f1 = f1_score(y_test,test_data['Prediction'])
acc = accuracy_score(y_test,test_data['Prediction'])
print("f1 score: {}\nacc score: {}".format(round(f1,4),round(acc,4)))

f1 score: 0.7632
acc score: 0.7273


# Ensemble performance - prediction for every game based on all previous games

In [12]:
features_list = ['DIS_ELO_x', 'HOME_COURT_x', 'DIS_OFFRATE_x', 'DIS_DEFRATE_x', 'DIS_PTS_x', 'DIS_AST_x', 'DIS_OREB_x', 'DIS_DREB_x']
target = 'WL_x'

# predict and test 2nd half of season
test_data = df[(df['GAME_PLAYED_x'] > 41) & (df['GAME_PLAYED_y'] > 41)]
test_data['Prediction'] = 0

models_dict = {
    'Linear Regression': LinearRegression(),
    'Logistic Regression':LogisticRegression(),
    'Naive Bayes':GaussianNB(),
    # 'Decision Trees':DecisionTreeClassifier(),
    'SVM linear': svm.SVC(kernel='linear'),
    'SVM rbf': svm.SVC(kernel='rbf'),
    # 'Random Forest': RandomForestClassifier(n_estimators = 100),
    # 'XGBoost': xgb.XGBClassifier(use_label_encoder=False)
}

for index, row in test_data.iterrows():
    # Creating training dataset for all previous games
    idx = df.index[df['GAME_ID'] == row['GAME_ID']]
    train_df = df.loc[:idx[0]-1]
    test_df = row.to_frame().T

    # Creating our independent and dependent variables
    x = train_df[features_list]
    y = train_df['PLUS_MINUS_x']

    model = sm.OLS(y,x)
    results = model.fit()

    # Extracting significant features
    significant_features = []
    for i in range(len(x.keys())):
        if results.pvalues[i] <= 0.1:
            significant_features.append(model.exog_names[i])

    # Predicting game outcome
    prediction_data = {}
    for model_name in models_dict:
        X_train = train_df[significant_features]
        y_train = train_df['WL_x']
        X_test = test_df[significant_features]

        m = models_dict[model_name]

        if model_name == 'Linear Regression':
            y_train = train_df['PLUS_MINUS_x']
        
        m.fit(X_train, y_train)
        prediction = m.predict(X_test)

        if model_name == 'Linear Regression':
            if prediction[0] > 0:
                prediction[0] = 1
            else:
                prediction[0] = 0
                        
        prediction_data[model_name] = prediction[0]
        
    final_prediction = 0
    for k, v in prediction_data.items():
        final_prediction += v

    final_prediction = round(final_prediction / 5)

    # Appending predicted outcome to test_data
    test_data['Prediction'][index] = final_prediction


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Prediction'][index] = final_prediction
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predict

In [13]:
y_test = test_data['WL_x']

f1 = f1_score(y_test,test_data['Prediction'])
accuracy = accuracy_score(y_test,test_data['Prediction'])
precision = precision_score(y_test,test_data['Prediction'])
recall = recall_score(y_test,test_data['Prediction'])

print("Ensemble model")
print("-"*20)
print("accuracy: {}\nprecision: {}\nrecall: {}\nf1 score: {}".format(round(accuracy,4),round(precision,4),round(recall,4),round(f1,4)))

Ensemble model
--------------------
accuracy: 0.6602
precision: 0.6685
recall: 0.6503
f1 score: 0.6593


In [None]:
test_data

In [14]:
results.summary()

0,1,2,3
Dep. Variable:,PLUS_MINUS_x,R-squared (uncentered):,0.115
Model:,OLS,Adj. R-squared (uncentered):,0.108
Method:,Least Squares,F-statistic:,15.57
Date:,"Sat, 08 May 2021",Prob (F-statistic):,1.1100000000000001e-21
Time:,14:20:45,Log-Likelihood:,-3941.8
No. Observations:,966,AIC:,7900.0
Df Residuals:,958,BIC:,7939.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
DIS_ELO_x,0.0333,0.007,4.481,0.000,0.019,0.048
HOME_COURT_x,0.8715,0.670,1.300,0.194,-0.444,2.187
DIS_OFFRATE_x,45.9468,11.886,3.866,0.000,22.621,69.273
DIS_DEFRATE_x,-3.0423,2.454,-1.240,0.215,-7.858,1.773
DIS_PTS_x,-0.0372,0.094,-0.397,0.691,-0.221,0.147
DIS_AST_x,-0.1199,0.183,-0.654,0.513,-0.479,0.240
DIS_OREB_x,-0.0580,0.253,-0.230,0.818,-0.554,0.438
DIS_DREB_x,0.1230,0.203,0.607,0.544,-0.274,0.520

0,1,2,3
Omnibus:,6.637,Durbin-Watson:,2.097
Prob(Omnibus):,0.036,Jarque-Bera (JB):,8.818
Skew:,-0.013,Prob(JB):,0.0122
Kurtosis:,3.467,Cond. No.,2230.0
