In [13]:
### Basic Imports
import pandas as pd
import numpy as np

### Graphic libraries
import matplotlib.pyplot as plt
import seaborn as sns 

import statistics 


### Importing the relevant ML libraries 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import mean_squared_error

from sklearn.metrics import accuracy_score,roc_curve,auc,recall_score,f1_score,precision_score,classification_report,confusion_matrix,auc

import statsmodels.api as sm

### Some cosmetics add-ons
plt.style.use('fivethirtyeight')
%matplotlib inline

# Load dataset

In [18]:
df = pd.read_csv('nba_2020_clean.csv')

# Get significant features

In [19]:
features_list = ['DIS_ELO_x', 'HOME_COURT_x', 'DIS_OFFRATE_x', 'DIS_DEFRATE_x', 'DIS_PTS_x', 'DIS_AST_x', 'DIS_OREB_x', 'DIS_DREB_x']
target = 'WL_x'

train_data = df[(df['GAME_PLAYED_x'] <= 41) | (df['GAME_PLAYED_y'] <= 41 )]
test_data = df[(df['GAME_PLAYED_x'] > 41) & (df['GAME_PLAYED_y'] > 41 )]

# Creating our independent and dependent variables
x = train_data[features_list]
y = train_data['PLUS_MINUS_x']

model = sm.OLS(y,x)
results = model.fit()

features_list = []
for i in range(len(x.keys())):
    if results.pvalues[i] <= 0.05:
        features_list.append(model.exog_names[i])

features_list


['DIS_ELO_x', 'DIS_OFFRATE_x']

In [16]:
results.summary()

0,1,2,3
Dep. Variable:,PLUS_MINUS_x,R-squared (uncentered):,0.084
Model:,OLS,Adj. R-squared (uncentered):,0.072
Method:,Least Squares,F-statistic:,6.847
Date:,"Tue, 30 Mar 2021",Prob (F-statistic):,1.25e-08
Time:,10:05:45,Log-Likelihood:,-2462.9
No. Observations:,605,AIC:,4942.0
Df Residuals:,597,BIC:,4977.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
DIS_ELO_x,0.0333,0.012,2.857,0.004,0.010,0.056
HOME_COURT_x,1.2781,0.847,1.509,0.132,-0.385,2.941
DIS_OFFRATE_x,36.5238,12.626,2.893,0.004,11.727,61.321
DIS_DEFRATE_x,-3.6408,2.683,-1.357,0.175,-8.910,1.629
DIS_PTS_x,-0.0200,0.099,-0.202,0.840,-0.214,0.174
DIS_AST_x,0.1882,0.220,0.854,0.394,-0.245,0.621
DIS_OREB_x,-0.3213,0.276,-1.162,0.246,-0.864,0.222
DIS_DREB_x,-0.0380,0.219,-0.174,0.862,-0.467,0.391

0,1,2,3
Omnibus:,5.46,Durbin-Watson:,1.994
Prob(Omnibus):,0.065,Jarque-Bera (JB):,5.556
Skew:,0.173,Prob(JB):,0.0622
Kurtosis:,3.318,Cond. No.,1410.0


# Get performance of each model

In [55]:
models_dict = {
    'Linear Regression': LinearRegression(),
    'Logistic Regression':LogisticRegression(),
    'Naive Bayes':GaussianNB(),
    'Decision Trees':DecisionTreeClassifier(),
    'SVM linear': svm.SVC(kernel='linear'),
    'SVM rbf': svm.SVC(kernel='rbf'),
    'Random Forest': RandomForestClassifier(n_estimators = 100),
    # 'XGBoost': xgb.XGBClassifier(use_label_encoder=False)
}

performance_table = {'f1':[],'acc':[]}

for model_name in models_dict:
    X_train = train_data[features_list]
    X_test = test_data[features_list]
    y_train = train_data['WL_x']
    y_test = test_data['WL_x']
    
    m = models_dict[model_name]

    if model_name == 'Linear Regression':
        y_train = train_data['PLUS_MINUS_x']
    
    m.fit(X_train, y_train)
    predictions = m.predict(X_test)

    if model_name == 'Linear Regression':
        for i, v in enumerate(predictions):
            if v > 0:
                predictions[i] = 1
            else:
                predictions[i] = 0
    
    f1 = round(f1_score(y_test,predictions),4)
    acc = round(accuracy_score(y_test,predictions),4)
    
    performance_table['f1'].append(f1)
    performance_table['acc'].append(acc)

# for k, v in performance_table.items():
#     performance_df = pd.DataFrame(v,index=list(models_dict.keys()))
#     print(performance_df)

performance_df = pd.DataFrame(performance_table, index=list(models_dict.keys()))
performance_df

Unnamed: 0,f1,acc
Linear Regression,0.766,0.7179
Logistic Regression,0.7556,0.7179
Naive Bayes,0.7556,0.7179
Decision Trees,0.7143,0.6923
SVM linear,0.7556,0.7179
SVM rbf,0.7826,0.7436
Random Forest,0.7111,0.6667


# Add ensemble model prediction column for each row in test data

In [23]:
models_dict = {
    'Linear Regression': LinearRegression(),
    'Logistic Regression':LogisticRegression(),
    'Naive Bayes':GaussianNB(),
    # 'Decision Trees':DecisionTreeClassifier(),
    'SVM linear': svm.SVC(kernel='linear'),
    'SVM rbf': svm.SVC(kernel='rbf'),
    # 'Random Forest': RandomForestClassifier(n_estimators = 100),
    # 'XGBoost': xgb.XGBClassifier(use_label_encoder=False)
}
test_data['Prediction'] = 0
for index, row in test_data.iterrows():
    test_df = row.to_frame().T
    prediction_data = {}
    for model_name in models_dict:
        X_train = train_data[features_list]
        X_test = test_df[features_list]
        y_train = train_data['WL_x']
        
        m = models_dict[model_name]

        if model_name == 'Linear Regression':
            y_train = train_data['PLUS_MINUS_x']
        
        m.fit(X_train, y_train)
        prediction = m.predict(X_test)

        if model_name == 'Linear Regression':
            if prediction[0] > 0:
                prediction[0] = 1
            else:
                prediction[0] = 0
                        
        prediction_data[model_name] = prediction[0]
        
    final_prediction = 0
    for k, v in prediction_data.items():
        final_prediction += v

    final_prediction = round(final_prediction / 5)
    test_data['Prediction'][index] = final_prediction


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Prediction'][index] = final_prediction
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predict

# Get performance of ensemble model

In [56]:
y_test = test_data['WL_x']
f1 = f1_score(y_test,test_data['Prediction'])
acc = accuracy_score(y_test,test_data['Prediction'])
print("f1 score: {}\nacc score: {}".format(round(f1,4),round(acc,4)))

f1 score: 0.7556
acc score: 0.7179


# Ensemble performance - prediction for every game based on all previous games

In [8]:
features_list = ['DIS_ELO_x', 'HOME_COURT_x', 'DIS_OFFRATE_x', 'DIS_DEFRATE_x', 'DIS_PTS_x', 'DIS_AST_x', 'DIS_OREB_x', 'DIS_DREB_x']
target = 'WL_x'

# predict and test 2nd half of season
test_data = df[(df['GAME_PLAYED_x'] > 41) & (df['GAME_PLAYED_y'] > 41 )]
test_data['Prediction'] = 0

models_dict = {
    'Linear Regression': LinearRegression(),
    'Logistic Regression':LogisticRegression(),
    'Naive Bayes':GaussianNB(),
    # 'Decision Trees':DecisionTreeClassifier(),
    'SVM linear': svm.SVC(kernel='linear'),
    'SVM rbf': svm.SVC(kernel='rbf'),
    # 'Random Forest': RandomForestClassifier(n_estimators = 100),
    # 'XGBoost': xgb.XGBClassifier(use_label_encoder=False)
}

for index, row in test_data.iterrows():
    # Creating training dataset for all previous games
    idx = df.index[df['GAME_ID'] == row['GAME_ID']]
    # print(idx[0])
    train_df = df.loc[:idx[0]-1]
    # print(len(train_df))
    test_df = row.to_frame().T

    # Creating our independent and dependent variables
    x = train_df[features_list]
    y = train_df['PLUS_MINUS_x']

    model = sm.OLS(y,x)
    results = model.fit()

    # Extracting significant features
    features_list = []
    for i in range(len(x.keys())):
        if results.pvalues[i] <= 0.05:
            features_list.append(model.exog_names[i])

    # Predicting game outcome
    prediction_data = {}
    for model_name in models_dict:
        X_train = train_df[features_list]
        X_test = test_df[features_list]
        y_train = train_df['WL_x']
        
        m = models_dict[model_name]

        if model_name == 'Linear Regression':
            y_train = train_df['PLUS_MINUS_x']
        
        m.fit(X_train, y_train)
        prediction = m.predict(X_test)

        if model_name == 'Linear Regression':
            if prediction[0] > 0:
                prediction[0] = 1
            else:
                prediction[0] = 0
                        
        prediction_data[model_name] = prediction[0]
        
    final_prediction = 0
    for k, v in prediction_data.items():
        final_prediction += v

    final_prediction = round(final_prediction / 5)

    # Appending predicted outcome to test_data
    test_data['Prediction'][index] = final_prediction


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Prediction'][index] = final_prediction
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predict

In [58]:
y_test = test_data['WL_x']
f1 = f1_score(y_test,test_data['Prediction'])
acc = accuracy_score(y_test,test_data['Prediction'])
print("Ensemble model")
print("-"*20)
print("F1 score: {}\nAcc score: {}".format(round(f1,4),round(acc,4)))

Ensemble model
--------------------
F1 score: 0.7556
Acc score: 0.7179
