In [2]:
### Basic Imports
import pandas as pd
import numpy as np

### Graphic libraries
import matplotlib.pyplot as plt
import seaborn as sns 

import statistics 


### Importing the relevant ML libraries 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import mean_squared_error

from sklearn.metrics import accuracy_score,roc_curve,auc,recall_score,f1_score,precision_score,classification_report,confusion_matrix,auc

import statsmodels.api as sm

### Some cosmetics add-ons
plt.style.use('fivethirtyeight')
%matplotlib inline

# Load dataset

In [3]:
df = pd.read_csv('nba_2020_clean.csv')

# Get significant features

In [13]:
features_list = ['DIS_ELO_x', 'HOME_COURT_x', 'DIS_OFFRATE_x', 'DIS_DEFRATE_x', 'DIS_PTS_x', 'DIS_AST_x', 'DIS_OREB_x', 'DIS_DREB_x']
target = 'WL_x'

train_data = df[(df['GAME_PLAYED_x'] <= 30) | (df['GAME_PLAYED_y'] <= 30 )]
test_data = df[(df['GAME_PLAYED_x'] > 30) & (df['GAME_PLAYED_y'] > 30 )]

# Creating our independent and dependent variables
x = train_data[features_list]
y = train_data['PLUS_MINUS_x']

model = sm.OLS(y,x)
results = model.fit()

features_list = []
for i in range(len(x.keys())):
    if results.pvalues[i] <= 0.08:
        features_list.append(model.exog_names[i])

features_list


['DIS_ELO_x', 'HOME_COURT_x', 'DIS_OFFRATE_x']

# Get f1 score of each model

In [14]:
models_dict = {
    'Linear Regression': LinearRegression(),
    'Logistic Regression':LogisticRegression(),
    'Naive Bayes':GaussianNB(),
    'Decision Trees':DecisionTreeClassifier(),
    'SVM linear': svm.SVC(kernel='linear'),
    'SVM rbf': svm.SVC(kernel='rbf'),
    'Random Forest': RandomForestClassifier(n_estimators = 100),
    # 'XGBoost': xgb.XGBClassifier(use_label_encoder=False)
}

# game = df.iloc[-1]

for model_name in models_dict:
    X_train = train_data[features_list]
    X_test = test_data[features_list]
    y_train = train_data['WL_x']
    y_test = test_data['WL_x']
    
    m = models_dict[model_name]

    if model_name == 'Linear Regression':
        y_train = train_data['PLUS_MINUS_x']
    
    m.fit(X_train, y_train)
    predictions = m.predict(X_test)

    if model_name == 'Linear Regression':
        for i, v in enumerate(predictions):
            if v > 0:
                predictions[i] = 1
            else:
                predictions[i] = 0
    
    f1 = f1_score(y_test,predictions)

    # prediction_data[model_name] = f1_score
    
    print(model_name + ':', f1)


Linear Regression: 0.7008547008547008
Logistic Regression: 0.703125
Naive Bayes: 0.7207207207207208
Decision Trees: 0.5344827586206896
SVM linear: 0.7022900763358778
SVM rbf: 0.6666666666666665
Random Forest: 0.5528455284552846


# Add ensemble model prediction column for each row in test data

In [15]:
models_dict = {
    'Linear Regression': LinearRegression(),
    'Logistic Regression':LogisticRegression(),
    'Naive Bayes':GaussianNB(),
    'Decision Trees':DecisionTreeClassifier(),
    'SVM linear': svm.SVC(kernel='linear'),
    'SVM rbf': svm.SVC(kernel='rbf'),
    'Random Forest': RandomForestClassifier(n_estimators = 100),
    # 'XGBoost': xgb.XGBClassifier(use_label_encoder=False)
}
test_data['Prediction'] = 0
for index, row in test_data.iterrows():
    test_df = row.to_frame().T
    prediction_data = {}
    for model_name in models_dict:
        X_train = train_data[features_list]
        X_test = test_df[features_list]
        y_train = train_data['WL_x']
        
        m = models_dict[model_name]

        if model_name == 'Linear Regression':
            y_train = train_data['PLUS_MINUS_x']
        
        m.fit(X_train, y_train)
        prediction = m.predict(X_test)

        if model_name == 'Linear Regression':
            if prediction[0] > 0:
                prediction[0] = 1
            else:
                prediction[0] = 0
                        
        prediction_data[model_name] = prediction[0]
        
    final_prediction = 0
    for k, v in prediction_data.items():
        final_prediction += v

    final_prediction = round(final_prediction / 7)
    test_data['Prediction'][index] = final_prediction


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Prediction'][index] = final_prediction
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predict

# Get f1 score of ensemble model

In [16]:
y_test = test_data['WL_x']
f1 = f1_score(y_test,test_data['Prediction'])
print("Ensemble model's f1 score: {}".format(round(f1,4)))

Ensemble model's f1 score: 0.7107
