In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from sklearn.metrics import recall_score, confusion_matrix, f1_score, accuracy_score
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
from collections import Counter
import time
import joblib

In [3]:
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/nba_data/nba_logreg.csv")

In [4]:
data.head()

Unnamed: 0,Name,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,Brandon Ingram,36,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,1.6,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0.0
1,Andrew Harrison,35,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,2.6,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0.0
2,JaKarr Sampson,74,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,0.9,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0.0
3,Malik Sealy,58,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,0.9,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1.0
4,Matt Geiger,48,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,1.3,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1.0


In [18]:
selected_features = ['GP', 'MIN', 'PTS', 'FG%', '3P Made', '3P%', 'FT%', 'REB', 'AST', 'STL', 'BLK', 'TOV']
X = data[selected_features]
y = data["TARGET_5Yrs"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
class_counts = Counter(y_train)
negative_class = class_counts[0]
positive_class = class_counts[1]

scale_pos_weight = negative_class / positive_class

In [6]:
# The initial list of models to test
models ={
    "logisticregression": LogisticRegression(class_weight='balanced', random_state=42),
    "gradientboostingclassifier": GradientBoostingClassifier(random_state=42),
    "xgbclassifier": XGBClassifier(scale_pos_weight=scale_pos_weight, eval_metric='logloss', random_state=42),
    "balancedrandomforestclassifier": BalancedRandomForestClassifier(sampling_strategy='all', replacement=True, bootstrap=False, random_state=42),
    "easyensembleclassifier": EasyEnsembleClassifier(n_estimators=100, random_state=42)
}

# a custom scoring function because maximizig the recall as the client wants
# can result in a tn=0 even though the recall equals 1.
# So we have to care about a minimum specificity value, defined here as 0.49 (open to discussion)

def custom_score(classifier, X, y_true):
  y_pred = classifier.predict(X)
  recall = recall_score(y_true, y_pred)
  specificity = recall_score(y_true, y_pred, pos_label=0)
  return recall if specificity > 0.49 else 0
  print(f"Confusion Matrix:\n{cm}")
  print(f"Recall: {recall:.2f}")
  print(f"Specificity: {specificity:.2f}")

In [7]:
# Pipelines for different models
pipelines = {}

# We need to adress the missing data
imputer = SimpleImputer(strategy='mean')
for model_name, model in models.items():
  steps = []
  #XG boost can deal with missing data
  if model_name != "XGBoost":
    steps.append(('imputer', imputer))

  # only Logreg needs scaling
  if model_name == "Logistic Regression":
    steps.append(('scaler',StandardScaler()))

  # No need to adress data imbalance explicity as we already did,
  # or the models take care of it internally

  steps.append((model_name, model))

  pipelines[model_name] = ImbPipeline(steps=steps)


In [8]:
# to maintain the proportion of each class since we have imbalanced data
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

candidate_models = []
threshold = 0.6

for model_name, pipeline in pipelines.items():
  print(f"Evaluating model: {model_name}")

  try:
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring=custom_score)
    mean_cv_score = np.mean(cv_scores)

    print(f"Initial CV recall Scores of {model_name}: {cv_scores}")
    print(f"Initial Mean CV Recall of {model_name}: {mean_cv_score}\n")

    if mean_cv_score > threshold:
        candidate_models.append((model_name, pipeline, mean_cv_score))

  except Exception as e:
    print(f"Error evaluating model {model_name}: {e}")

# picking the top 3 candidates
candidate_models = sorted(candidate_models, key=lambda x:x[2], reverse=True)[:3]
print(f"Selected candidate models for hyperparameter tuning: {[model[0] for model in candidate_models]}")

Evaluating model: logisticregression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Initial CV recall Scores of logisticregression: [0.64661654 0.62406015 0.68181818 0.74242424 0.66666667]
Initial Mean CV Recall of logisticregression: 0.6723171565276828

Evaluating model: gradientboostingclassifier
Initial CV recall Scores of gradientboostingclassifier: [0.         0.         0.79545455 0.83333333 0.75757576]
Initial Mean CV Recall of gradientboostingclassifier: 0.47727272727272735

Evaluating model: xgbclassifier
Initial CV recall Scores of xgbclassifier: [0.         0.         0.73484848 0.76515152 0.6969697 ]
Initial Mean CV Recall of xgbclassifier: 0.43939393939393945

Evaluating model: balancedrandomforestclassifier
Initial CV recall Scores of balancedrandomforestclassifier: [0.70676692 0.63909774 0.68181818 0.75       0.65151515]
Initial Mean CV Recall of balancedrandomforestclassifier: 0.6858395989974937

Evaluating model: easyensembleclassifier
Initial CV recall Scores of easyensembleclassifier: [0.54135338 0.57894737 0.63636364 0.67424242 0.65151515]
Initial 

In [9]:
# Parameter grids for each model

param_grids = {
    'logisticregression': {
        'logisticregression__C': [0.1, 1, 10],
        'logisticregression__penalty': ['l1', 'l2'],
        'logisticregression__solver': ['liblinear', 'saga']  # Supports l1 regularization
    },
    'gradientboostingclassifier': {
        'gradientboostingclassifier__n_estimators': [100, 200, 300],
        'gradientboostingclassifier__learning_rate': [0.01, 0.1, 0.2],
        'gradientboostingclassifier__max_depth': [3, 5, 7],
        'gradientboostingclassifier__subsample': [0.8, 1.0]
    },
    'xgbclassifier': {
        'xgbclassifier__n_estimators': [100, 200, 300],
        'xgbclassifier__learning_rate': [0.01, 0.1, 0.2],
        'xgbclassifier__max_depth': [3, 5, 7],
    },
    'balancedrandomforestclassifier': {
        'balancedrandomforestclassifier__n_estimators': [100, 200, 300],
        'balancedrandomforestclassifier__max_depth': [None, 10, 20],
        'balancedrandomforestclassifier__min_samples_split': [2, 5, 10]
    },
    'easyensembleclassifier': {
        'easyensembleclassifier__n_estimators': [10, 50, 100]
        }
}

In [10]:
best_models_configs = {}
for model_name, pipeline, score in candidate_models:
  print(f"Performing Grid search on: {model_name}")

  param_grid = param_grids.get(model_name, {})


  if param_grid:
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring=custom_score,
        cv=cv,
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_train, y_train)

    best_models_configs[model_name] = {
        'best classifier': grid_search.best_estimator_,
        'best_params': grid_search.best_params_,
        "best score": grid_search.best_score_,
    }
    print(f"Best params for {model_name}: {grid_search.best_params_}")
    print(f"Best CV score for {model_name}: {grid_search.best_score_}\n")

Performing Grid search on: balancedrandomforestclassifier
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best params for balancedrandomforestclassifier: {'balancedrandomforestclassifier__max_depth': None, 'balancedrandomforestclassifier__min_samples_split': 10, 'balancedrandomforestclassifier__n_estimators': 100}
Best CV score for balancedrandomforestclassifier: 0.7085440874914559

Performing Grid search on: logisticregression
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best params for logisticregression: {'logisticregression__C': 10, 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'liblinear'}
Best CV score for logisticregression: 0.6798587377534746

Performing Grid search on: easyensembleclassifier
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best params for easyensembleclassifier: {'easyensembleclassifier__n_estimators': 10}
Best CV score for easyensembleclassifier: 0.6255297334244703



In [12]:
top_two_models = sorted(best_models_configs.items(), key=lambda x: x[1]['best score'], reverse=True)[:2]

final_model_performance = {}

#retraining the two models on the whole training dataset and testing them
for model_name, config in top_two_models:
  print(f"Retraining and evaluating the model: {model_name}")

  best_pipeline = config['best classifier']
  best_pipeline.fit(X_train, y_train)

  start_time = time.time()
  y_pred = best_pipeline.predict(X_test)
  inference_time = time.time() - start_time

  # Evaluating the model performance

  recall = recall_score(y_test, y_pred)
  specificity = recall_score(y_test, y_pred, pos_label=0)
  f1 = f1_score(y_test, y_pred)
  accuracy = accuracy_score(y_test, y_pred)

  final_model_performance[model_name] = {
    "Inference Time (seconds)": inference_time,
    "Recall": recall,
    "Specificity": specificity,
    "F1 Score": f1,
    "Accuracy": accuracy,
    "Best Params": config['best_params'],
    "Best CV Score": config['best score']
  }


 # printing the final comparison

print("Comparison of the final two models:")
for model_name, metrics in final_model_performance.items():
  print(f"\nModel: {model_name}")
  for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value}")

Retraining and evaluating the model: balancedrandomforestclassifier
Retraining and evaluating the model: logisticregression
Comparison of the final two models:

Model: balancedrandomforestclassifier
Inference Time (seconds): 0.009450674057006836
Recall: 0.7810650887573964
Specificity: 0.6363636363636364
F1 Score: 0.7833827893175074
Accuracy: 0.7276119402985075
Best Params: {'balancedrandomforestclassifier__max_depth': None, 'balancedrandomforestclassifier__min_samples_split': 10, 'balancedrandomforestclassifier__n_estimators': 100}
Best CV Score: 0.7085440874914559

Model: logisticregression
Inference Time (seconds): 0.00154876708984375
Recall: 0.7218934911242604
Specificity: 0.7171717171717171
F1 Score: 0.7648902821316614
Accuracy: 0.7201492537313433
Best Params: {'logisticregression__C': 10, 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'liblinear'}
Best CV Score: 0.6798587377534746


In [14]:
# seeing that the balancedrandomforestclassifier is better in CV and final test
# in terms of recall, which is the important metric for our client
# and while the inference is 9 times more than logistic regression
# it is still small and acceptable, so we go with balancedrandomforestclassifier

In [17]:
best_model_name = list(final_model_performance.keys())[0]
best_model = top_two_models[0][1]['best classifier']

# Save the model
joblib.dump(best_model, f'/content/drive/MyDrive/Colab Notebooks/nba_data/{best_model_name}_final_model.joblib')
print(f"Final model '{best_model_name}' saved as {best_model_name}_final_model.joblib")

Final model 'balancedrandomforestclassifier' saved as balancedrandomforestclassifier_final_model.joblib
