# Naive Model (Play-Level Data) v1

__Date:__ 11/5/2023 <br>
__Purpose:__ Program that uses the play-level data to predict expected yards gained <br>
__Model and data specifications:__
- Data: Plays dataframe and some stuff from games df (no outside supplemental data)
- Models: Basic supervised learning 
<br>__Updates from previous version:__ Includes hyper-parameter tuning

## Step 0: Import Libraries

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import mean_squared_error, confusion_matrix, roc_auc_score, auc, f1_score, accuracy_score, roc_curve, RocCurveDisplay, r2_score
import time 
import sys
sys.path.append('../preprocessing')
from Preprocessing_v1 import *
from DataLoader import load_data

# Regression models
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor, XGBClassifier

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier


## Step 1: Load Data

In [11]:
# Load data
[games_df, players_df, plays_df, tracking_df] = load_data()

loaded games df
shape: (136, 9)
-----
loaded players df
shape: (1683, 7)
-----
loaded plays df
shape: (12486, 35)
-----
loading tracking frames...
loaded tracking frames
shape: (12187398, 17)
returning 4 frames


## Step 2: Define Helper Functions

In [12]:
# Function that finishes preprocessing and does the train test split of plays df
def plays_train_test_split(plays_df_clean):
    # Drop game and play ID
    plays_df_clean = plays_df_clean.drop(['gameId', 'playId'], axis = 1)
    
    # Get X and y matrices
    y = plays_df_clean["TARGET"]
    X = plays_df_clean.drop(["TARGET"], axis = 1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=24)
    
    return X_train, X_test, y_train, y_test

In [13]:
# Helper function that does cross validation and gives best model
def run_cv(model, param_grid, X_train, y_train, X_val):
    print("training " + type(model).__name__)

    # Define the cross-validation strategy
    cv = KFold(n_splits=5)

    # Get the type of scoring for the grid search depending on regression or classification
    if model.__class__ in [LinearRegression, Lasso, Ridge, ElasticNet, SVR, RandomForestRegressor, AdaBoostRegressor, XGBRegressor]:
        scoring_metric = 'neg_mean_squared_error'
    else:
        scoring_metric = 'f1_weighted'

    # Perform grid search with cross-validation
    start_time = time.time()
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring=scoring_metric)
    train_time = time.time() - start_time
    
    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_val)

    # Return the best model, y_pred
    return grid_search, y_pred, train_time # return the metric and model

In [14]:
# Helper fucntions that do cross validation

def run_lasso(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 2]}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(Lasso(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_ridge(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 2]}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(Ridge(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_elastic_net(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 2],
                  'l1_ratio': [0.1, 0.25, 0.5, 0.75, 0.9]}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(ElasticNet(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_SVR(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'C': [0.01, 0.1, 1, 2, 10],
                  'kernel': ['linear', 'poly', 'rbf']}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(SVR(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_random_forest_reg(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'n_estimators': [100, 500, 1000],
                  'max_depth': [100, None]}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(RandomForestRegressor(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_adaboost_reg(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'n_estimators': [50, 100, 200],
                  'learning_rate': [0.001, 0.01, 0.1, 1, 2]}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(AdaBoostRegressor(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_xgb_reg(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'max_depth': [3, 5, 6, 7],
    'learning_rate': [0.1, 0.3, 0.5],
    'subsample': [0.5, 0.7, 1]}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(XGBRegressor(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_xgb_classifier(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'max_depth': [3, 5, 6, 7],
    'learning_rate': [0.1, 0.3, 0.5],
    'subsample': [0.5, 0.7, 1]}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(XGBClassifier(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_logistic_classifier(X_train, y_train, X_test):
    # Define the hyperparameter grid for regularization strength
    param_grid = {'penalty': ['l1', 'l2', 'elasticnet', None]}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(LogisticRegression(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_SVC(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'C': [0.01, 0.1, 1, 2, 10],
                  'kernel': ['linear', 'poly', 'rbf']}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(SVC(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_random_forest_classifier(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'n_estimators': [100, 500, 1000],
                  'max_depth': [100, None]}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(RandomForestClassifier(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_adaboost_classifier(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'n_estimators': [50, 100, 200],
                  'learning_rate': [0.001, 0.01, 0.1, 1, 2]}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(AdaBoostClassifier(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time


def run_gaussianNB(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(GaussianNB(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_perceptron(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'penalty': ['l1', 'l2', 'elasticnet']}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(Perceptron(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

# Linear regression (no tuning necessary)
def run_linear_reg(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(LinearRegression(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

## Step 3: Train models

In [15]:
include_nfl_features_params = [True, False]
bin_ouput_params = [True, False]

results_df = pd.DataFrame(columns = ['model', 'regression/classification', 'train_time',
                                                'MSE pre-bin', 'r2_score',
                                                'bin_output', 'include_nfl_features',
                                                'f1_score','confusion_matrix', 'accuracy_score'])
# Excludes run_SVR, run_SVC
regression_models = [run_linear_reg, run_lasso, run_ridge, run_elastic_net, run_random_forest_reg, run_adaboost_reg, run_xgb_reg]
classification_models = [run_logistic_classifier, run_perceptron, run_gaussianNB, run_random_forest_classifier, run_adaboost_classifier, run_xgb_classifier]

for include_nfl_features in include_nfl_features_params:
    for bin_output in bin_ouput_params: 
        # Prepreocessing 
        plays_df_clean = preprocess_plays_df_naive_models(plays_df, games_df, include_nfl_features, bin_output)

        # Train test split
        X_train, X_test, y_train, y_test = plays_train_test_split(plays_df_clean)

        # Check if we need to do regression first
        if not bin_output:
            for model_class in regression_models:
                # Train model
                model, y_pred, train_time = model_class(X_train, y_train, X_test)

                # Get accuracy
                mse = mean_squared_error(y_test, y_pred)
                r2 = r2_score(y_test, y_pred)

                # Bin both and get post-binned metrics
                bins = [float('-inf'), -2, 0, 1, 2.5, 5, 10, float('inf')]
                y_pred_binned = pd.cut(y_pred, bins = bins, labels = range(len(bins) - 1))
                y_test_binned = pd.cut(y_test, bins = bins, labels = range(len(bins) - 1))

                f1_metric = f1_score(y_test_binned, y_pred_binned, average = 'weighted')
                confusion_mat = confusion_matrix(y_test_binned, y_pred_binned)
                accuracy = accuracy_score(y_test_binned, y_pred_binned)

                # Record result
                new_row = pd.DataFrame({
                    'model': [str(model.best_estimator_)],
                    'regression/classification': ['classification'], 
                    'train_time': [train_time],
                    'MSE pre-bin': [mse], 
                    'r2_score': [r2],
                    'bin_output': [bin_output], 
                    'include_nfl_features': [include_nfl_features],
                    'f1_score': [f1_metric],
                    'confusion_matrix': [confusion_mat], 
                    'accuracy_score': [accuracy]
                })
                results_df = pd.concat([results_df, new_row], ignore_index=True)

        else:
            for model_class in classification_models:
                # Train model
                model, y_pred, train_time = model_class(X_train, y_train, X_test)

                # Get accuracy metrics
                f1_metric = f1_score(y_test, y_pred, average = 'weighted')
                confusion_mat = confusion_matrix(y_test, y_pred)
                accuracy = accuracy_score(y_test, y_pred)

                # Record result
                new_row = pd.DataFrame({
                    'model': str(model.best_estimator_),
                    'regression/classification': ['classification'], 
                    'train_time': [train_time],
                    'MSE pre-bin': [np.nan], 
                    'r2_score': [np.nan],
                    'bin_output': [bin_output], 
                    'include_nfl_features': [include_nfl_features],
                    'f1_score': [f1_metric],
                    'confusion_matrix': [confusion_mat], 
                    'accuracy_score': [accuracy]
                })
                results_df = pd.concat([results_df, new_row], ignore_index=True)


results_df.head()

final plays data shape: (6840, 289)
training LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

training Perceptron


  results_df = pd.concat([results_df, new_row], ignore_index=True)


training GaussianNB


  results_df = pd.concat([results_df, new_row], ignore_index=True)


training RandomForestClassifier


  results_df = pd.concat([results_df, new_row], ignore_index=True)


training AdaBoostClassifier


  results_df = pd.concat([results_df, new_row], ignore_index=True)


training XGBClassifier


  results_df = pd.concat([results_df, new_row], ignore_index=True)


final plays data shape: (6840, 289)
training LinearRegression


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/Library/Framew

training Lasso


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/Library/Framew

training Ridge


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/Library/Framew

training ElasticNet


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/Library/Framew

training RandomForestRegressor


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/Library/Framew

training AdaBoostRegressor


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/Library/Framew

training XGBRegressor


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/Library/Framew

final plays data shape: (6840, 289)
training LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

training Perceptron


  results_df = pd.concat([results_df, new_row], ignore_index=True)


training GaussianNB


  results_df = pd.concat([results_df, new_row], ignore_index=True)


training RandomForestClassifier


  results_df = pd.concat([results_df, new_row], ignore_index=True)


training AdaBoostClassifier


  results_df = pd.concat([results_df, new_row], ignore_index=True)


training XGBClassifier


  results_df = pd.concat([results_df, new_row], ignore_index=True)


final plays data shape: (6840, 289)
training LinearRegression


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/Library/Framew

training Lasso


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/Library/Framew

training Ridge


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/Library/Framew

training ElasticNet


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/Library/Framew

training RandomForestRegressor


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/Library/Framew

training AdaBoostRegressor


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/Library/Framew

training XGBRegressor


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/Library/Framew

Unnamed: 0,model,regression/classification,train_time,MSE pre-bin,r2_score,bin_output,include_nfl_features,f1_score,confusion_matrix,accuracy_score
0,LogisticRegression(),classification,8e-06,,,True,True,0.132776,"[[0, 0, 0, 0, 90, 0, 0], [0, 0, 0, 0, 184, 0, ...",0.292982
1,Perceptron(penalty='l2'),classification,7e-06,,,True,True,0.132776,"[[0, 0, 0, 0, 90, 0, 0], [0, 0, 0, 0, 184, 0, ...",0.292982
2,GaussianNB(),classification,1.2e-05,,,True,True,0.183787,"[[24, 14, 8, 4, 16, 11, 13], [37, 41, 28, 11, ...",0.179532
3,RandomForestClassifier(),classification,7e-06,,,True,True,0.210305,"[[1, 8, 7, 5, 53, 15, 1], [0, 19, 15, 14, 111,...",0.267836
4,"AdaBoostClassifier(learning_rate=1, n_estimato...",classification,8e-06,,,True,True,0.227863,"[[6, 8, 1, 3, 61, 7, 4], [6, 16, 7, 12, 117, 2...",0.283041


## Step 4: Get best model

In [17]:
results_df.sort_values(by = 'f1_score', ascending = False).head()

Unnamed: 0,model,regression/classification,train_time,MSE pre-bin,r2_score,bin_output,include_nfl_features,f1_score,confusion_matrix,accuracy_score
4,"AdaBoostClassifier(learning_rate=1, n_estimato...",classification,8e-06,,,True,True,0.227863,"[[6, 8, 1, 3, 61, 7, 4], [6, 16, 7, 12, 117, 2...",0.283041
18,"XGBClassifier(base_score=None, booster=None, c...",classification,5e-06,,,True,False,0.218626,"[[0, 6, 9, 9, 48, 16, 2], [3, 23, 19, 15, 95, ...",0.255556
5,"XGBClassifier(base_score=None, booster=None, c...",classification,8e-06,,,True,True,0.218369,"[[3, 8, 10, 4, 42, 18, 5], [3, 25, 16, 19, 83,...",0.246784
16,RandomForestClassifier(),classification,7e-06,,,True,False,0.217417,"[[3, 2, 9, 5, 55, 13, 3], [0, 14, 19, 16, 112,...",0.266667
17,"AdaBoostClassifier(learning_rate=1, n_estimato...",classification,6e-06,,,True,False,0.217018,"[[4, 9, 2, 7, 58, 7, 3], [14, 12, 9, 16, 112, ...",0.27076


In [19]:
results_df.sort_values(by = 'f1_score', ascending = False).iloc[0]['model']

'AdaBoostClassifier(learning_rate=1, n_estimators=200)'

## Depreciated - run through on one dataset/model

In [None]:
# Get X and y matrices
y = plays_df_clean["TARGET"]
X = plays_df_clean.drop(["TARGET"], axis = 1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=24)

print('X_train shape : ', X_train.shape)
print('y_train shape : ', y_train.shape)

print('X_test shape  : ', X_test.shape)
print('y_test shape  : ', y_test.shape)

X_train shape :  (5127, 348)
y_train shape :  (5127,)
X_test shape  :  (1710, 348)
y_test shape  :  (1710,)


In [61]:
model = LinearRegression()

model.__class__

# Get the type of scoring for the grid search depending on regression or classification
if model.__class__ in [LinearRegression, Lasso, Ridge, ElasticNet, SVR, RandomForestRegressor, AdaBoostRegressor, XGBRegressor]:
    scoring_metric = 'neg_mean_squared_error'
else:
    scoring_metric = 'f1_weighted'

print(scoring_metric)

neg_mean_squared_error


In [57]:
# Train model

    

grid_search = GridSearchCV(estimator=LinearRegression(), param_grid={}, cv=KFold(5), scoring='neg_mean_squared_error')

grid_search.fit(X_train, y_train)
y_pred = grid_search.predict(X_test)


# Get accuracy
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Bin both and get post-binned metrics
bins = [float('-inf'), -2, 0, 1, 2.5, 5, 10, float('inf')]
y_pred_binned = pd.cut(y_pred, bins = bins, labels = range(len(bins) - 1))
y_test_binned = pd.cut(y_test, bins = bins, labels = range(len(bins) - 1))

f1_metric = f1_score(y_test_binned, y_pred_binned, average = 'weighted')
confusion_mat = confusion_matrix(y_test_binned, y_pred_binned)
accuracy = accuracy_score(y_test_binned, y_pred_binned)

# Record result
new_row = pd.DataFrame({
    'model': [str(model.best_estimator_)],
    'regression/classification': ['classification'], 
    'train_time': [train_time],
    'MSE pre-bin': [mse], 
    'r2_score': [r2],
    'bin_output': [bin_output], 
    'include_nfl_features': [include_nfl_features],
    'f1_score': [f1_metric],
    'confusion_matrix': [confusion_mat], 
    'accuracy_score': [accuracy]
})
print(new_row)

                model regression/classification  train_time   MSE pre-bin  \
0  Lasso(alpha=0.001)            classification    0.000009  6.190857e+13   

       r2_score  bin_output  include_nfl_features  f1_score  \
0 -1.452745e+12       False                 False  0.209398   

                                    confusion_matrix  accuracy_score  
0  [[1, 2, 4, 7, 46, 28, 2], [3, 2, 3, 25, 88, 61...        0.269591  


In [46]:
# Prepreocessing 
plays_df_clean = preprocess_plays_df_naive_models(plays_df, games_df, True, False)

# Train test split
X_train, X_test, y_train, y_test = plays_train_test_split(plays_df_clean)

start_time = time.time()
model = XGBRegressor(learning_rate=1, n_estimators=200)
model.fit(X = X_train, y = y_train)
print("training time: " + str(time.time() - start_time))

final plays data shape: (6840, 289)
training time: 0.838698148727417


In [47]:
y_pred = model.predict(X_test)

In [50]:
y_pred

array([0.6846691, 2.6349516, 2.5227206, ..., 7.434175 , 9.14129  ,
       1.8764133], dtype=float32)

In [51]:
y_test

3318    0
2136    0
6787    1
6470    3
4985    4
       ..
3087    3
5839    8
570     3
5366    9
4596    5
Name: TARGET, Length: 1710, dtype: int64

In [49]:
print("Confusion matrix: \n" + str(confusion_matrix(y_test, y_pred)))
print("F1 score: " + str(round(f1_score(y_test, y_pred, average='weighted'), 3)))
print("Accuracy score: " + str(round(accuracy_score(y_test, y_pred), 3)))

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

In [23]:
print("MSE: \n" + str(mean_squared_error(y_test, y_pred)))

MSE: 
51.88070175438597


In [52]:
bins = [float('-inf'), -2, 0, 1, 2.5, 5, 10, float('inf')]
y_pred_binned = pd.cut(y_pred, bins = bins, labels = range(len(bins) - 1))
y_test_binned = pd.cut(y_test, bins = bins, labels = range(len(bins) - 1))

In [53]:
print("Confusion matrix: \n" + str(confusion_matrix(y_test_binned, y_pred_binned)))
print("F1 score: " + str(round(f1_score(y_test_binned, y_pred_binned, average='weighted'), 3)))
print("Accuracy score: " + str(round(accuracy_score(y_test_binned, y_pred_binned), 3)))

Confusion matrix: 
[[  5  11   7  14  25  22   6]
 [ 12  19  16  23  38  52  24]
 [ 12  19  22  22  48  34  28]
 [ 17  23  10  33  60  64  18]
 [ 28  26  36  66 140 147  58]
 [ 13  23  11  47  86 108  37]
 [  8  12  12  25  58  63  22]]
F1 score: 0.2
Accuracy score: 0.204
