# Naive Model (Play-Level Data) v1

__Date:__ 11/5/2023 <br>
__Purpose:__ Program that uses the play-level data to predict expected yards gained <br>
__Model and data specifications:__
- Data: Plays dataframe and some stuff from games df (no outside supplemental data)
- Models: Basic supervised learning 
<br>__Updates from previous version:__ Includes hyper-parameter tuning

## Step 0: Import Libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import mean_squared_error, confusion_matrix, roc_auc_score, auc, f1_score, accuracy_score, roc_curve, RocCurveDisplay, r2_score
import time 
import sys
sys.path.append('../preprocessing')
from Preprocessing_v1 import *
from DataLoader import load_data

# Regression models
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor, XGBClassifier

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier


## Step 1: Load Data

In [4]:
# Load data
[games_df, players_df, plays_df, tracking_df] = load_data()

loaded games df
shape: (136, 9)
-----
loaded players df
shape: (1683, 7)
-----
loaded plays df
shape: (12486, 35)
-----
loading tracking frames...
loaded tracking frames
shape: (12187398, 17)
returning 4 frames


## Step 2: Define Helper Functions

In [5]:
# Function that finishes preprocessing and does the train test split of plays df
def plays_train_test_split(plays_df_clean):
    # Drop game and play ID
    plays_df_clean = plays_df_clean.drop(['gameId', 'playId'], axis = 1)
    
    # Get X and y matrices
    y = plays_df_clean["TARGET"]
    X = plays_df_clean.drop(["TARGET"], axis = 1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=24)
    
    return X_train, X_test, y_train, y_test

In [6]:
# Helper function that does cross validation and gives best model
def run_cv(model, param_grid, X_train, y_train, X_val):
    print("training " + type(model).__name__)

    # Define the cross-validation strategy
    cv = KFold(n_splits=5)

    # Perform grid search with cross-validation
    start_time = time.time()
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring='f1_weighted')
    train_time = time.time() - start_time
    
    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_val)

    # Return the best model, y_pred
    return grid_search, y_pred, train_time # return the metric and model

In [9]:
# Helper fucntions that do cross validation

def run_lasso(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 2]}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(Lasso(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_ridge(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 2]}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(Ridge(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_elastic_net(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 2],
                  'l1_ratio': [0.1, 0.25, 0.5, 0.75, 0.9]}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(ElasticNet(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_SVR(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'C': [0.01, 0.1, 1, 2, 10],
                  'kernel': ['linear', 'poly', 'rbf']}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(SVR(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_random_forest_reg(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'n_estimators': [100, 500, 1000],
                  'max_depth': [100, None]}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(RandomForestRegressor(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_adaboost_reg(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'n_estimators': [50, 100, 200],
                  'learning_rate': [0.001, 0.01, 0.1, 1, 2]}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(AdaBoostRegressor(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_xgb_reg(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'max_depth': [3, 5, 6, 7],
    'learning_rate': [0.1, 0.3, 0.5],
    'subsample': [0.5, 0.7, 1]}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(XGBRegressor(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_xgb_classifier(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'max_depth': [3, 5, 6, 7],
    'learning_rate': [0.1, 0.3, 0.5],
    'subsample': [0.5, 0.7, 1]}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(XGBClassifier(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_logistic_classifier(X_train, y_train, X_test):
    # Define the hyperparameter grid for regularization strength
    param_grid = {'penalty': ['l1', 'l2', 'elasticnet', None]}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(LogisticRegression(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_SVC(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'C': [0.01, 0.1, 1, 2, 10],
                  'kernel': ['linear', 'poly', 'rbf']}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(SVC(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_random_forest_classifier(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'n_estimators': [100, 500, 1000],
                  'max_depth': [100, None]}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(RandomForestClassifier(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_adaboost_classifier(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'n_estimators': [50, 100, 200],
                  'learning_rate': [0.001, 0.01, 0.1, 1, 2]}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(AdaBoostClassifier(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time


def run_gaussianNB(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 2]}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(GaussianNB(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

def run_perceptron(X_train, y_train, X_test):
    # Define the hyperparameter grid 
    param_grid = {'penalty': ['l1', 'l2', 'elasticnet']}

    # Get the best model and predictions
    grid_search, y_pred, train_time = run_cv(Perceptron(), param_grid, X_train, y_train, X_test)
    return grid_search, y_pred, train_time

# Linear regression (no tuning necessary)
def run_linear_reg(X_train, y_train, X_test):
    # Train model
    model = LinearRegression()
    print("training " + type(model).__name__)

    start_time = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_time

    # Make predictions
    y_pred = model.predict(X_test)

    return model, y_pred, train_time

## Step 3: Train models

In [10]:
include_nfl_features_params = [True, False]
bin_ouput_params = [True, False]

results_df = pd.DataFrame(columns = ['model', 'regression/classification', 'train_time',
                                                'MSE pre-bin', 'r2_score',
                                                'bin_output', 'include_nfl_features',
                                                'f1_score','confusion_matrix', 'accuracy_score'])

regression_models = [run_linear_reg, run_lasso, run_ridge, run_elastic_net, run_SVR, run_random_forest_reg, run_adaboost_reg, run_xgb_reg]
classification_models = [run_logistic_classifier, run_perceptron, run_gaussianNB, run_SVC, run_random_forest_classifier, run_adaboost_classifier, run_xgb_classifier]

for include_nfl_features in include_nfl_features_params:
    for bin_output in bin_ouput_params: 
        # Prepreocessing 
        plays_df_clean = preprocess_plays_df_naive_models(plays_df, games_df, include_nfl_features, bin_output)

        # Train test split
        X_train, X_test, y_train, y_test = plays_train_test_split(plays_df_clean)

        # Check if we need to do regression first
        if not bin_output:
            for model_class in regression_models:
                # Train model
                model, y_pred, train_time = model_class(X_train, y_train, X_test)

                # Get accuracy
                mse = mean_squared_error(y_test, y_pred)
                r2 = r2_score(y_test, y_pred)

                # Bin both and get post-binned metrics
                bins = [float('-inf'), -2, 0, 1, 2.5, 5, 10, float('inf')]
                y_pred_binned = pd.cut(y_pred, bins = bins, labels = range(len(bins) - 1))
                y_test_binned = pd.cut(y_test, bins = bins, labels = range(len(bins) - 1))

                f1_metric = f1_score(y_test_binned, y_pred_binned, average = 'weighted')
                confusion_mat = confusion_matrix(y_test_binned, y_pred_binned)
                accuracy = accuracy_score(y_test_binned, y_pred_binned)

                # Record result
                new_row = pd.DataFrame({
                    'model': [type(model).__name__],
                    'regression/classification': ['classification'], 
                    'train_time': [train_time],
                    'MSE pre-bin': [mse], 
                    'r2_score': [r2],
                    'bin_output': [bin_output], 
                    'include_nfl_features': [include_nfl_features],
                    'f1_score': [f1_metric],
                    'confusion_matrix': [confusion_mat], 
                    'accuracy_score': [accuracy]
                })
                results_df = pd.concat([results_df, new_row], ignore_index=True)

        else:
            for model_class in classification_models:
                # Train model
                model, y_pred, train_time = model_class(X_train, y_train, X_test)

                # Get accuracy metrics
                f1_metric = f1_score(y_test, y_pred, average = 'weighted')
                confusion_mat = confusion_matrix(y_test, y_pred)
                accuracy = accuracy_score(y_test, y_pred)

                # Record result
                new_row = pd.DataFrame({
                    'model': [type(model).__name__],
                    'regression/classification': ['classification'], 
                    'train_time': [train_time],
                    'MSE pre-bin': [np.nan], 
                    'r2_score': [np.nan],
                    'bin_output': [bin_output], 
                    'include_nfl_features': [include_nfl_features],
                    'f1_score': [f1_metric],
                    'confusion_matrix': [confusion_mat], 
                    'accuracy_score': [accuracy]
                })
                results_df = pd.concat([results_df, new_row], ignore_index=True)


results_df.head()

final plays data shape: (6840, 289)
training LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

training Perceptron


## Step 4: Get best model

In [None]:
results_df.sort_values(by = 'f1_score', ascending = False)

Unnamed: 0,model,regression/classification,train_time,MSE pre-bin,r2_score,bin_output,include_nfl_features,f1_score,confusion_matrix,accuracy_score
27,RandomForestRegressor,classification,15.563212,45.89559,-0.0769846,False,False,0.222991,"[[0, 0, 2, 7, 49, 31, 1], [0, 1, 11, 16, 100, ...",0.283041
6,XGBClassifier,classification,3.167168,,,True,True,0.22258,"[[0, 10, 7, 4, 48, 17, 4], [1, 24, 10, 23, 99,...",0.265497
21,XGBClassifier,classification,1.765484,,,True,False,0.222254,"[[0, 9, 6, 8, 52, 7, 8], [1, 20, 17, 13, 103, ...",0.269591
14,XGBRegressor,classification,0.570791,46.43932,-0.08974371,False,True,0.219132,"[[0, 2, 3, 8, 44, 33, 0], [1, 6, 11, 14, 100, ...",0.269006
29,XGBRegressor,classification,0.493404,47.69684,-0.1192527,False,False,0.217885,"[[1, 0, 5, 10, 42, 30, 2], [1, 9, 4, 20, 97, 4...",0.270175
20,AdaBoostClassifier,classification,0.876631,,,True,False,0.217713,"[[1, 7, 7, 3, 64, 7, 1], [0, 18, 14, 7, 131, 1...",0.296491
4,RandomForestClassifier,classification,2.198357,,,True,True,0.217139,"[[2, 9, 6, 2, 56, 14, 1], [0, 19, 16, 13, 113,...",0.276023
19,RandomForestClassifier,classification,1.967382,,,True,False,0.216091,"[[2, 7, 8, 4, 48, 18, 3], [0, 19, 19, 19, 99, ...",0.267836
12,RandomForestRegressor,classification,19.386551,44.23136,-0.03793183,False,True,0.213482,"[[0, 1, 3, 8, 46, 31, 1], [0, 2, 9, 17, 95, 59...",0.276023
5,AdaBoostClassifier,classification,0.963426,,,True,True,0.213334,"[[1, 8, 2, 2, 69, 6, 2], [0, 19, 11, 7, 136, 1...",0.298246


## Depreciated - run through on one dataset/model

In [None]:
# Get X and y matrices
y = plays_df_clean["TARGET"]
X = plays_df_clean.drop(["TARGET"], axis = 1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=24)

print('X_train shape : ', X_train.shape)
print('y_train shape : ', y_train.shape)

print('X_test shape  : ', X_test.shape)
print('y_test shape  : ', y_test.shape)

X_train shape :  (5127, 348)
y_train shape :  (5127,)
X_test shape  :  (1710, 348)
y_test shape  :  (1710,)


In [None]:
start_time = time.time()
model = RandomForestRegressor()
model.fit(X = X_train, y = y_train)
print("training time: " + str(time.time() - start_time))

training time: 15.882904052734375


In [None]:
type(model).__name__

'RandomForestRegressor'

In [None]:
y_pred = model.predict(X_test)

In [None]:
print("MSE: \n" + str(mean_squared_error(y_test, y_pred)))

MSE: 
45.21479333333333


In [None]:
bins = [float('-inf'), -2, 0, 1, 2.5, 5, 10, float('inf')]
y_pred_binned = pd.cut(y_pred, bins = bins, labels = range(len(bins) - 1))
y_test_binned = pd.cut(y_test, bins = bins, labels = range(len(bins) - 1))

In [None]:
print("Confusion matrix: \n" + str(confusion_matrix(y_test_binned, y_pred_binned)))
print("F1 score: " + str(round(f1_score(y_test_binned, y_pred_binned, average='weighted'), 3)))
print("Accuracy score: " + str(round(accuracy_score(y_test_binned, y_pred_binned), 3)))

Confusion matrix: 
[[  0   0   2   7  48  32   1]
 [  0   2  12  16 103  47   4]
 [  0   3  12  19  92  50   9]
 [  0   2   4  18 136  58   7]
 [  0   0   0  29 292 164  16]
 [  0   0   0  10 165 139  11]
 [  0   0   0   8  95  87  10]]
F1 score: 0.216
Accuracy score: 0.277
