# Naive Model (Play-Level Data) v2

__Date:__ 11/5/2023 <br>
__Purpose:__ Program that uses the play-level data to predict expected yards gained <br>
__Model and data specifications:__
- Data: Plays dataframe and some stuff from games df (no outside supplemental data)
- Models: Basic supervised learning 
<br>__Updates from previous version:__ More efficient design of hyperparameter tuning

## Step 0: Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import mean_squared_error, confusion_matrix, roc_auc_score, auc, f1_score, accuracy_score, roc_curve, RocCurveDisplay, r2_score
import time 
import sys
sys.path.append('../preprocessing')
from Preprocessing_v3 import *
from DataLoader import load_data

# Regression models
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor, XGBClassifier

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier


## Step 1: Load Data

In [3]:
# Load data
[games_df, players_df, plays_df, tracking_df] = load_data()

loaded games df
shape: (136, 9)
-----
loaded players df
shape: (1683, 7)
-----
loaded plays df
shape: (12486, 35)
-----
loading tracking frames...
loaded tracking frames
shape: (12187398, 17)
returning 4 frames


## Step 2: Define Helper Functions

In [4]:
# Function that finishes preprocessing and does the train test split of plays df
def plays_train_test_split(plays_df_clean):
    # Drop game and play ID
    plays_df_clean = plays_df_clean.drop(['gameId', 'playId'], axis = 1)
    
    # Get X and y matrices
    y = plays_df_clean["TARGET"]
    X = plays_df_clean.drop(["TARGET"], axis = 1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=24)
    
    return X_train, X_test, y_train, y_test

In [5]:
# Helper function that does cross validation and gives best model
def run_cv(model, param_grid, X_train, y_train, X_val):
    print("training " + str(model))

    # Define the cross-validation strategy
    cv = KFold(n_splits=5)

    # Get the type of scoring for the grid search depending on regression or classification
    if model in [LinearRegression, Lasso, Ridge, ElasticNet, SVR, RandomForestRegressor, AdaBoostRegressor, XGBRegressor]:
        scoring_metric = 'neg_mean_squared_error'
    else:
        scoring_metric = 'f1_weighted'

    # Perform grid search with cross-validation
    start_time = time.time()
    grid_search = GridSearchCV(estimator=model(), param_grid=param_grid, cv=cv, scoring=scoring_metric)
    train_time = time.time() - start_time
    
    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_val)

    # Return the best model, y_pred
    return grid_search, y_pred, train_time # return the metric and model

## Step 3: Train models

In [6]:
# Dictionary of models and parameters
regression_models = {LinearRegression : {},
                     Lasso : {'alpha': [0.001, 0.01, 0.1, 1, 2]},
                     Ridge :  {'alpha': [0.001, 0.01, 0.1, 1, 2]},
                     ElasticNet : {'alpha': [0.001, 0.01, 0.1, 1, 2], 
                                   'l1_ratio': [0.1, 0.25, 0.5, 0.75, 0.9]},
                     # SVR : {'C': [0.01, 0.1, 1, 2, 10], 'kernel': ['linear', 'poly', 'rbf']},
                     RandomForestRegressor : {'n_estimators': [100, 500, 1000],
                                              'max_depth': [100, None]},
                     AdaBoostRegressor : {'n_estimators': [50, 100, 200],
                                          'learning_rate': [0.001, 0.01, 0.1, 1, 2]},
                     XGBRegressor : {'max_depth': [3, 5, 6, 7], 
                                     'learning_rate': [0.1, 0.3, 0.5], 
                                     'subsample': [0.5, 0.7, 1]}                    
}

classification_models = {
    # SVR : {'C': [0.01, 0.1, 1, 2, 10], 'kernel': ['linear', 'poly', 'rbf']},
    RandomForestClassifier : {'n_estimators': [100, 500, 1000],
                            'max_depth': [100, None]},
    AdaBoostClassifier : {'n_estimators': [50, 100, 200],
                        'learning_rate': [0.001, 0.01, 0.1, 1, 2]},
    XGBClassifier : {'max_depth': [3, 5, 6, 7], 
                    'learning_rate': [0.1, 0.3, 0.5], 
                    'subsample': [0.5, 0.7, 1]} ,
    LogisticRegression : {'penalty': ['l1', 'l2', 'elasticnet', None]},
    GaussianNB : {},
    Perceptron : {'penalty': ['l1', 'l2', 'elasticnet']}
}

In [7]:
include_nfl_features_params = [True, False]
bin_ouput_params = [True, False]

results_df = pd.DataFrame(columns = ['model', 'regression/classification', 'train_time',
                                                'MSE pre-bin', 'r2_score',
                                                'bin_output', 'include_nfl_features',
                                                'f1_score','confusion_matrix', 'accuracy_score'])
start_time = time.time()
for include_nfl_features in include_nfl_features_params:
    for bin_output in bin_ouput_params: 
        # Prepreocessing 
        plays_df_clean = preprocess_plays_df_naive_models(plays_df, games_df, include_nfl_features, bin_output)

        # Train test split
        X_train, X_test, y_train, y_test = plays_train_test_split(plays_df_clean)

        # Check if we need to do regression first
        if not bin_output:
            for model_class in regression_models.keys():
                # Train model
                model, y_pred, train_time = run_cv(model_class, regression_models[model_class], X_train, y_train, X_test)

                # Get regression accuracy
                mse = mean_squared_error(y_test, y_pred)
                r2 = r2_score(y_test, y_pred)

                # Bin both and get post-binned metrics
                bins = [float('-inf'), -2, 0, 1, 2.5, 5, 10, float('inf')]
                y_pred_binned = pd.cut(y_pred, bins = bins, labels = range(len(bins) - 1))
                y_test_binned = pd.cut(y_test, bins = bins, labels = range(len(bins) - 1))

                f1_metric = f1_score(y_test_binned, y_pred_binned, average = 'weighted')
                confusion_mat = confusion_matrix(y_test_binned, y_pred_binned)
                accuracy = accuracy_score(y_test_binned, y_pred_binned)

                # Record result
                new_row = pd.DataFrame({
                    'model': [str(model.best_estimator_)],
                    'regression/classification': ['classification'], 
                    'train_time': [train_time],
                    'MSE pre-bin': [mse], 
                    'r2_score': [r2],
                    'bin_output': [bin_output], 
                    'include_nfl_features': [include_nfl_features],
                    'f1_score': [f1_metric],
                    'confusion_matrix': [confusion_mat], 
                    'accuracy_score': [accuracy]
                })
                results_df = pd.concat([results_df, new_row], ignore_index=True)

        else:
            for model_class in classification_models.keys():
                # Train model
                model, y_pred, train_time = run_cv(model_class, classification_models[model_class], X_train, y_train, X_test)

                # Get accuracy metrics
                f1_metric = f1_score(y_test, y_pred, average = 'weighted')
                confusion_mat = confusion_matrix(y_test, y_pred)
                accuracy = accuracy_score(y_test, y_pred)

                # Record result
                new_row = pd.DataFrame({
                    'model': str(model.best_estimator_),
                    'regression/classification': ['classification'], 
                    'train_time': [train_time],
                    'MSE pre-bin': [np.nan], 
                    'r2_score': [np.nan],
                    'bin_output': [bin_output], 
                    'include_nfl_features': [include_nfl_features],
                    'f1_score': [f1_metric],
                    'confusion_matrix': [confusion_mat], 
                    'accuracy_score': [accuracy]
                })
                results_df = pd.concat([results_df, new_row], ignore_index=True)

print("total time: " + str(time.time() - start_time))
results_df.head()

final plays data shape: (6840, 289)
training <class 'sklearn.ensemble._forest.RandomForestClassifier'>
training <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


  results_df = pd.concat([results_df, new_row], ignore_index=True)


training <class 'xgboost.sklearn.XGBClassifier'>


## Step 4: Get best model

In [None]:
results_df.sort_values(by = 'f1_score', ascending = False).head()

Unnamed: 0,model,regression/classification,train_time,MSE pre-bin,r2_score,bin_output,include_nfl_features,f1_score,confusion_matrix,accuracy_score
1,"AdaBoostClassifier(learning_rate=1, n_estimato...",classification,5.5e-05,,,True,True,0.227863,"[[6, 8, 1, 3, 61, 7, 4], [6, 16, 7, 12, 117, 2...",0.283041
13,RandomForestClassifier(max_depth=100),classification,9e-05,,,True,False,0.222277,"[[3, 5, 10, 5, 53, 11, 3], [0, 20, 14, 15, 100...",0.272515
0,RandomForestClassifier(),classification,8.8e-05,,,True,True,0.219749,"[[2, 6, 9, 3, 55, 14, 1], [0, 16, 16, 15, 106,...",0.277193
15,"XGBClassifier(base_score=None, booster=None, c...",classification,2.7e-05,,,True,False,0.218626,"[[0, 6, 9, 9, 48, 16, 2], [3, 23, 19, 15, 95, ...",0.255556
2,"XGBClassifier(base_score=None, booster=None, c...",classification,3.6e-05,,,True,True,0.218369,"[[3, 8, 10, 4, 42, 18, 5], [3, 25, 16, 19, 83,...",0.246784


In [None]:
results_df.sort_values(by = 'f1_score', ascending = False).iloc[0]['model']

## Depreciated - run through on one dataset/model

In [None]:
# # Get X and y matrices
# y = plays_df_clean["TARGET"]
# X = plays_df_clean.drop(["TARGET"], axis = 1)

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=24)

# print('X_train shape : ', X_train.shape)
# print('y_train shape : ', y_train.shape)

# print('X_test shape  : ', X_test.shape)
# print('y_test shape  : ', y_test.shape)

In [None]:
# model = LinearRegression()

# model.__class__

# # Get the type of scoring for the grid search depending on regression or classification
# if model.__class__ in [LinearRegression, Lasso, Ridge, ElasticNet, SVR, RandomForestRegressor, AdaBoostRegressor, XGBRegressor]:
#     scoring_metric = 'neg_mean_squared_error'
# else:
#     scoring_metric = 'f1_weighted'

# print(scoring_metric)

In [None]:
# # Train model

    

# grid_search = GridSearchCV(estimator=LinearRegression(), param_grid={}, cv=KFold(5), scoring='neg_mean_squared_error')

# grid_search.fit(X_train, y_train)
# y_pred = grid_search.predict(X_test)


# # Get accuracy
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# # Bin both and get post-binned metrics
# bins = [float('-inf'), -2, 0, 1, 2.5, 5, 10, float('inf')]
# y_pred_binned = pd.cut(y_pred, bins = bins, labels = range(len(bins) - 1))
# y_test_binned = pd.cut(y_test, bins = bins, labels = range(len(bins) - 1))

# f1_metric = f1_score(y_test_binned, y_pred_binned, average = 'weighted')
# confusion_mat = confusion_matrix(y_test_binned, y_pred_binned)
# accuracy = accuracy_score(y_test_binned, y_pred_binned)

# # Record result
# new_row = pd.DataFrame({
#     'model': [str(model.best_estimator_)],
#     'regression/classification': ['classification'], 
#     'train_time': [train_time],
#     'MSE pre-bin': [mse], 
#     'r2_score': [r2],
#     'bin_output': [bin_output], 
#     'include_nfl_features': [include_nfl_features],
#     'f1_score': [f1_metric],
#     'confusion_matrix': [confusion_mat], 
#     'accuracy_score': [accuracy]
# })
# print(new_row)

In [None]:
# # Prepreocessing 
# plays_df_clean = preprocess_plays_df_naive_models(plays_df, games_df, True, False)

# # Train test split
# X_train, X_test, y_train, y_test = plays_train_test_split(plays_df_clean)

# start_time = time.time()
# model = XGBRegressor(learning_rate=1, n_estimators=200)
# model.fit(X = X_train, y = y_train)
# print("training time: " + str(time.time() - start_time))

In [None]:
# y_pred = model.predict(X_test)

In [None]:
# print("Confusion matrix: \n" + str(confusion_matrix(y_test, y_pred)))
# print("F1 score: " + str(round(f1_score(y_test, y_pred, average='weighted'), 3)))
# print("Accuracy score: " + str(round(accuracy_score(y_test, y_pred), 3)))

In [None]:
# print("MSE: \n" + str(mean_squared_error(y_test, y_pred)))

In [None]:
# bins = [float('-inf'), -2, 0, 1, 2.5, 5, 10, float('inf')]
# y_pred_binned = pd.cut(y_pred, bins = bins, labels = range(len(bins) - 1))
# y_test_binned = pd.cut(y_test, bins = bins, labels = range(len(bins) - 1))

In [None]:
# print("Confusion matrix: \n" + str(confusion_matrix(y_test_binned, y_pred_binned)))
# print("F1 score: " + str(round(f1_score(y_test_binned, y_pred_binned, average='weighted'), 3)))
# print("Accuracy score: " + str(round(accuracy_score(y_test_binned, y_pred_binned), 3)))