# Naive Model (Play-Level Data) v0 

__Date:__ 11/5/2023 <br>
__Purpose:__ Program that uses the play-level data to predict expected yards gained <br>
__Model and data specifications:__
- Data: Plays dataframe and some stuff from games df (no outside supplemental data)
- Models: Basic supervised learning

## Step 0: Import Libraries

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, confusion_matrix, roc_auc_score, auc, f1_score, accuracy_score, roc_curve, RocCurveDisplay, r2_score
import time 
import sys
sys.path.append('../preprocessing')
from Preprocessing_v1 import *
from DataLoader import load_data

# Regression models
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
import xgboost as xgb

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier


## Step 1: Load Data

In [2]:
# Load data
[games_df, players_df, plays_df, tracking_df] = load_data()

loaded games df
shape: (136, 9)
-----
loaded players df
shape: (1683, 7)
-----
loaded plays df
shape: (12486, 35)
-----
loading tracking frames...
loaded tracking frames
shape: (12187398, 17)
returning 4 frames


## Step 2: Train models

In [3]:
# Function that finishes preprocessing and does the train test split of plays df
def plays_train_test_split(plays_df_clean):
    # Drop game and play ID
    plays_df_clean = plays_df_clean.drop(['gameId', 'playId'], axis = 1)
    
    # Get X and y matrices
    y = plays_df_clean["TARGET"]
    X = plays_df_clean.drop(["TARGET"], axis = 1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=24)

    # print('X_train shape : ', X_train.shape)
    # print('y_train shape : ', y_train.shape)

    # print('X_test shape  : ', X_test.shape)
    # print('y_test shape  : ', y_test.shape)
    
    return X_train, X_test, y_train, y_test

In [10]:
include_nfl_features_params = [True, False]
bin_ouput_params = [True, False]

results_df = pd.DataFrame(columns = ['model', 'regression/classification', 'train_time',
                                                'MSE pre-bin', 'r2_score',
                                                'bin_output', 'include_nfl_features',
                                                'f1_score','confusion_matrix', 'accuracy_score'])

regression_models = [LinearRegression, Lasso, Ridge, ElasticNet, SVR, RandomForestRegressor, AdaBoostRegressor]
classification_models = [LogisticRegression, Perceptron, SVC, GaussianNB, RandomForestClassifier, AdaBoostClassifier]

for include_nfl_features in include_nfl_features_params:
    for bin_output in bin_ouput_params: 
        # Prepreocessing 
        plays_df_clean = preprocess_plays_df_naive_models(plays_df, games_df, include_nfl_features, bin_output)

        # Train test split
        X_train, X_test, y_train, y_test = plays_train_test_split(plays_df_clean)

        # Check if we need to do regression first
        if not bin_output:
            for model_class in regression_models:
                # Train model
                model = model_class()
                print("training " + type(model).__name__)

                start_time = time.time()
                model.fit(X_train, y_train)
                train_time = time.time() - start_time

                # Make predictions
                y_pred = model.predict(X_test)

                # Get accuracy
                mse = mean_squared_error(y_test, y_pred)
                r2 = r2_score(y_test, y_pred)

                # Bin both and get post-binned metrics
                bins = [float('-inf'), -2, 0, 1, 2.5, 5, 10, float('inf')]
                y_pred_binned = pd.cut(y_pred, bins = bins, labels = range(len(bins) - 1))
                y_test_binned = pd.cut(y_test, bins = bins, labels = range(len(bins) - 1))

                f1_metric = f1_score(y_test_binned, y_pred_binned, average = 'weighted')
                confusion_mat = confusion_matrix(y_test_binned, y_pred_binned)
                accuracy = accuracy_score(y_test_binned, y_pred_binned)

                # Record result
                new_row = pd.DataFrame({
                    'model': [type(model).__name__],
                    'regression/classification': ['classification'], 
                    'train_time': [train_time],
                    'MSE pre-bin': [mse], 
                    'r2_score': [r2],
                    'bin_output': [bin_output], 
                    'include_nfl_features': [include_nfl_features],
                    'f1_score': [f1_metric],
                    'confusion_matrix': [confusion_mat], 
                    'accuracy_score': [accuracy]
                })
                results_df = pd.concat([results_df, new_row], ignore_index=True)

        else:
            for model_class in classification_models:
                # Train model
                model = model_class()
                print("training " + type(model).__name__)

                start_time = time.time()
                model.fit(X_train, y_train)
                train_time = time.time() - start_time

                # Make predictions
                y_pred = model.predict(X_test)

                # Get accuracy metrics
                f1_metric = f1_score(y_test, y_pred, average = 'weighted')
                confusion_mat = confusion_matrix(y_test, y_pred)
                accuracy = accuracy_score(y_test, y_pred)

                # Record result
                new_row = pd.DataFrame({
                    'model': [type(model).__name__],
                    'regression/classification': ['classification'], 
                    'train_time': [train_time],
                    'MSE pre-bin': [np.nan], 
                    'r2_score': [np.nan],
                    'bin_output': [bin_output], 
                    'include_nfl_features': [include_nfl_features],
                    'f1_score': [f1_metric],
                    'confusion_matrix': [confusion_mat], 
                    'accuracy_score': [accuracy]
                })
                results_df = pd.concat([results_df, new_row], ignore_index=True)


results_df.head()

final plays data shape: (6840, 289)
training LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


training Perceptron


  results_df = pd.concat([results_df, new_row], ignore_index=True)


training SVC


  results_df = pd.concat([results_df, new_row], ignore_index=True)
  results_df = pd.concat([results_df, new_row], ignore_index=True)


training GaussianNB
training RandomForestClassifier


  results_df = pd.concat([results_df, new_row], ignore_index=True)


training AdaBoostClassifier


  results_df = pd.concat([results_df, new_row], ignore_index=True)


final plays data shape: (6840, 289)
training LinearRegression


  results_df = pd.concat([results_df, new_row], ignore_index=True)
  results_df = pd.concat([results_df, new_row], ignore_index=True)
  results_df = pd.concat([results_df, new_row], ignore_index=True)
  results_df = pd.concat([results_df, new_row], ignore_index=True)


training Lasso
training Ridge
training ElasticNet
training SVR


  results_df = pd.concat([results_df, new_row], ignore_index=True)


training RandomForestRegressor


  results_df = pd.concat([results_df, new_row], ignore_index=True)


training AdaBoostRegressor


  results_df = pd.concat([results_df, new_row], ignore_index=True)


final plays data shape: (6840, 289)
training LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  results_df = pd.concat([results_df, new_row], ignore_index=True)


training Perceptron


  results_df = pd.concat([results_df, new_row], ignore_index=True)


training SVC


  results_df = pd.concat([results_df, new_row], ignore_index=True)
  results_df = pd.concat([results_df, new_row], ignore_index=True)


training GaussianNB
training RandomForestClassifier


  results_df = pd.concat([results_df, new_row], ignore_index=True)


training AdaBoostClassifier


  results_df = pd.concat([results_df, new_row], ignore_index=True)


final plays data shape: (6840, 289)
training LinearRegression
training Lasso


  results_df = pd.concat([results_df, new_row], ignore_index=True)
  results_df = pd.concat([results_df, new_row], ignore_index=True)
  results_df = pd.concat([results_df, new_row], ignore_index=True)
  results_df = pd.concat([results_df, new_row], ignore_index=True)


training Ridge
training ElasticNet
training SVR


  results_df = pd.concat([results_df, new_row], ignore_index=True)


training RandomForestRegressor


  results_df = pd.concat([results_df, new_row], ignore_index=True)


training AdaBoostRegressor


  results_df = pd.concat([results_df, new_row], ignore_index=True)


Unnamed: 0,model,regression/classification,train_time,MSE pre-bin,r2_score,bin_output,include_nfl_features,f1_score,confusion_matrix,accuracy_score
0,LogisticRegression,classification,0.621485,,,True,True,0.132776,"[[0, 0, 0, 0, 90, 0, 0], [0, 0, 0, 0, 184, 0, ...",0.292982
1,Perceptron,classification,0.353455,,,True,True,0.132776,"[[0, 0, 0, 0, 90, 0, 0], [0, 0, 0, 0, 184, 0, ...",0.292982
2,SVC,classification,7.149053,,,True,True,0.132776,"[[0, 0, 0, 0, 90, 0, 0], [0, 0, 0, 0, 184, 0, ...",0.292982
3,GaussianNB,classification,0.034382,,,True,True,0.183787,"[[24, 14, 8, 4, 16, 11, 13], [37, 41, 28, 11, ...",0.179532
4,RandomForestClassifier,classification,2.083192,,,True,True,0.214815,"[[2, 7, 8, 2, 55, 13, 3], [0, 22, 13, 13, 113,...",0.271345


## Step 3: Get best model

In [12]:
results_df.sort_values(by = 'f1_score', ascending = False)

Unnamed: 0,model,regression/classification,train_time,MSE pre-bin,r2_score,bin_output,include_nfl_features,f1_score,confusion_matrix,accuracy_score
17,RandomForestClassifier,classification,1.948296,,,True,False,0.218807,"[[3, 6, 8, 5, 53, 12, 3], [1, 21, 16, 18, 98, ...",0.265497
18,AdaBoostClassifier,classification,0.825116,,,True,False,0.217713,"[[1, 7, 7, 3, 64, 7, 1], [0, 18, 14, 7, 131, 1...",0.296491
4,RandomForestClassifier,classification,2.083192,,,True,True,0.214815,"[[2, 7, 8, 2, 55, 13, 3], [0, 22, 13, 13, 113,...",0.271345
5,AdaBoostClassifier,classification,0.953678,,,True,True,0.213334,"[[1, 8, 2, 2, 69, 6, 2], [0, 19, 11, 7, 136, 1...",0.298246
24,RandomForestRegressor,classification,15.790036,46.46909,-0.09044223,False,False,0.211944,"[[0, 0, 2, 6, 52, 30, 0], [0, 1, 10, 17, 102, ...",0.269006
11,RandomForestRegressor,classification,18.688377,44.29121,-0.03933626,False,True,0.211279,"[[0, 1, 4, 4, 52, 29, 0], [0, 1, 10, 16, 98, 5...",0.273684
6,LinearRegression,classification,0.123169,61908570000000.0,-1452745000000.0,False,True,0.209398,"[[1, 2, 4, 7, 46, 28, 2], [3, 2, 3, 25, 88, 61...",0.269591
8,Ridge,classification,0.040088,42.36483,0.005868126,False,True,0.205164,"[[0, 2, 4, 3, 52, 29, 0], [1, 2, 2, 25, 89, 64...",0.268421
19,LinearRegression,classification,0.116761,4461528000000.0,-104694100000.0,False,False,0.193943,"[[0, 1, 3, 8, 43, 32, 3], [3, 2, 2, 19, 103, 5...",0.255556
21,Ridge,classification,0.037083,42.83058,-0.005061226,False,False,0.191236,"[[0, 1, 4, 7, 45, 33, 0], [0, 3, 0, 20, 102, 5...",0.254386


## Depreciated - run through on one dataset/model

In [None]:
# Get X and y matrices
y = plays_df_clean["TARGET"]
X = plays_df_clean.drop(["TARGET"], axis = 1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=24)

print('X_train shape : ', X_train.shape)
print('y_train shape : ', y_train.shape)

print('X_test shape  : ', X_test.shape)
print('y_test shape  : ', y_test.shape)

X_train shape :  (5127, 348)
y_train shape :  (5127,)
X_test shape  :  (1710, 348)
y_test shape  :  (1710,)


In [None]:
start_time = time.time()
model = RandomForestRegressor()
model.fit(X = X_train, y = y_train)
print("training time: " + str(time.time() - start_time))

training time: 15.882904052734375


In [None]:
type(model).__name__

'RandomForestRegressor'

In [None]:
y_pred = model.predict(X_test)

In [None]:
print("MSE: \n" + str(mean_squared_error(y_test, y_pred)))

MSE: 
45.21479333333333


In [None]:
bins = [float('-inf'), -2, 0, 1, 2.5, 5, 10, float('inf')]
y_pred_binned = pd.cut(y_pred, bins = bins, labels = range(len(bins) - 1))
y_test_binned = pd.cut(y_test, bins = bins, labels = range(len(bins) - 1))

In [None]:
print("Confusion matrix: \n" + str(confusion_matrix(y_test_binned, y_pred_binned)))
print("F1 score: " + str(round(f1_score(y_test_binned, y_pred_binned, average='weighted'), 3)))
print("Accuracy score: " + str(round(accuracy_score(y_test_binned, y_pred_binned), 3)))

Confusion matrix: 
[[  0   0   2   7  48  32   1]
 [  0   2  12  16 103  47   4]
 [  0   3  12  19  92  50   9]
 [  0   2   4  18 136  58   7]
 [  0   0   0  29 292 164  16]
 [  0   0   0  10 165 139  11]
 [  0   0   0   8  95  87  10]]
F1 score: 0.216
Accuracy score: 0.277
