In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
import acquire_plays_data
import prep_plays
import wrangle_plays_data
import re
plt.rc("figure", figsize=(12, 7))
plt.rc("font", size=14)

Acquire.py Loaded Successfully
Prep.py Loaded Successfully
Wrangle.py Loaded Successfully


In [2]:
df = acquire_plays_data.get_plays_data()

In [3]:
df = prep_plays.prep_plays_data()

In [4]:
X_train, y_train, X_validate, y_validate, X_test, y_test = wrangle_plays_data.train_validate_test(df)

In [5]:
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE, SelectKBest, f_regression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

In [6]:
X_train_scaled, X_validate_scaled, X_test_scaled = wrangle_plays_data.min_max_scale(X_train, X_validate, X_test)

In [7]:
def rfe_ranker(X_train_scaled, y_train, k):
    '''
    Uses Recursive Feature Elimination (RFE) to rank the given features in order of their usefulness in
    predicting a win with a linear regression model.
    '''
    # creating linear regression object
    lm = LinearRegression()

    # fitting linear regression model to features 
    lm.fit(X_train_scaled, y_train)

    # creating recursive feature elimination object and specifying to rank 5 of the best features
    rfe = RFE(lm, k)

    # using rfe object to transform features 
    x_rfe = rfe.fit_transform(X_train_scaled, y_train)

    feature_mask = rfe.support_

    # creating train df for rfe object 
    rfe_train = X_train_scaled

    # creating list of the top features per rfe
    rfe_features = rfe_train.loc[:,feature_mask].columns.tolist()

    # creating ranked list 
    feature_ranks = rfe.ranking_

    # creating list of feature names
    feature_names = rfe_train.columns.tolist()

    # create df that contains all features and their ranks
    rfe_ranks_df = pd.DataFrame({'Feature': feature_names, 'Rank': feature_ranks})

    # return df sorted by rank
    return rfe_ranks_df.sort_values('Rank')

In [8]:
X_train.T

Unnamed: 0,12331,9633,13947,15562,5458,9060,7240,11494,3899,4911,...,13238,4813,15921,5634,7271,14217,15210,7739,14947,641
quarter,4.0,4.0,4.0,3.0,4.0,3.0,2.0,3.0,4.0,4.0,...,1.0,3.0,1.0,2.0,1.0,4.0,1.0,1.0,4.0,4.0
down,1.0,3.0,1.0,2.0,1.0,3.0,3.0,1.0,1.0,1.0,...,2.0,2.0,3.0,1.0,2.0,1.0,2.0,3.0,2.0,2.0
yardsToGo,15.0,9.0,10.0,10.0,10.0,7.0,10.0,10.0,10.0,10.0,...,9.0,8.0,4.0,10.0,17.0,10.0,10.0,8.0,1.0,10.0
team_by_comp_yds,5.0,31.0,14.0,24.0,10.0,7.0,17.0,17.0,30.0,21.0,...,9.0,20.0,28.0,24.0,10.0,20.0,31.0,1.0,15.0,32.0
defendersInTheBox,5.0,6.0,5.0,5.0,6.0,6.0,5.0,7.0,6.0,6.0,...,7.0,6.0,6.0,6.0,4.0,7.0,6.0,5.0,6.0,6.0
numberOfPassRushers,4.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,6.0,4.0,...,4.0,5.0,4.0,5.0,4.0,5.0,4.0,5.0,5.0,4.0
QB_under_pressure,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
absoluteYardlineNumber,65.0,63.0,75.0,31.0,50.0,106.0,20.0,94.0,70.0,76.0,...,27.0,71.0,65.0,66.0,59.0,40.0,70.0,74.0,39.0,50.0
epa,-0.434026,-1.64332,-0.22306,-0.755375,-0.507168,-1.03667,3.39622,-0.547175,0.113275,-0.0955019,...,0.906769,-0.707462,1.92822,0.0986205,-0.382263,-0.251191,0.973164,-1.71468,-0.39161,-0.334711
playResult,0.0,0.0,0.0,-4.0,0.0,0.0,10.0,0.0,6.0,6.0,...,10.0,0.0,8.0,5.0,6.0,3.0,11.0,0.0,0.0,6.0


In [9]:
X_train.QB_under_pressure.value_counts()

0.0    6449
1.0     691
Name: QB_under_pressure, dtype: int64

In [10]:
y_train

Unnamed: 0,pass_stopped
12331,1
9633,1
13947,1
15562,0
5458,1
...,...
14217,0
15210,0
7739,1
14947,1


In [11]:
rfe_ranker(X_train_scaled, y_train, k=8)

Unnamed: 0,Feature,Rank
1,yardsToGo,1
2,defendersInTheBox,1
3,numberOfPassRushers,1
4,QB_under_pressure,1
5,epa,1
6,DL,1
7,DB,1
8,four_three,1
0,quarter,2


In [12]:
y_train

Unnamed: 0,pass_stopped
12331,1
9633,1
13947,1
15562,0
5458,1
...,...
14217,0
15210,0
7739,1
14947,1


In [13]:
X_train_scaled = X_train_scaled[['yardsToGo','defendersInTheBox','numberOfPassRushers','QB_under_pressure','epa','DL','DB']]
X_validate_scaled = X_validate_scaled[['yardsToGo','defendersInTheBox','numberOfPassRushers','QB_under_pressure','epa','DL','DB']]
X_test_scaled = X_test_scaled[['yardsToGo','defendersInTheBox','numberOfPassRushers','QB_under_pressure','epa','DL','DB']]

In [14]:
def knn(X_train_scaled, y_train, k):
    # KNN object
    knn = KNeighborsClassifier(n_neighbors=k, weights='uniform')

    # Fit the model
    knn = knn.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = knn.predict(X_train_scaled)

    # Estimate the probability
    y_pred_proba = knn.predict_proba(X_train_scaled)

    print('The k-neareast neighbor accuracy : {:.2f}\n'
                 .format(knn.score(X_train_scaled, y_train)))
    print(f'----------------------')
    # Confusion matrix
    #print(f'Confusion Matrix: \n\n {pd.crosstab(y_train, y_pred)}\n' )
    print(f'----------------------') 
    print("K-Nearest Neighbor Classification Report:\n", classification_report(y_train, y_pred))

    return knn

In [15]:
knn.score(X_train_scaled, y_train)

AttributeError: 'function' object has no attribute 'score'

In [16]:
print(X_train_scaled.shape)
print(y_train.shape)

(7140, 7)
(7140, 1)


In [17]:
knn(X_train_scaled, y_train, k=5)

The k-neareast neighbor accuracy : 0.87

----------------------
----------------------
K-Nearest Neighbor Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.88      0.90      4642
           1       0.79      0.84      0.82      2498

    accuracy                           0.87      7140
   macro avg       0.85      0.86      0.86      7140
weighted avg       0.87      0.87      0.87      7140



KNeighborsClassifier()

In [18]:
def knn_val(X_validate_scaled, y_validate, k):
    # KNN object
    knn = KNeighborsClassifier(n_neighbors=k, weights='uniform')
    
    # Fit the model
    knn = knn.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = knn.predict(X_validate_scaled)

    # Estimate the probability
    y_pred_proba = knn.predict_proba(X_validate_scaled)

    print('The k-neareast neighbor accuracy : {:.2f}\n'
                 .format(knn.score(X_validate_scaled, y_validate)))
    print(f'----------------------')
    # Confusion matrix
    #print(f'Confusion Matrix: \n\n {pd.crosstab(y_train, y_pred)}\n' )
    print(f'----------------------') 
    print("K-Nearest Neighbor Classification Report:\n", classification_report(y_validate, y_pred))

    return knn_val

In [19]:
knn_val(X_validate_scaled, y_validate, k=5)

The k-neareast neighbor accuracy : 0.80

----------------------
----------------------
K-Nearest Neighbor Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.83      0.85      3095
           1       0.70      0.76      0.73      1665

    accuracy                           0.80      4760
   macro avg       0.78      0.79      0.79      4760
weighted avg       0.81      0.80      0.81      4760



<function __main__.knn_val(X_validate_scaled, y_validate, k)>

In [20]:
def logistic_regression(X_train_scaled, y_train):
    '''
    This function takes in X_train (features using for model) and y_train (target 'win') and performs logistic
    regression giving us accuracy of the model and the classification report
    '''
    # Calling out funtion
    logit = LogisticRegression()

    # Fit the training data set
    logit = logit.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = logit.predict(X_train_scaled)

    #Accuracy of model
    score = logit.score(X_train_scaled, y_train)

    print(f'The logistic regression models accuracy is {round(score * 100,2)}%\n')
    print(f'----------------------')     
    #print(f'Confusion Matrix\n\n {pd.crosstab(y_train, y_pred)}\n') 
    print(f'----------------------') 
    
    # Coefficients for each feature  
    coef_df = pd.DataFrame(logit.coef_)

    # List comprehension for columns in X_train
    names = [column for column in X_train_scaled.columns]
    
    # Renaming columns to their names for coef_df
    coef_df.columns = names

    print(f'Classification Report\n {classification_report(y_train, y_pred)}')
    return coef_df, logit

In [21]:
from sklearn.linear_model import LogisticRegression, LinearRegression

In [22]:
coef_df, logit = logistic_regression(X_train_scaled, y_train)

The logistic regression models accuracy is 80.1%

----------------------
----------------------
Classification Report
               precision    recall  f1-score   support

           0       0.80      0.92      0.86      4642
           1       0.79      0.58      0.67      2498

    accuracy                           0.80      7140
   macro avg       0.80      0.75      0.76      7140
weighted avg       0.80      0.80      0.79      7140



In [23]:
def logistic_regression_val(X_validate_scaled, y_validate):
    '''
    This function takes in X_train (features using for model) and y_train (target 'win') and performs logistic
    regression giving us accuracy of the model and the classification report
    '''
    # Calling out funtion
    logit = LogisticRegression()

    # Fit the training data set
    logit = logit.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = logit.predict(X_validate_scaled)

    #Accuracy of model
    score = logit.score(X_validate_scaled, y_validate)

    print(f'The logistic regression models accuracy is {round(score * 100,2)}%\n')
    print(f'----------------------')     
    #print(f'Confusion Matrix\n\n {pd.crosstab(y_train, y_pred)}\n') 
    print(f'----------------------') 
    
    # Coefficients for each feature  
    coef_df = pd.DataFrame(logit.coef_)

    # List comprehension for columns in X_train
    names = [column for column in X_validate_scaled.columns]
    
    # Renaming columns to their names fovalidatef_df
    coef_df.columns = names

    print(f'Classification Report\n {classification_report(y_validate, y_pred)}')
    return coef_df, logit

In [24]:
coef_df, logit = logistic_regression_val(X_validate_scaled, y_validate)

The logistic regression models accuracy is 78.89%

----------------------
----------------------
Classification Report
               precision    recall  f1-score   support

           0       0.80      0.90      0.85      3095
           1       0.76      0.58      0.66      1665

    accuracy                           0.79      4760
   macro avg       0.78      0.74      0.75      4760
weighted avg       0.79      0.79      0.78      4760



In [25]:
import MVP

In [26]:
# MVP model has a max depth of 15 and min of 8 leaf samples
MVP.MVP()

---------------------------- Train -------------------------------
Accuracy of random forest classifier on training set: 0.89
Training Data Matrix
[[4049  593]
 [ 162 2336]]
Training Data Report
              precision    recall  f1-score   support

           0       0.96      0.87      0.91      4642
           1       0.80      0.94      0.86      2498

    accuracy                           0.89      7140
   macro avg       0.88      0.90      0.89      7140
weighted avg       0.90      0.89      0.90      7140

---------------------------- Validate -------------------------------
Accuracy of random forest classifier on validate set: 0.86
Training Data Matrix
[[2586  509]
 [ 148 1517]]
Training Data Report
              precision    recall  f1-score   support

           0       0.95      0.84      0.89      3095
           1       0.75      0.91      0.82      1665

    accuracy                           0.86      4760
   macro avg       0.85      0.87      0.85      4760
weighted

(<function MVP.MVP()>,
                      importance
 epa                    0.855928
 yardsToGo              0.053956
 numberOfPassRushers    0.022476
 quarter                0.016584
 defendersInTheBox      0.015104
 QB_under_pressure      0.013480
 DL                     0.012036
 DB                     0.008220
 four_three             0.002215)