# NFL 2018 Defense Analyzer

# Importing Libraries

In [2]:
# acquire libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#explore libraries
from scipy import stats
import seaborn as sns
import prep_plays
import prep_season
plt.rc("figure", figsize=(12, 7))
plt.rc("font", size=14)
from sklearn.model_selection import train_test_split

# model libraries
import wrangle_plays_data
import prep_nfl
import wrangle_nfl
import MVP
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

pd.set_option('display.max_columns', None)

# Acquire 

- We acquired the data from kaggle.com as several .csv's but the data itself is provided by nextgenstats.nfl.com

In [None]:
nfl = pd.read_csv('plays.csv')

In [None]:
nfl.head()

In [None]:
nfl.shape

In [None]:
nfl.info()

In [None]:
nfl.describe().T

In [None]:
num_cols = nfl.columns[[(nfl[col].dtype == 'int64') | (nfl[col].dtype == 'float64') for col in nfl.columns]]
for col in num_cols:
    plt.hist(nfl[col])
    plt.title(col)
    plt.show()

**Takeaways:**
- There are some null values listed that will need some investigation
- More plays are being ran in the second and fourth quarter
- Less plays are being ran as the down gets greater
- Yards to go is skewed right(makes sense)
    - Less likely to lose yards than gain
- Most plays begin between home 20 and away 20
    - Hard to pin your opponent inside 20 for kickoff or punt
- Defenders in the box is a normal distribution
- Number of pass  rushers is a normal distribution
- Scores are skewed right
- Play result is skewed right slightly
- epa is fairly normal distribution

# Prepare

- Create a function that will acquire the plays.csv
- Keep only the useful columns that can help us determine the success of a defense(whether a pass was completed or not)
    - `playDescription`, `quarter`, `down`, `yardsToGo`, `possessionTeam`, `offenseFormation`, `personnelO`, `defendersInTheBox`, `numberOfPassRushers`, `personnelD`, `typeDropback`, `gameClock`, `absoluteYardlineNumber`, `epa`, `playType`, `passResult`, `playResult`
- Create a new column called `pass_stopped` 
    - Will change completion into 0
    - Will change incomplete and interception into 1
- Filter out data that is not a pass play(no fake punts, fake field goals, etc)
- Create new columns that extract positions from offensive personnel
    - RB, TE, WR
- Create new columns that extract positions from defensive personnel
    - DL, LB, DB
- Rename `typeDropback` to `QB_under_pressure` and change values into normal or scramble
- Rename `passResult` into `pass_stopped`
- Create formations out of personnel on the field

# Explore

In [None]:
train, validate, test = prep_plays.explore_plays_data()

In [None]:
train.T

In [None]:
alpha = .05

## Does the offense formation matter? i.e. (is a certain offensive formation harder to defend?)

- $H_0$: There is no dependence between offensive formation and pass stopped
- $H_a$: There is a dependence between offensive formation and pass stopped

In [None]:
observed = pd.crosstab(train.offenseFormation, train.pass_stopped)

In [None]:
chi2, p, degf, expected = stats.chi2_contingency(observed)

In [None]:
if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
sns.catplot(x="offenseFormation", hue="pass_stopped", kind="count", data=train, height=8, aspect=2)._legend.remove()
plt.title('Do certain offensive formations have more more passes stopped than others?', size = 30)
plt.xlabel('Offensive Formation', size = 16)
plt.ylabel('Count', size = 20)
plt.legend(labels = ('Pass Completed', 'Pass Stopped'), loc='center right', frameon=False, fontsize='x-large')
plt.xticks([0, 1, 2, 3, 4, 5, 6], ['Shotgun', 'Empty', 'Singleback', 'I Formation', 'Pistol', 'Jumbo', 'Wildcat'], size = 20)
plt.show()

**Takeaways:**
- There does not seem to be a certain formation that will have there pass stopped more than others
- After a statistical test, we can safely say that there is not dependence on stopping the play and the formation the offense is lined up in.

In [None]:
train.groupby('offenseFormation').pass_stopped.count()

In [None]:
train[train.pass_stopped ==1].groupby('offenseFormation').pass_stopped.count()

## Are passes stoped dependent on Down?

- $H_0$: There is no dependence between down and pass stopped
- $H_a$: There is a dependence between down and pass stopped

In [None]:
observed = pd.crosstab(train.down, train.pass_stopped)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
sns.barplot(data=train,x='down', y='pass_stopped').set(ylim=(0, .55))
plt.xlabel('Down')
plt.ylabel('Pass Stopped %')
plt.title("Are Passes Stopped dependent on Down?")
plt.show()

**Takeaway:**
- There is a dependence between a pass being stopped and what down it is.
- more passes are stopped on 3rd down with 4th down right behind it


In [None]:
train.groupby('down').pass_stopped.count()

In [None]:
train[train.pass_stopped ==1].groupby('down').pass_stopped.count()

## Are EPA values dramatically different for passes stopped vs. passes completed?

- $H_0$: The EPA value is the same for passes completed and passes stopped
- $H_a$: The EPA value is different for passes completed and passes stopped

In [None]:
pass_completed = train[train.pass_stopped == 0]
pass_not_completed = train[train.pass_stopped == 1]

t, p = stats.ttest_ind(pass_completed.epa, pass_not_completed.epa)

In [None]:
if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
plt.rc("figure", figsize=(10, 6))
sns.violinplot(train.pass_stopped, train.epa)
plt.xlabel('')
plt.xticks([0,1], ['Pass Completed', 'Pass Stopped'])
plt.yticks(size = 24)
plt.ylabel('EPA')
plt.title("Are Passes Stopped dependent on EPA?")
plt.show()

In [None]:
print(f"The EPA mean for passes completed is {pass_completed.epa.mean()}.")
print(f"The EPA minimum for passes completed is {pass_completed.epa.min()}.")
print(f"The EPA max for passes completed is {pass_completed.epa.max()}.")

In [None]:
print(f"The EPA mean for passes stopped is {pass_not_completed.epa.mean()}.")
print(f"The EPA minimum for passes stopped is {pass_not_completed.epa.min()}.")
print(f"The EPA max for passes stopped is {pass_not_completed.epa.max()}.")

**Takeaways:**
- On average the EPA is negative for passes stopped and the EPA is positive for passes completed
- The pass is usually stopped when the EPA is negative but not always.
- If the EPA is above 2.5 then it almost guarantees that the pass will be completed

## Are passes stopped dependent on QB pressure?

- $H_0$: There is no dependence between QB pressure and pass stopped
- $H_a$: There is a dependence between QB pressure and pass stopped

In [None]:
observed = pd.crosstab(train.QB_under_pressure, train.pass_stopped)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
sns.barplot(data=train,x='QB_under_pressure', y='pass_stopped').set(ylim=(0, .55))
plt.xlabel('')
plt.ylabel('Pass Stopped %')
plt.xticks([0,1], ['No Pressure', 'Pressure Applied'])
plt.title("Are Passes Stopped dependent on Pressure Applied to QB?")
plt.show()

In [None]:
train.groupby('QB_under_pressure').pass_stopped.count()

In [None]:
train[train.pass_stopped ==1].groupby('QB_under_pressure').pass_stopped.count()

## Are passes stopped dependent on how many Defenders are in the Box?

- $H_0$: There is no dependence between defenders in the box and pass stopped
- $H_a$: There is a dependence between defenders in the box and pass stopped

In [None]:
observed = pd.crosstab(train.defendersInTheBox, train.pass_stopped)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
sns.barplot(data=train,x='defendersInTheBox', y='pass_stopped').set(ylim=(0, .55))
plt.xlabel('Defenders in the Box')
plt.ylabel('Pass Stopped %')
plt.xticks([0,1,2,3,4,5,6,7,8,9], [1,2,3,4,5,6,7,8,9,10])
plt.title("Are Passes Stopped dependent on the number of Defenders in the Box?")
plt.show()

In [None]:
train.groupby('defendersInTheBox').pass_stopped.count()

In [None]:
train[train.pass_stopped ==1].groupby('defendersInTheBox').pass_stopped.count()

## Are passes stopped dependent on how many DL?

- $H_0$: There is no dependence between DL and pass stopped
- $H_a$: There is a dependence between DL and pass stopped

In [None]:
observed = pd.crosstab(train.DL, train.pass_stopped)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
sns.barplot(data=train,x='DL', y='pass_stopped').set(ylim=(0, .55))
plt.xlabel('Number of DL')
plt.ylabel('Pass Stopped %')
# plt.xticks([0,1,2,3,4,5,6,7,8,9], [1,2,3,4,5,6,7,8,9,10])
plt.title("Are Passes Stopped dependent on DL count?")
plt.show()

In [None]:
train.groupby('DL').pass_stopped.count()

In [None]:
train[train.pass_stopped ==1].groupby('DL').pass_stopped.count()

## Are passes stopped dependent on how many LB?

- $H_0$: There is no dependence between LB and pass stopped
- $H_a$: There is a dependence between LB and pass stopped

In [None]:
observed = pd.crosstab(train.LB, train.pass_stopped)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
sns.barplot(data=train,x='LB', y='pass_stopped').set(ylim=(0, .55))
plt.xlabel('Number of LB')
plt.ylabel('Pass Stopped %')
# plt.xticks([0,1,2,3,4,5,6,7,8,9], [1,2,3,4,5,6,7,8,9,10])
plt.title("Are Passes Stopped dependent on LB count?")
plt.show()

## Are passes stopped dependent on how many DB?

- $H_0$: There is no dependence between DB and pass stopped
- $H_a$: There is a dependence between DB and pass stopped

In [None]:
observed = pd.crosstab(train.DB, train.pass_stopped)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
sns.barplot(data=train,x='DB', y='pass_stopped').set(ylim=(0, .55))
plt.xlabel('Number of DB')
plt.ylabel('Pass Stopped %')
# plt.xticks([0,1,2,3,4,5,6,7,8,9], [1,2,3,4,5,6,7,8,9,10])
plt.title("Are Passes Stopped Dependent on DB count?")
plt.show()

In [None]:
train.groupby('DB').pass_stopped.count()

In [None]:
train[train.pass_stopped ==1].groupby('DB').pass_stopped.count()

## Are passes stopped dependent on how defensive formation(Nickel)?

- $H_0$: There is no dependence between Nickel formation and pass stopped
- $H_a$: There is a dependence between Nickel formation and pass stopped

In [None]:
observed = pd.crosstab(train.nickel, train.pass_stopped)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
sns.barplot(data=train,x='nickel', y='pass_stopped').set(ylim=(0, .40))
plt.xlabel('')
plt.ylabel('Pass Stopped %')
plt.title("Is the Nickle Formation better at stopping the pass than other formations?")
plt.xticks([0,1], ['Other Formation', 'Nickle Formation'])
plt.show()

## Are passes stopped dependent on how defensive formation(Dime)?


- $H_0$: There is no dependence between Dime formation and pass stopped
- $H_a$: There is a dependence between Dime formation and pass stopped

In [None]:
observed = pd.crosstab(train.dime, train.pass_stopped)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
sns.barplot(data=train,x='dime', y='pass_stopped').set(ylim=(0, .40))
plt.xlabel('')
plt.ylabel('Pass Stopped %')
plt.title("Is the Dime Formation better at stopping the pass than other formations?")
plt.xticks([0,1], ['Other Formation', 'Dime Formation'])
plt.show()

## Are passes stopped dependent on how defensive formation(4-3)?

- $H_0$: There is no dependence between 4-3 formation and pass stopped
- $H_a$: There is a dependence between 4-3 formation and pass stopped

In [None]:
observed = pd.crosstab(train.four_three, train.pass_stopped)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
sns.barplot(data=train,x='four_three', y='pass_stopped').set(ylim=(0, .40))
plt.xlabel('')
plt.ylabel('Pass Stopped %')
plt.title("Is the 4-3 Formation better at stopping the pass than other formations?")
plt.xticks([0,1], ['Other Formation', '4-3 Formation'])
plt.show()

## Are passes stopped dependent on how defensive formation(3-4)?

- $H_0$: There is no dependence between 3-4 formation and pass stopped
- $H_a$: There is a dependence between 3-4 formation and pass stopped

In [None]:
observed = pd.crosstab(train.three_four, train.pass_stopped)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
sns.barplot(data=train,x='three_four', y='pass_stopped').set(ylim=(0, .40))
plt.xlabel('')
plt.ylabel('Pass Stopped %')
plt.title("Is the 3-4 Formation better at stopping the pass than other formations?")
plt.xticks([0,1], ['Other Formation', '3-4 Formation'])
plt.show()

## What makes a defense good?

In [None]:
df = pd.read_csv('season.csv')
defensedf = df[df["position"].isin(["CB", "OLB", "SS","FS","ILB","DE","LB","MLB","S","DT","DL","DB"])]

In [None]:
plt.rcParams['figure.figsize']=(13,7)
posdf = defensedf.groupby('position')['event'].sum().reset_index()
#pos20 = posdf.sort_values(by='event', ascending=False)
posdf = posdf.sort_values(by=['event'], ascending =False)
#plt.grid()
sns.set_style("darkgrid")
sns.barplot(data=posdf, x='position', y= 'event', palette='mako')
sns.color_palette('Blues')
plt.title('Position and Incompletions', fontsize=13)
plt.xlabel('Defensive Position',fontsize=13)
plt.ylabel('Incomplete Passes',fontsize=13)
posdf

In [None]:
cbdf = defensedf[defensedf['position'] == 'CB']
olbdf = defensedf[defensedf['position'] == 'OLB']
ssdf = defensedf[defensedf['position'] == 'SS']
fsdf = defensedf[defensedf['position'] == 'FS']
ilbdf = defensedf[defensedf['position'] == 'ILB']

### Cornerback

In [None]:
prep_season.get_viz(cbdf)

### Outside Linebacker

In [None]:
prep_season.get_viz(olbdf)

### Strong Safety

In [None]:
prep_season.get_viz(ssdf)

### Free Safety

In [None]:
prep_season.get_viz(fsdf)

### Inside Linebacker

## Defensive Position Takeaways - Top 5 Attributes

In [None]:
prep_season.get_viz(ilbdf)

### CORNERBACK

**Ages:**
-    23, 25, 27, 26, 28

**Colleges:**
-    Ohio state, Florida state, lsu, Alabama, Florida

**Height:**
-    71", 72", 73", 70", 69"

**Weight:**
-    190lbs, 196lbs, 195lbs, 192lbs, 185lbs

### OUTSIDE LINEBACKER

**Ages:**
- 25, 27, 23, 28, 26

**College:**
- Georgia, Florida state, Southern California, Kentucky

**Height:**
- 75", 73", 76", 74", 77"

**Weight:**
- 250lbs, 255lbs, 265lbs, 240lbs, 235lbs

### STRONG SAFETY

**Ages:**
- 27, 24, 26, 30, 25

**College:**
- Ohio state, boston college, lsu, Georgia, Texas 

**Height:**
- 72",71",73",74",70"

**Weight:**
- 215lbs, 210lbs, 202lbs, 195lbs, 212lbs

### FREE SAFETY

**Ages:**
- 27, 25, 26, 22, 24

**College:**
- Utah, Rutgers, Alabama, South Carolina, ohio state

**Height:**
- 73", 71", 72", 70", 74"

**Weight:**
- 205lbs, 195lbs, 212lbs, 202lbs, 14lbs

### INSIDE LINEBACKER

**Ages:**
- 23, 28, 26, 24, 29

**College:**
- Kentucky, Alabama, Washington, Florida state,stanford

**Height:**
- 73", 72", 74", 75", 76"

**Weight:**
- 250lbs, 232lbs, 230lbs, 245lbs, 235lbs

## Who are the top defenders?

In [None]:
prep_season.top_defenders()

# Model

In [3]:
 df = prep_nfl.get_nfl_data()

Dataframe Ready For Use


In [4]:
X_train, y_train, X_validate, y_validate, X_test, y_test = wrangle_nfl.train_validate_test(df)
X_train_scaled, X_validate_scaled, X_test_scaled = wrangle_nfl.min_max_scale(X_train, X_validate, X_test)
X_train_scaled, X_validate_scaled, X_test_scaled = wrangle_nfl.add_clusters(X_train_scaled,
                                                                            X_validate_scaled, X_test_scaled,
                                                                            X_train,X_validate, X_test)

## Train

### Gradient Boost

In [5]:
# after running through several learning rates 
# from .0001 up to 100, 1 is the best parameter
boost_params = {'learning_rate': [1]}

In [6]:
#setting parameters and fitting model
search = GridSearchCV(GradientBoostingClassifier(), boost_params, cv=5)
search.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [1]})

In [7]:
#predicting target variable
y_pred = search.predict(X_train_scaled)

In [8]:
print('>>>>>>>>>> Accuracy of Gradient Boost on TRAIN set: {:.2f}'
     .format(search.score(X_train_scaled, y_train)))

>>>>>>>>>> Accuracy of Gradient Boost on TRAIN set: 0.96


In [9]:
#confusion matrix
cm = pd.DataFrame(confusion_matrix(y_train, y_pred))
cm

Unnamed: 0,0,1
0,58680,1944
1,1530,32270


In [10]:
#classification report
class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))
class_report

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.974589,0.943181,0.963209,0.958885,0.963346
recall,0.967933,0.954734,0.963209,0.961334,0.963209
f1-score,0.97125,0.948922,0.963209,0.960086,0.963257
support,60624.0,33800.0,0.963209,94424.0,94424.0


### KNN

In [11]:
#setting parameters and fitting model
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn.fit(X_train_scaled, y_train)

KNeighborsClassifier()

In [12]:
#predicting target variable
y_pred = knn.predict(X_train_scaled)

In [13]:
print('>>>>>>>>>> Accuracy of KNN classifier on TRAIN set: {:.2f}'
     .format(knn.score(X_train_scaled, y_train)))

>>>>>>>>>> Accuracy of KNN classifier on TRAIN set: 0.82


In [14]:
#confusion matrix
cm = pd.DataFrame(confusion_matrix(y_train, y_pred))
cm

Unnamed: 0,0,1
0,55828,4796
1,11778,22022


In [15]:
#classification report
report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))
report

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.825785,0.821165,0.824473,0.823475,0.824131
recall,0.920889,0.651538,0.824473,0.786214,0.824473
f1-score,0.870748,0.726583,0.824473,0.798665,0.819143
support,60624.0,33800.0,0.824473,94424.0,94424.0


### Logistic Regression

In [16]:
#setting parameters and fitting model
logit = LogisticRegression(C=1, class_weight={0:1, 1:99}, random_state=123, intercept_scaling=1, solver='lbfgs')
logit.fit(X_train_scaled, y_train)

LogisticRegression(C=1, class_weight={0: 1, 1: 99}, random_state=123)

In [17]:
#predicting target variable
y_pred = logit.predict(X_train_scaled)

print('>>>>>>>>>> Accuracy of Logistic Regression classifier on TRAIN set: {:.2f}'
     .format(logit.score(X_train_scaled, y_train)))

>>>>>>>>>> Accuracy of Logistic Regression classifier on TRAIN set: 0.59


In [18]:
#confusion matrix
cm = pd.DataFrame(confusion_matrix(y_train, y_pred))
cm

Unnamed: 0,0,1
0,22436,38188
1,70,33730


In [19]:
#classification report
report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))
report

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.99689,0.469006,0.594828,0.732948,0.807929
recall,0.370084,0.997929,0.594828,0.684007,0.594828
f1-score,0.539781,0.638113,0.594828,0.588947,0.57498
support,60624.0,33800.0,0.594828,94424.0,94424.0


### Random Forest

In [20]:
#setting parameters and fitting model
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=8,
                            n_estimators=100,
                            max_depth=15, 
                            random_state=123)

rf.fit(X_train_scaled, y_train)

RandomForestClassifier(max_depth=15, min_samples_leaf=8, random_state=123)

In [21]:
#predicting target variable
y_pred = rf.predict(X_train_scaled)

In [22]:
print('>>>>>>>>>> Accuracy of Random Forest classifier on TRAIN set: {:.2f}'
     .format(rf.score(X_train_scaled, y_train)))

>>>>>>>>>> Accuracy of Random Forest classifier on TRAIN set: 0.94


In [23]:
#confusion matrix
cm = pd.DataFrame(confusion_matrix(y_train, y_pred))
cm

Unnamed: 0,0,1
0,57824,2800
1,2486,31314


In [24]:
#classification report
class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))
class_report

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.95878,0.917922,0.944018,0.938351,0.944154
recall,0.953814,0.92645,0.944018,0.940132,0.944018
f1-score,0.95629,0.922166,0.944018,0.939228,0.944075
support,60624.0,33800.0,0.944018,94424.0,94424.0


## Validate

### Gradient Boost

In [25]:
#predicting target variable
y_pred_val = search.predict(X_validate_scaled)

In [26]:
print('>>>>>>>>>> Accuracy of Gradient Boost on VALIDATE set: {:.2f}'
     .format(search.score(X_validate_scaled, y_validate)))

>>>>>>>>>> Accuracy of Gradient Boost on VALIDATE set: 0.96


In [27]:
#confusion matrix
cm = pd.DataFrame(confusion_matrix(y_pred_val, y_validate))
cm

Unnamed: 0,0,1
0,38862,1215
1,1554,21319


In [28]:
#classification report
class_report = pd.DataFrame(classification_report(y_validate, y_pred_val, output_dict=True))
class_report

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.969683,0.93206,0.956013,0.950871,0.956215
recall,0.96155,0.946081,0.956013,0.953816,0.956013
f1-score,0.965599,0.939018,0.956013,0.952309,0.956084
support,40416.0,22534.0,0.956013,62950.0,62950.0


### KNN

In [29]:
#predicting target variable
y_pred = knn.predict(X_validate_scaled)

In [30]:
print('>>>>>>>>>> Accuracy of KNN classifier on VALIDATE set: {:.2f}'
      .format(knn.score(X_validate_scaled, y_validate)))

>>>>>>>>>> Accuracy of KNN classifier on VALIDATE set: 0.72


In [31]:
#confusion matrix
cm = pd.DataFrame(confusion_matrix(y_validate, y_pred))
cm

Unnamed: 0,0,1
0,34569,5847
1,11667,10867


In [32]:
#classifiation report
report = pd.DataFrame(classification_report(y_validate, y_pred, output_dict=True))
report

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.747664,0.650174,0.721779,0.698919,0.712766
recall,0.85533,0.482249,0.721779,0.668789,0.721779
f1-score,0.797881,0.553761,0.721779,0.675821,0.710494
support,40416.0,22534.0,0.721779,62950.0,62950.0


### Random Forest

In [33]:
#predicting target variable
y_pred = rf.predict(X_validate_scaled)

In [34]:
print('>>>>>>>>>> Accuracy of Random Forest on VALIDATE set: {:.2f}'
      .format(rf.score(X_validate_scaled, y_validate)))

>>>>>>>>>> Accuracy of Random Forest on VALIDATE set: 0.92


In [35]:
cm = pd.DataFrame(confusion_matrix(y_validate, y_pred))
cm

Unnamed: 0,0,1
0,37783,2633
1,2400,20134


In [36]:
report = pd.DataFrame(classification_report(y_validate, y_pred, output_dict=True))

## Test

## Gradient Boost

In [37]:
#predicting target variable
y_pred_val = search.predict(X_test_scaled)

In [38]:
print('>>>>>>>>>> Accuracy of Gradient Boost on TEST set: {:.2f}'
      .format(search.score(X_test_scaled, y_test)))

>>>>>>>>>> Accuracy of Gradient Boost on TEST set: 0.96


In [39]:
#confusion matrix
cm = pd.DataFrame(confusion_matrix(y_pred_val, y_test))
cm

Unnamed: 0,0,1
0,41659,1270
1,1645,22873


In [40]:
#classification report
class_report = pd.DataFrame(classification_report(y_test, y_pred_val, output_dict=True))
class_report

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.970416,0.932906,0.956781,0.951661,0.956989
recall,0.962013,0.947397,0.956781,0.954705,0.956781
f1-score,0.966196,0.940096,0.956781,0.953146,0.956853
support,43304.0,24143.0,0.956781,67447.0,67447.0


## Top Features for Gradient Boost

In [41]:
#obtaining names of features
top_feature = pd.DataFrame(X_train_scaled.columns)
#adding importance measure of values
top_feature['values'] = search.best_estimator_.feature_importances_
#finding top 10 features
top_feature.sort_values('values', ascending = False).head(10)

Unnamed: 0,0,values
23,epa,0.740295
41,closest_dist,0.159671
16,down,0.02843
42,closest_x,0.010697
43,closest_y,0.007702
17,yardsToGo,0.006566
11,time_since_last_x,0.006161
0,x,0.005241
22,absoluteYardlineNumber,0.0051
7,playDirection,0.004058


# Conclusions

- Our Gradient Boost Model was 96% accurate at predicting a pass being stopped.
- EPA & closest_dist turned out to be significant features in our model.
    - EPA was provided by Kaggle
    - closest_dist was a feature engineered
- Success in defending the pass truly depends on the defenders' ability to prevent separation from receiver and their reaction time.
- When pressure is applied to the quarter back, the completion percentage significantly decreases.
- Dime formation (6 defensive backs) had the best success in stopping the pass.

# Next Steps

- work out mislabels and small bugs for closest_dist feature
- use similar algorithm to find the distance of all cornerbacks to their defensive assignments i.e. WR, RB, TE, etc
- further analyze the components of EPA to understand their influence on the model
- explore trick plays to see if the same features carry over from the traditional offensive setup