# NFL 2018 Defense Analyzer

# Importing Libraries

In [2]:
# acquire libraries
import pandas as pd
import matplotlib.pyplot as plt

#explore libraries
from scipy import stats
import seaborn as sns
import prep_plays
import prep_season
plt.rc("figure", figsize=(12, 7))
plt.rc("font", size=14)
from sklearn.model_selection import train_test_split

# model libraries
import wrangle_plays_data
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

pd.set_option('display.max_columns', None)

Acquire.py Loaded Successfully
Prep.py Loaded Successfully
Prep_Season.py Loaded Successfully
Wrangle.py Loaded Successfully


# Acquire 

- We acquired the data from kaggle.com as several .csv files; the data itself is provided by nextgenstats.nfl.com

In [None]:
nfl = pd.read_csv('plays.csv')

In [None]:
nfl.head()

In [None]:
nfl.shape

In [None]:
nfl.info()

In [None]:
nfl.describe().T

In [None]:
# Draw one histogram per int64/float64 column to eyeball each distribution.
num_cols = nfl.select_dtypes(include=['int64', 'float64']).columns
for column_name in num_cols:
    plt.hist(nfl[column_name])
    plt.title(column_name)
    plt.show()

**Takeaways:**
- There are some null values listed that will need some investigation
- More plays are being run in the second and fourth quarters
- Fewer plays are being run as the down gets greater
- Yards to go is skewed right (makes sense)
    - Less likely to lose yards than gain
- Most plays begin between home 20 and away 20
    - Hard to pin your opponent inside 20 for kickoff or punt
- Defenders in the box is a normal distribution
- Number of pass rushers is a normal distribution
- Scores are skewed right
- Play result is skewed right slightly
- epa is fairly normal distribution

# Prepare

- Create a function that will acquire the plays.csv
- Keep only the useful columns that can help us determine the success of a defense(whether a pass was completed or not)
    - `playDescription`, `quarter`, `down`, `yardsToGo`, `possessionTeam`, `offenseFormation`, `personnelO`, `defendersInTheBox`, `numberOfPassRushers`, `personnelD`, `typeDropback`, `gameClock`, `absoluteYardlineNumber`, `epa`, `playType`, `passResult`, `playResult`
- Create a new column called `pass_stopped` 
    - Will change completion into 0
    - Will change incomplete and interception into 1
- Filter out data that is not a pass play(no fake punts, fake field goals, etc)
- Create new columns that extract positions from offensive personnel
    - RB, TE, WR
- Create new columns that extract positions from defensive personnel
    - DL, LB, DB
- Rename `typeDropback` to `QB_under_pressure` and change values into normal or scramble
- Rename `passResult` into `pass_stopped`
- Create formations out of personnel on the field

# Explore

In [None]:
train, validate, test = prep_plays.explore_plays_data()

In [None]:
train.T

In [None]:
alpha = .05

## Does the offense formation matter? i.e. (is a certain offensive formation harder to defend?)

- $H_0$: There is no dependence between offensive formation and pass stopped
- $H_a$: There is a dependence between offensive formation and pass stopped

In [None]:
observed = pd.crosstab(train.offenseFormation, train.pass_stopped)

In [None]:
chi2, p, degf, expected = stats.chi2_contingency(observed)

In [None]:
# Report the test outcome against alpha, then display the p-value itself.
print(
    "We reject the null hypothesis"
    if p < alpha
    else "We fail to reject the null hypothesis"
)
p

In [None]:
# Count completed vs. stopped passes for each offensive formation.
# The bar order is pinned explicitly (most frequent formation first) and the
# tick labels are derived from that same list, so a label can never end up
# under another formation's bars.
formation_order = train.offenseFormation.value_counts().index.tolist()
# legend=False suppresses seaborn's own legend; a custom one is added below.
sns.catplot(x="offenseFormation", hue="pass_stopped", kind="count", data=train,
            order=formation_order, height=8, aspect=2, legend=False)
plt.title('Do certain offensive formations have more passes stopped than others?', size = 30)
plt.xlabel('Offensive Formation', size = 16)
plt.ylabel('Count', size = 20)
plt.legend(labels = ('Pass Completed', 'Pass Stopped'), loc='center right', frameon=False, fontsize='x-large')
# Prettify the raw category values for display (e.g. "I_FORM" -> "I Form").
formation_labels = [str(name).replace('_', ' ').title() for name in formation_order]
plt.xticks(list(range(len(formation_order))), formation_labels, size = 20)
plt.show()

**Takeaways:**
- There does not seem to be a certain formation that will have its passes stopped more than others
- After a statistical test, we can safely say that there is no dependence between stopping the play and the formation the offense is lined up in.

In [None]:
train.groupby('offenseFormation').pass_stopped.count()

In [None]:
train[train.pass_stopped ==1].groupby('offenseFormation').pass_stopped.count()

## Are passes stopped dependent on Down?

- $H_0$: There is no dependence between down and pass stopped
- $H_a$: There is a dependence between down and pass stopped

In [None]:
# Chi-square test of independence between down and pass_stopped.
observed = pd.crosstab(index=train['down'], columns=train['pass_stopped'])
chi2, p, degf, expected = stats.chi2_contingency(observed)

print(
    "We reject the null hypothesis"
    if p < alpha
    else "We fail to reject the null hypothesis"
)
p

In [None]:
# Bar height is seaborn's default aggregate of pass_stopped for each down.
down_ax = sns.barplot(x='down', y='pass_stopped', data=train)
down_ax.set(
    ylim=(0, .55),
    xlabel='Down',
    ylabel='Pass Stopped %',
    title="Are Passes Stopped dependent on Down?",
)
plt.show()

**Takeaway:**
- There is a dependence between a pass being stopped and what down it is.
- more passes are stopped on 3rd down with 4th down right behind it


In [None]:
train.groupby('down').pass_stopped.count()

In [None]:
train[train.pass_stopped ==1].groupby('down').pass_stopped.count()

## Are EPA values dramatically different for passes stopped vs. passes completed?

- $H_0$: The EPA value is the same for passes completed and passes stopped
- $H_a$: The EPA value is different for passes completed and passes stopped

In [None]:
# Split train by outcome, then compare EPA between the two groups with an
# independent two-sample t-test.
pass_completed = train.loc[train['pass_stopped'] == 0]
pass_not_completed = train.loc[train['pass_stopped'] == 1]

t, p = stats.ttest_ind(a=pass_completed['epa'], b=pass_not_completed['epa'])

In [None]:
if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
# Compare the EPA distribution of completed (0) vs. stopped (1) passes.
plt.rc("figure", figsize=(10, 6))
# x and y are passed by keyword: seaborn 0.12+ only accepts `data`
# positionally, so the positional form raises a TypeError there.
sns.violinplot(x='pass_stopped', y='epa', data=train)
plt.xlabel('')
plt.xticks([0,1], ['Pass Completed', 'Pass Stopped'])
plt.yticks(size = 24)
plt.ylabel('EPA')
plt.title("Are Passes Stopped dependent on EPA?")
plt.show()

In [None]:
print(f"The EPA mean for passes completed is {pass_completed.epa.mean()}.")
print(f"The EPA minimum for passes completed is {pass_completed.epa.min()}.")
print(f"The EPA max for passes completed is {pass_completed.epa.max()}.")

In [None]:
print(f"The EPA mean for passes stopped is {pass_not_completed.epa.mean()}.")
print(f"The EPA minimum for passes stopped is {pass_not_completed.epa.min()}.")
print(f"The EPA max for passes stopped is {pass_not_completed.epa.max()}.")

**Takeaways:**
- On average the EPA is negative for passes stopped and the EPA is positive for passes completed
- The pass is usually stopped when the EPA is negative but not always.
- If the EPA is above 2.5 then it almost guarantees that the pass will be completed

## Are passes stopped dependent on QB pressure?

- $H_0$: There is no dependence between QB pressure and pass stopped
- $H_a$: There is a dependence between QB pressure and pass stopped

In [None]:
# Chi-square test of independence between QB_under_pressure and pass_stopped.
observed = pd.crosstab(index=train['QB_under_pressure'], columns=train['pass_stopped'])
chi2, p, degf, expected = stats.chi2_contingency(observed)

print(
    "We reject the null hypothesis"
    if p < alpha
    else "We fail to reject the null hypothesis"
)
p

In [None]:
# Bar height is seaborn's default aggregate of pass_stopped per pressure group.
pressure_ax = sns.barplot(x='QB_under_pressure', y='pass_stopped', data=train)
pressure_ax.set(
    ylim=(0, .55),
    xlabel='',
    ylabel='Pass Stopped %',
    title="Are Passes Stopped dependent on Pressure Applied to QB?",
)
# NOTE(review): these labels assume the first bar is the no-pressure group;
# confirm against the category order seaborn actually draws.
plt.xticks([0,1], ['No Pressure', 'Pressure Applied'])
plt.show()

In [None]:
train.groupby('QB_under_pressure').pass_stopped.count()

In [None]:
train[train.pass_stopped ==1].groupby('QB_under_pressure').pass_stopped.count()

## Are passes stopped dependent on how many Defenders are in the Box?

- $H_0$: There is no dependence between defenders in the box and pass stopped
- $H_a$: There is a dependence between defenders in the box and pass stopped

In [None]:
# Chi-square test of independence between defendersInTheBox and pass_stopped.
observed = pd.crosstab(index=train['defendersInTheBox'], columns=train['pass_stopped'])
chi2, p, degf, expected = stats.chi2_contingency(observed)

print(
    "We reject the null hypothesis"
    if p < alpha
    else "We fail to reject the null hypothesis"
)
p

In [None]:
# Share of passes stopped for each defenders-in-the-box count.
# The bar order and tick labels both come from the values actually present in
# train, so the labels stay correct even when the counts are not exactly 1..10.
box_counts = sorted(train.defendersInTheBox.dropna().unique())
sns.barplot(data=train, x='defendersInTheBox', y='pass_stopped', order=box_counts).set(ylim=(0, .55))
plt.xlabel('Defenders in the Box')
plt.ylabel('Pass Stopped %')
# The "g" format renders 6.0 as "6" while leaving integer values unchanged.
plt.xticks(list(range(len(box_counts))), [f"{count:g}" for count in box_counts])
plt.title("Are Passes Stopped dependent on the number of Defenders in the Box?")
plt.show()

In [None]:
train.groupby('defendersInTheBox').pass_stopped.count()

In [None]:
train[train.pass_stopped ==1].groupby('defendersInTheBox').pass_stopped.count()

## Are passes stopped dependent on how many DL?

- $H_0$: There is no dependence between DL and pass stopped
- $H_a$: There is a dependence between DL and pass stopped

In [None]:
# Chi-square test of independence between DL and pass_stopped.
observed = pd.crosstab(index=train['DL'], columns=train['pass_stopped'])
chi2, p, degf, expected = stats.chi2_contingency(observed)

print(
    "We reject the null hypothesis"
    if p < alpha
    else "We fail to reject the null hypothesis"
)
p

In [None]:
# Bar height is seaborn's default aggregate of pass_stopped for each DL count.
dl_ax = sns.barplot(x='DL', y='pass_stopped', data=train)
dl_ax.set(
    ylim=(0, .55),
    xlabel='Number of DL',
    ylabel='Pass Stopped %',
    title="Are Passes Stopped dependent on DL count?",
)
plt.show()

In [None]:
train.groupby('DL').pass_stopped.count()

In [None]:
train[train.pass_stopped ==1].groupby('DL').pass_stopped.count()

## Are passes stopped dependent on how many LB?

- $H_0$: There is no dependence between LB and pass stopped
- $H_a$: There is a dependence between LB and pass stopped

In [None]:
# Chi-square test of independence between LB and pass_stopped.
observed = pd.crosstab(index=train['LB'], columns=train['pass_stopped'])
chi2, p, degf, expected = stats.chi2_contingency(observed)

print(
    "We reject the null hypothesis"
    if p < alpha
    else "We fail to reject the null hypothesis"
)
p

In [None]:
# Bar height is seaborn's default aggregate of pass_stopped for each LB count.
lb_ax = sns.barplot(x='LB', y='pass_stopped', data=train)
lb_ax.set(
    ylim=(0, .55),
    xlabel='Number of LB',
    ylabel='Pass Stopped %',
    title="Are Passes Stopped dependent on LB count?",
)
plt.show()

## Are passes stopped dependent on how many DB?

- $H_0$: There is no dependence between DB and pass stopped
- $H_a$: There is a dependence between DB and pass stopped

In [None]:
# Chi-square test of independence between DB and pass_stopped.
observed = pd.crosstab(index=train['DB'], columns=train['pass_stopped'])
chi2, p, degf, expected = stats.chi2_contingency(observed)

print(
    "We reject the null hypothesis"
    if p < alpha
    else "We fail to reject the null hypothesis"
)
p

In [None]:
# Bar height is seaborn's default aggregate of pass_stopped for each DB count.
db_ax = sns.barplot(x='DB', y='pass_stopped', data=train)
db_ax.set(
    ylim=(0, .55),
    xlabel='Number of DB',
    ylabel='Pass Stopped %',
    title="Are Passes Stopped Dependent on DB count?",
)
plt.show()

In [None]:
train.groupby('DB').pass_stopped.count()

In [None]:
train[train.pass_stopped ==1].groupby('DB').pass_stopped.count()

## Are passes stopped dependent on the defensive formation (Nickel)?

- $H_0$: There is no dependence between Nickel formation and pass stopped
- $H_a$: There is a dependence between Nickel formation and pass stopped

In [None]:
# Chi-square test of independence between the nickel flag and pass_stopped.
observed = pd.crosstab(index=train['nickel'], columns=train['pass_stopped'])
chi2, p, degf, expected = stats.chi2_contingency(observed)

print(
    "We reject the null hypothesis"
    if p < alpha
    else "We fail to reject the null hypothesis"
)
p

In [None]:
# Share of passes stopped with vs. without the nickel formation.
# The displayed text uses "Nickel", matching the column name and section heading.
sns.barplot(data=train,x='nickel', y='pass_stopped').set(ylim=(0, .40))
plt.xlabel('')
plt.ylabel('Pass Stopped %')
plt.title("Is the Nickel Formation better at stopping the pass than other formations?")
plt.xticks([0,1], ['Other Formation', 'Nickel Formation'])
plt.show()

## Are passes stopped dependent on the defensive formation (Dime)?


- $H_0$: There is no dependence between Dime formation and pass stopped
- $H_a$: There is a dependence between Dime formation and pass stopped

In [None]:
# Chi-square test of independence between the dime flag and pass_stopped.
observed = pd.crosstab(index=train['dime'], columns=train['pass_stopped'])
chi2, p, degf, expected = stats.chi2_contingency(observed)

print(
    "We reject the null hypothesis"
    if p < alpha
    else "We fail to reject the null hypothesis"
)
p

In [None]:
# Share of passes stopped with vs. without the dime formation.
dime_ax = sns.barplot(x='dime', y='pass_stopped', data=train)
dime_ax.set(
    ylim=(0, .40),
    xlabel='',
    ylabel='Pass Stopped %',
    title="Is the Dime Formation better at stopping the pass than other formations?",
)
plt.xticks([0,1], ['Other Formation', 'Dime Formation'])
plt.show()

## Are passes stopped dependent on the defensive formation (4-3)?

- $H_0$: There is no dependence between 4-3 formation and pass stopped
- $H_a$: There is a dependence between 4-3 formation and pass stopped

In [None]:
# Chi-square test of independence between the four_three flag and pass_stopped.
observed = pd.crosstab(index=train['four_three'], columns=train['pass_stopped'])
chi2, p, degf, expected = stats.chi2_contingency(observed)

print(
    "We reject the null hypothesis"
    if p < alpha
    else "We fail to reject the null hypothesis"
)
p

In [None]:
# Share of passes stopped with vs. without the 4-3 formation.
four_three_ax = sns.barplot(x='four_three', y='pass_stopped', data=train)
four_three_ax.set(
    ylim=(0, .40),
    xlabel='',
    ylabel='Pass Stopped %',
    title="Is the 4-3 Formation better at stopping the pass than other formations?",
)
plt.xticks([0,1], ['Other Formation', '4-3 Formation'])
plt.show()

## Are passes stopped dependent on the defensive formation (3-4)?

- $H_0$: There is no dependence between 3-4 formation and pass stopped
- $H_a$: There is a dependence between 3-4 formation and pass stopped

In [None]:
# Chi-square test of independence between the three_four flag and pass_stopped.
observed = pd.crosstab(index=train['three_four'], columns=train['pass_stopped'])
chi2, p, degf, expected = stats.chi2_contingency(observed)

print(
    "We reject the null hypothesis"
    if p < alpha
    else "We fail to reject the null hypothesis"
)
p

In [None]:
# Share of passes stopped with vs. without the 3-4 formation.
three_four_ax = sns.barplot(x='three_four', y='pass_stopped', data=train)
three_four_ax.set(
    ylim=(0, .40),
    xlabel='',
    ylabel='Pass Stopped %',
    title="Is the 3-4 Formation better at stopping the pass than other formations?",
)
plt.xticks([0,1], ['Other Formation', '3-4 Formation'])
plt.show()

## What makes a defense good?

In [None]:
# Load the season-level data and keep only rows for defensive positions.
df = pd.read_csv('season.csv')
defensive_positions = ["CB", "OLB", "SS","FS","ILB","DE","LB","MLB","S","DT","DL","DB"]
is_defender = df["position"].isin(defensive_positions)
defensedf = df[is_defender]

In [None]:
# Sum `event` per defensive position, sort highest first, and chart it.
plt.rcParams['figure.figsize']=(13,7)
posdf = defensedf.groupby('position')['event'].sum().reset_index()
posdf = posdf.sort_values(by=['event'], ascending =False)
sns.set_style("darkgrid")
# The bar colors come from the palette argument here; the former standalone
# sns.color_palette('Blues') call only returned a palette that was discarded.
sns.barplot(data=posdf, x='position', y= 'event', palette='mako')
plt.title('Position and Incompletions', fontsize=13)
plt.xlabel('Defensive Position',fontsize=13)
plt.ylabel('Incomplete Passes',fontsize=13)
# Display the aggregated table beneath the chart.
posdf

In [None]:
# One frame per position that is examined individually in the sections below.
cbdf, olbdf, ssdf, fsdf, ilbdf = (
    defensedf[defensedf['position'] == position_code]
    for position_code in ('CB', 'OLB', 'SS', 'FS', 'ILB')
)

### Cornerback

In [None]:
prep_season.get_viz(cbdf)

### Outside Linebacker

In [None]:
prep_season.get_viz(olbdf)

### Strong Safety

In [None]:
prep_season.get_viz(ssdf)

### Free Safety

In [None]:
prep_season.get_viz(fsdf)

### Inside Linebacker

## Defensive Position Takeaways - Top 5 Attributes

In [None]:
prep_season.get_viz(ilbdf)

### CORNERBACK

**Ages:**
-    23, 25, 27, 26, 28

**Colleges:**
-    Ohio State, Florida State, LSU, Alabama, Florida

**Height:**
-    71", 72", 73", 70", 69"

**Weight:**
-    190lbs, 196lbs, 195lbs, 192lbs, 185lbs

### OUTSIDE LINEBACKER

**Ages:**
- 25, 27, 23, 28, 26

**College:**
- Georgia, Florida state, Southern California, Kentucky

**Height:**
- 75", 73", 76", 74", 77"

**Weight:**
- 250lbs, 255lbs, 265lbs, 240lbs, 235lbs

### STRONG SAFETY

**Ages:**
- 27, 24, 26, 30, 25

**College:**
- Ohio State, Boston College, LSU, Georgia, Texas

**Height:**
- 72",71",73",74",70"

**Weight:**
- 215lbs, 210lbs, 202lbs, 195lbs, 212lbs

### FREE SAFETY

**Ages:**
- 27, 25, 26, 22, 24

**College:**
- Utah, Rutgers, Alabama, South Carolina, Ohio State

**Height:**
- 73", 71", 72", 70", 74"

**Weight:**
- 205lbs, 195lbs, 212lbs, 202lbs, 14lbs

### INSIDE LINEBACKER

**Ages:**
- 23, 28, 26, 24, 29

**College:**
- Kentucky, Alabama, Washington, Florida State, Stanford

**Height:**
- 73", 72", 74", 75", 76"

**Weight:**
- 250lbs, 232lbs, 230lbs, 245lbs, 235lbs

## Who are the top defenders?

In [3]:
prep_season.top_defenders()

Dataframe Ready For Use


Unnamed: 0,defender,stopped_passes,total_plays,stopped_pass_perc
6,Marlon Humphrey,42,75,0.56
5,Stephon Gilmore,44,87,0.51
4,Steven Nelson,44,88,0.5
0,James Bradberry,49,99,0.49
7,Denzel Ward,42,85,0.49
1,Kyle Fuller,47,101,0.47
2,Jalen Ramsey,45,98,0.46
3,Eli Apple,45,101,0.45
8,Adoree' Jackson,41,97,0.42
9,Joe Haden,38,92,0.41


# Model

In [4]:
df = prep_plays.prep_plays_data()

In [5]:
X_train, y_train, X_validate, y_validate, X_test, y_test = wrangle_plays_data.train_validate_test(df)

In [6]:
X_train_scaled, X_validate_scaled, X_test_scaled = wrangle_plays_data.min_max_scale(X_train, X_validate, X_test)

## Train

### Gradient Boost

In [7]:
boost_params = {'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}

In [8]:
# Grid-search the learning rates in boost_params for a gradient-boosting
# classifier using 5-fold cross-validation on the scaled training features.
# NOTE(review): the estimator is created without a random_state, while the
# logistic-regression and SVM cells pin 123 — reruns may not reproduce the
# score printed below; confirm whether this model should be seeded too.
ml = GridSearchCV(GradientBoostingClassifier(), boost_params, cv=5)
ml.fit(X_train_scaled, y_train)
# Score on the same data the search was fit on (a train-set figure).
print(ml.score(X_train_scaled, y_train))

0.8733893557422969


In [9]:
y_pred = ml.predict(X_train_scaled)

In [10]:
cm = pd.DataFrame(confusion_matrix(y_train, y_pred))
cm

Unnamed: 0,0,1
0,3953,689
1,215,2283


In [11]:
class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))
class_report

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.948417,0.76817,0.873389,0.858293,0.885355
recall,0.851573,0.913931,0.873389,0.882752,0.873389
f1-score,0.897389,0.834735,0.873389,0.866062,0.875469
support,4642.0,2498.0,0.873389,7140.0,7140.0


### KNN

In [12]:
# Fit a 5-nearest-neighbours classifier on the scaled training features and
# evaluate it on that same training data.
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn.fit(X_train_scaled, y_train)

# Hard class predictions and per-class probabilities for the training rows.
y_pred = knn.predict(X_train_scaled)
y_pred_proba = knn.predict_proba(X_train_scaled)

print('>>>>>>>>>> Accuracy of KNN classifier on TRAIN set: {:.2f}'
     .format(knn.score(X_train_scaled, y_train)))

# confusion_matrix takes (y_true, y_pred): rows are actual classes, columns
# are predicted classes.
cm = pd.DataFrame(confusion_matrix(y_train, y_pred))

report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))
# Display the confusion matrix; `report` is shown in the next cell.
cm

>>>>>>>>>> Accuracy of KNN classifier on TRAIN set: 0.87


Unnamed: 0,0,1
0,4109,533
1,398,2100


In [13]:
report

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.911693,0.797569,0.869608,0.854631,0.871766
recall,0.885179,0.840673,0.869608,0.862926,0.869608
f1-score,0.89824,0.818554,0.869608,0.858397,0.870361
support,4642.0,2498.0,0.869608,7140.0,7140.0


### Logistic Regression

In [14]:
# Fit a logistic regression on the scaled training features and evaluate it on
# that same training data. class_weight weights class 1 (pass stopped) 99x
# more heavily than class 0.
logit = LogisticRegression(C=1, class_weight={0:1, 1:99}, random_state=123, intercept_scaling=1, solver='lbfgs')

# fit the model
logit.fit(X_train_scaled, y_train)

print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

# Hard class predictions and per-class probabilities for the training rows.
y_pred = logit.predict(X_train_scaled)
y_pred_proba = logit.predict_proba(X_train_scaled)

print('>>>>>>>>>> Accuracy of Logistic Regression classifier on TRAIN set: {:.2f}'
     .format(logit.score(X_train_scaled, y_train)))

# confusion_matrix takes (y_true, y_pred): rows are actual classes, columns
# are predicted classes.
cm = pd.DataFrame(confusion_matrix(y_train, y_pred))

report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))
# Display the confusion matrix; `report` is shown in the next cell.
cm

Coefficient: 
 [[-2.44483839e-02 -8.01801840e-01  6.28262443e-01  5.57418310e-01
  -4.17226644e+01  2.91782250e-01  1.29781562e+00]]
Intercept: 
 [24.15648898]
>>>>>>>>>> Accuracy of Logistic Regression classifier on TRAIN set: 0.54


Unnamed: 0,0,1
0,1359,3283
1,5,2493


In [15]:
report

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.996334,0.431614,0.539496,0.713974,0.798761
recall,0.292762,0.997998,0.539496,0.64538,0.539496
f1-score,0.452547,0.602611,0.539496,0.527579,0.505049
support,4642.0,2498.0,0.539496,7140.0,7140.0


### SVM Modeling

In [16]:
# Fit an SVC with probability estimates enabled, then predict on the training data.
# NOTE(review): this model is fit and evaluated on the unscaled X_train, whereas
# the gradient boost, KNN and logistic regression cells use X_train_scaled —
# confirm this is intended.
svm = SVC(probability = True, random_state = 123)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_train)
y_pred_proba = svm.predict_proba(X_train)
# Show the predicted class probabilities for the first ten training rows.
y_pred_proba[0:10]

array([[0.64769658, 0.35230342],
       [0.64777807, 0.35222193],
       [0.64768894, 0.35231106],
       [0.64767822, 0.35232178],
       [0.64786787, 0.35213213],
       [0.64778453, 0.35221547],
       [0.6478598 , 0.3521402 ],
       [0.64776627, 0.35223373],
       [0.64793948, 0.35206052],
       [0.6478709 , 0.3521291 ]])

In [17]:
print('Accuracy of SVM classifier on training set: {:.2f}'
      .format(svm.score(X_train, y_train)))

Accuracy of SVM classifier on training set: 0.65


### Random Forest

## Validate

### Gradient Boost

In [None]:
print(ml.score(X_validate_scaled, y_validate))

In [None]:
y_pred_val = ml.predict(X_validate_scaled)

In [None]:
# Confusion matrix for the gradient-boost predictions on the validate split.
# Arguments are (y_true, y_pred), so rows are actual classes and columns are
# predicted classes — the same orientation as the train-set matrices.
cm = pd.DataFrame(confusion_matrix(y_validate, y_pred_val))
cm

In [None]:
class_report = pd.DataFrame(classification_report(y_validate, y_pred_val, output_dict=True))
class_report

### KNN

In [None]:
# Evaluate the already-fitted KNN model on the scaled validate split.
y_pred = knn.predict(X_validate_scaled)
y_pred_proba = knn.predict_proba(X_validate_scaled)

validate_accuracy = knn.score(X_validate_scaled, y_validate)
print(f'>>>>>>>>>> Accuracy of KNN classifier on VALIDATE set: {validate_accuracy:.2f}')

# Build both evaluation tables; only the confusion matrix is displayed here.
cm = pd.DataFrame(confusion_matrix(y_validate, y_pred))
report = pd.DataFrame(classification_report(y_validate, y_pred, output_dict=True))
cm

In [None]:
report

### SVM

In [None]:
# Accuracy of the fitted SVM on the validate split. The message names the
# validate set because X_validate / y_validate are what is being scored.
print('>>>>>>>>>> Accuracy of SVM classifier on VALIDATE set: {:.2f}'
     .format(svm.score(X_validate, y_validate)))

### Random Forest

## Test

In [None]:
# TODO: evaluate the selected model on the held-out test split
# (X_test_scaled, y_test). This cell previously held only a stray, undefined
# name that raised NameError when run.

# Conclusions

- Our Random Forest Model was 93% accurate at predicting a pass being stopped.
- EPA & closest_dist turned out to be significant features in our model.
- Success in defending the pass truly depends on the defenders' ability to prevent separation from receiver and their reaction time.
- When pressure is applied to the quarterback, the completion percentage significantly decreases.
- Dime formation (6 defensive backs) had the best success in stopping the pass.

# Next Steps

- Look into creating new features in our model 
    - Incorporating the best defensive defenders against the pass
- Answer new questions 
    - Does the pass distance increase or decrease the ability for a defender to stop a pass? 
    - Does a shorter or longer play influence a pass incompletion?
- Work out mislabels and small bugs for the closest_dist feature
- Use a similar algorithm to find the distance of all cornerbacks to their defensive assignments, i.e. WR, RB, TE, etc.
- Further analyze the components of EPA to understand their influence on the model
- Explore trick plays to see if the same features carry over from the traditional offensive setup