In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import prep_plays
import wrangle_plays_data
from scipy import stats
pd.set_option('display.max_columns', None)

In [None]:
# Load the plays data through the project's prep_plays helper; `df` is the frame every later cell reads.
df = prep_plays.prep_plays_data()

In [None]:
# (rows, columns) of the loaded frame
df.shape

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Preview the first rows of the frame
df.head()

In [None]:
# How often each pass_stopped value occurs
df.pass_stopped.value_counts()

In [None]:
# Column names available for exploration
df.columns

In [None]:
# Show the play descriptions without truncation. `None` is the supported
# "no limit" value; the old `-1` sentinel is rejected by current pandas.
pd.set_option('display.max_colwidth', None)
df.playDescription.head()

In [None]:
# Split `df` with the project's wrangle helper. Judging by the names it returns
# feature frames (X_*) and targets (y_*) for train/validate/test; confirm against wrangle_plays_data.
X_train, y_train, X_validate, y_validate, X_test, y_test = wrangle_plays_data.train_validate_test(df)

In [None]:
# Preview the training features
X_train.head()

In [None]:
# Scale the three feature sets with the project's min_max_scale helper
# (presumably min-max scaling fitted on train; verify in wrangle_plays_data).
X_train_scaled, X_validate_scaled, X_test_scaled = wrangle_plays_data.min_max_scale(X_train, X_validate, X_test)

In [None]:
# Preview the scaled training features instead of dumping the whole frame
X_train_scaled.head()

In [None]:
# How often each QB_under_pressure value occurs in the training features
X_train.QB_under_pressure.value_counts()

In [None]:
# Correlate only numeric/boolean columns. Recent pandas no longer drops
# non-numeric columns silently, so calling corr() on the full frame raises
# if any text column is present (e.g. playDescription, presumably text).
corr = df.select_dtypes(include=['number', 'bool']).corr()
# Hide the upper triangle: the matrix is symmetric, so it only repeats the lower half
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(500, 20, as_cmap=True)
# Draw the heatmap on the axes created above, with the mask and a square aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
# Contingency table: rows are pass_stopped values, columns are QB_under_pressure values
crosstab = pd.crosstab(df.pass_stopped, df.QB_under_pressure)
# Annotated heatmap of the raw counts
sns.heatmap(crosstab, annot= True, cmap= 'Reds', fmt= 'd')

plt.title('Passes by QB Under Pressure')
# Relabel each axis at its OWN tick positions; the x labels were previously
# placed at the y-axis locations, which only lines up for a square table.
# Label order assumes both variables are coded 0 then 1 (TODO confirm).
locs, labels = plt.yticks()
plt.yticks(locs, ('Not Stopped', 'Stopped'))
plt.xticks(plt.xticks()[0], ('Not Under Pressure', 'Under Pressure'))
plt.ylabel('')
plt.xlabel('')

In [None]:
# Fraction of non-null pass_stopped values equal to 1, rounded to two decimals
n_stopped = df.pass_stopped.eq(1).sum()
n_recorded = df.pass_stopped.count()
incompletion_rate = (n_stopped / n_recorded).round(2)
incompletion_rate

In [None]:
# Columns to compare against the stopped-pass rate
features = ['QB_under_pressure','dime', 'four_three','three_four', 'nickel']

# One bar chart per feature, side by side on a shared y-axis.
# The dashed grey line marks the overall rate stored in incompletion_rate.
_, ax = plt.subplots(nrows=1, ncols=len(features), figsize=(20, 4), sharey=True)
for i, feature in enumerate(features):
    # x and y are passed by keyword: current seaborn rejects them as positional arguments
    sns.barplot(x=feature, y='pass_stopped', data=df, ax=ax[i], alpha=.8)
    ax[i].set_xlabel('')
    ax[i].set_ylabel('Incomplete Rate')
    ax[i].set_title(feature)
    ax[i].axhline(incompletion_rate, ls='--', color='grey')
print('Comparing If Schemes And QB Pressure Impact Incompletion Rate')

## Conclusion
- Based on the data, dime and nickel defenses come closer to a 50/50 chance of stopping the pass than the 3-4 and 4-3 defensive schemes do

## Hypothesis 1

H0: The average `nickel` value among stopped passes is equal to the average `nickel` value across all plays

Ha: The average `nickel` value among stopped passes is different from the average `nickel` value across all plays

In [None]:
# One-sample t-test: mean of `nickel` among stopped passes vs. its mean over all plays
alpha = 0.05
stopped_pass = df.loc[df.pass_stopped == 1]
t, p = stats.ttest_1samp(a=stopped_pass.nickel, popmean=df.nickel.mean())

print('t = {:.2f}'.format(t))
print('p = {:.90f}'.format(p))
print('Our p-value is less than our alpha: {}'.format(p < alpha))

## Hypothesis 2

H0: The average `dime` value among stopped passes is equal to the average `dime` value across all plays

Ha: The average `dime` value among stopped passes is different from the average `dime` value across all plays

In [None]:
# One-sample t-test: mean of `dime` among stopped passes vs. its mean over all plays
alpha = 0.05
stopped_pass = df.loc[df.pass_stopped == 1]
t, p = stats.ttest_1samp(a=stopped_pass.dime, popmean=df.dime.mean())

print('t = {:.2f}'.format(t))
print('p = {:.90f}'.format(p))
print('Our p-value is less than our alpha: {}'.format(p < alpha))

## Hypothesis 3

H0: The average `three_four` (3-4 defense) value among stopped passes is equal to its average across all plays

Ha: The average `three_four` (3-4 defense) value among stopped passes is different from its average across all plays

In [None]:
# One-sample t-test: mean of `three_four` among stopped passes vs. its mean over all plays
alpha = 0.05
stopped_pass = df.loc[df.pass_stopped == 1]
t, p = stats.ttest_1samp(a=stopped_pass.three_four, popmean=df.three_four.mean())

print('t = {:.2f}'.format(t))
print('p = {:.90f}'.format(p))
print('Our p-value is less than our alpha: {}'.format(p < alpha))

## Hypothesis 4

H0: The average `four_three` (4-3 defense) value among stopped passes is equal to its average across all plays

Ha: The average `four_three` (4-3 defense) value among stopped passes is different from its average across all plays

In [None]:
# One-sample t-test: mean of `four_three` among stopped passes vs. its mean over all plays
alpha = 0.05
stopped_pass = df.loc[df.pass_stopped == 1]
t, p = stats.ttest_1samp(a=stopped_pass.four_three, popmean=df.four_three.mean())

print('t = {:.2f}'.format(t))
print('p = {:.90f}'.format(p))
print('Our p-value is less than our alpha: {}'.format(p < alpha))

## Hypothesis 5

H0: A defense stopping a pass is independent of QB pressure

Ha: A defense stopping a pass is not independent of QB pressure

In [None]:
# Chi-squared test of independence on the pass_stopped x QB_under_pressure table.
# `alpha` is the threshold assigned in the earlier t-test cells.
crosstab = pd.crosstab(index=df.pass_stopped, columns=df.QB_under_pressure)
Chi2, p, degf, expected = stats.chi2_contingency(observed=crosstab)
print('Our p-value is {:.90f}.'.format(p))
print('Our p-value is less than our alpha: {}'.format(p < alpha))

## Hypothesis 6

H0: A defense stopping a pass is independent of which down it is

Ha: A defense stopping a pass is not independent of which down it is

In [None]:
# Chi-squared test of independence on the pass_stopped x down table.
# `alpha` is the threshold assigned in the earlier t-test cells.
crosstab = pd.crosstab(index=df.pass_stopped, columns=df.down)
Chi2, p, degf, expected = stats.chi2_contingency(observed=crosstab)
print('Our p-value is {:.90f}.'.format(p))
print('Our p-value is less than our alpha: {}'.format(p < alpha))

# Modeling

In [None]:
# Import the functions I will need for modeling
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Baseline: fraction of non-null pass_stopped values equal to 1, rounded to two decimals.
# NOTE(review): this is the positive-class rate over all of `df`, not the
# majority-class accuracy on the training split - confirm which baseline is intended.
positive_count = df.pass_stopped.eq(1).sum()
baseline = (positive_count / df.pass_stopped.count()).round(2)
baseline

## Feature Engineering

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [None]:
# Recursive feature elimination driven by a logistic regression estimator
lr =  LogisticRegression(random_state=123)

# Keep the 10 highest-ranked features. n_features_to_select is passed by
# keyword: current scikit-learn rejects it as a positional argument.
rfe = RFE(lr, n_features_to_select=10)
# Fit the selector on train only, then apply the same column selection to validate and test
lm_X_rfe_train = rfe.fit_transform(X_train_scaled,y_train)
lm_X_rfe_val = rfe.transform(X_validate_scaled)
lm_X_rfe_test = rfe.transform(X_test_scaled)

In [None]:
# Boolean selection mask from the fitted RFE, used to pick out the retained column names
mask = rfe.support_
rfe_features = X_train_scaled.columns[mask]
selected_names = ', '.join(rfe_features)
print('selected {} features:'.format(len(rfe_features)), selected_names)

In [None]:
def random_forest(leaf, depth, estimator):
    """Fit a random forest and print train/validate performance reports.

    Reads the notebook-level ``X_train_scaled``, ``y_train``,
    ``X_validate_scaled`` and ``y_validate``; they must exist before calling.

    Parameters
    ----------
    leaf : int
        Passed to the classifier as ``min_samples_leaf``.
    depth : int
        Passed to the classifier as ``max_depth``.
    estimator : int
        Passed to the classifier as ``n_estimators``.

    Returns
    -------
    tuple
        ``(rf, feature_importances)``: the fitted ``RandomForestClassifier``
        and a one-column DataFrame (``importance``) indexed by feature name,
        sorted from most to least important.
    """
    rf = RandomForestClassifier(bootstrap=True,
                                class_weight=None,
                                criterion='gini',
                                min_samples_leaf=leaf,
                                n_estimators=estimator,
                                max_depth=depth,
                                random_state=123)

    print('---------------------------- Train -------------------------------')

    # Fit on the training split, then score the model on the data it was fit to
    rf.fit(X_train_scaled, y_train)
    y_pred = rf.predict(X_train_scaled)
    print('Accuracy of random forest classifier on training set: {:.2f}'
         .format(rf.score(X_train_scaled, y_train)))
    print('Training Data Matrix')
    print(confusion_matrix(y_train, y_pred))
    print('Training Data Report')
    print(classification_report(y_train, y_pred))

    print('---------------------------- Validate -------------------------------')

    # Score the already-fitted model on the validate split
    y_pred = rf.predict(X_validate_scaled)
    print('Accuracy of random forest classifier on validate set: {:.2f}'
         .format(rf.score(X_validate_scaled, y_validate)))
    # These headings previously said "Training", mislabelling the validate results
    print('Validate Data Matrix')
    print(confusion_matrix(y_validate, y_pred))
    print('Validate Data Report')
    print(classification_report(y_validate, y_pred))

    print('--------------------- Important Features ---------------------------')
    feature_importances = pd.DataFrame(rf.feature_importances_,
                                       index=X_train_scaled.columns,
                                       columns=['importance']).sort_values('importance', ascending=False)
    # Return the fitted model; the original returned this function object itself
    return rf, feature_importances
    

In [None]:
# min_samples_leaf=6, max_depth=12, n_estimators=100
random_forest(6, 12, 100)

In [None]:
# min_samples_leaf=8, max_depth=15, n_estimators=100
random_forest(8, 15, 100)

In [1]:
import MVP

Wrangle.py Loaded Successfully
Acquire.py Loaded Successfully
Prep.py Loaded Successfully


In [2]:
# Run the MVP entry point from the imported MVP module.
# NOTE(review): the recorded output of this cell ends in
# `NameError: name 'random_forest' is not defined` - presumably MVP.py refers to a
# name `random_forest` that does not exist in that module; verify in MVP.py.
MVP.MVP()

---------------------------- Train -------------------------------
Accuracy of random forest classifier on training set: 0.91
Training Data Matrix
[[4085  521]
 [  93 2441]]
Training Data Report
              precision    recall  f1-score   support

           0       0.98      0.89      0.93      4606
           1       0.82      0.96      0.89      2534

    accuracy                           0.91      7140
   macro avg       0.90      0.93      0.91      7140
weighted avg       0.92      0.91      0.92      7140

---------------------------- Validate -------------------------------
Accuracy of random forest classifier on validate set: 0.88
Training Data Matrix
[[2724  439]
 [ 137 1460]]
Training Data Report
              precision    recall  f1-score   support

           0       0.95      0.86      0.90      3163
           1       0.77      0.91      0.84      1597

    accuracy                           0.88      4760
   macro avg       0.86      0.89      0.87      4760
weighted

NameError: name 'random_forest' is not defined