### 1.0 Import Basic Libraries

In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### 1.1 Import College Football Team Data From SportsReference.com

In [38]:
from sportsreference.ncaaf.teams import Teams

### 2.0 Pull Data from Teams Object: PPG, YPP, Penalties, Penalty Yards, Pass YPP, Rush YPP, First Downs

In [39]:
# Season to analyze. Kept as a single named constant so the whole notebook can
# be re-run for a different year by editing one value.
SEASON = '2018'

# Collection of team objects for SEASON; iterated below to read per-team stats.
teams = Teams(SEASON)

In [40]:
# Per-team offensive metrics, keyed by team name. Each value is a 7-tuple:
#   (points per game, yards per play, penalty-adjusted yards per play,
#    "advanced" yards per play, pass yards per attempt (1 decimal),
#    rush yards per attempt, first downs per play)
points_dict = {}

for team in teams:
    n_plays = team.plays
    off_yards = team.yards
    n_penalties = team.penalties

    # NOTE(review): this adds penalty yards to offensive yards, which assumes
    # yards_from_penalties counts yards *gained*; confirm against the
    # sportsreference docs that it is not yards lost to the team's own penalties.
    yards_incl_penalties = off_yards + team.yards_from_penalties

    points_dict[str(team.name)] = (
        team.points_per_game,
        off_yards / n_plays,
        # Penalty yards added to the numerator only.
        yards_incl_penalties / n_plays,
        # Penalty yards added to the numerator, and each penalty counted as a play.
        yards_incl_penalties / (n_plays + n_penalties),
        # Rounded to one decimal; the other ratios are stored unrounded.
        round(team.pass_yards / team.pass_attempts, 1),
        team.rush_yards_per_attempt,
        team.first_downs / n_plays,
    )

### 2.1 Convert to a DataFrame

In [41]:
# Build the team table in one re-runnable step. Doing the index reset in place
# in its own cell meant that re-running it inserted an extra column, after
# which assigning the eight column names failed with a length mismatch.
column_names = ['Team_Name', 'PPG', 'YPP', 'Penalty_YPP', 'Advanced_YPP', 'Pass_YPP', 'Rush_YPP', 'First_Downs_PP']

new_df = (
    # Dict keys (team names) become the index; tuple positions become the
    # seven metric columns, named explicitly here.
    pd.DataFrame.from_dict(points_dict, orient='index', columns=column_names[1:])
    # Name the index so that reset_index() emits it as the 'Team_Name' column.
    .rename_axis(column_names[0])
    .reset_index()
)

In [44]:
# Preview the first rows to check that the column names line up with the values.
new_df.head()

Unnamed: 0,Team_Name,PPG,YPP,Penalty_YPP,Advanced_YPP,Pass_YPP,Rush_YPP,First_Downs_PP
0,Clemson,44.3,7.364017,7.944212,7.465269,8.2,6.6,0.333333
1,Syracuse,40.2,5.748768,6.476601,5.949095,7.4,4.4,0.29064
2,North Carolina State,33.8,6.050398,6.643236,6.26125,8.2,3.8,0.320955
3,Boston College,32.0,5.428188,6.088591,5.613861,7.7,4.1,0.260403
4,Wake Forest,32.8,5.489672,6.00243,5.710983,6.9,4.5,0.300122


### 2.2 Import Statsmodels for Regressions and Import Scale to Standardize Data

In [45]:
import statsmodels.api as sm

In [46]:
from sklearn.preprocessing import scale

In [47]:
# Full table as a NumPy array, kept under its original name. Because the first
# column holds team-name strings, this array has dtype=object.
df_as_matrix = new_df.values

# Take every column after the leading team-name column and convert to float
# explicitly, so scale() receives a numeric array rather than an object-dtype
# slice that it has to coerce itself.
numeric_values = new_df.iloc[:, 1:].astype(float).values

# Standardize the metric columns so regression coefficients are comparable.
scaled_vectors = scale(numeric_values)



In [48]:
# Wrap the scaled array back into a DataFrame, reusing the names of every
# new_df column after the first one.
metric_columns = new_df.columns[1:]
scaled_df = pd.DataFrame(data=scaled_vectors, columns=metric_columns)

In [49]:
# Preview the first rows of the scaled metrics.
scaled_df.head()

Unnamed: 0,PPG,YPP,Penalty_YPP,Advanced_YPP,Pass_YPP,Rush_YPP,First_Downs_PP
0,2.191824,2.249747,1.871288,2.156953,0.719515,2.665733,1.204151
1,1.588736,-0.036503,-0.10584,-0.124045,-0.019178,-0.068842,-0.301704
2,0.64733,0.390429,0.118647,0.345575,0.719515,-0.814636,0.767542
3,0.382559,-0.490258,-0.628557,-0.628385,0.257832,-0.441739,-1.368242
4,0.500235,-0.403232,-0.744631,-0.482272,-0.480861,0.055456,0.032711


### 3.1 Regression #1: Multivariate Regression Between YPP, Penalty YPP, Advanced YPP (Penalties + Penalty Yards) vs. PPG

In [50]:
# Response variable: scaled points per game.
y = scaled_df['PPG']

# Predictors: the three yards-per-play variants, plus an intercept column.
X = sm.add_constant(scaled_df[['YPP', 'Penalty_YPP', 'Advanced_YPP']])

est = sm.OLS(endog=y, exog=X).fit()

est.summary()

  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,PPG,R-squared:,0.748
Model:,OLS,Adj. R-squared:,0.742
Method:,Least Squares,F-statistic:,124.6
Date:,"Wed, 04 Sep 2019",Prob (F-statistic):,1.5400000000000002e-37
Time:,10:49:45,Log-Likelihood:,-94.889
No. Observations:,130,AIC:,197.8
Df Residuals:,126,BIC:,209.2
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.065e-16,0.045,1.13e-14,1.000,-0.089,0.089
YPP,0.9778,0.580,1.685,0.094,-0.171,2.126
Penalty_YPP,-0.6216,0.410,-1.516,0.132,-1.433,0.190
Advanced_YPP,0.4906,0.894,0.548,0.584,-1.279,2.261

0,1,2,3
Omnibus:,7.6,Durbin-Watson:,1.774
Prob(Omnibus):,0.022,Jarque-Bera (JB):,7.358
Skew:,0.569,Prob(JB):,0.0252
Kurtosis:,3.254,Cond. No.,42.6


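Conclusion: YPP has the largest coefficient (0.98) on our response variable (PPG), and Advanced YPP also has a positive coefficient (0.49). However, none of the three coefficients is statistically significant at the 5% level (p = 0.094, 0.132, and 0.584), likely because the three predictors are closely related measures of the same yardage, so their relative sizes should be read with caution.
The R-Squared of .748 suggests that these three explanatory variables together explain 74.8% of the variance in PPG.
The Adjusted R-Squared of .742 is a good benchmark for the rest of these regressions...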

### 3.2 Regression #2: YPP vs. PPG

In [51]:
# Response variable: scaled points per game.
y = scaled_df['PPG']

# Single predictor (YPP) plus an intercept column.
X = sm.add_constant(scaled_df['YPP'])

est = sm.OLS(endog=y, exog=X).fit()

est.summary()

0,1,2,3
Dep. Variable:,PPG,R-squared:,0.739
Model:,OLS,Adj. R-squared:,0.737
Method:,Least Squares,F-statistic:,361.7
Date:,"Wed, 04 Sep 2019",Prob (F-statistic):,4.1699999999999996e-39
Time:,10:49:45,Log-Likelihood:,-97.252
No. Observations:,130,AIC:,198.5
Df Residuals:,128,BIC:,204.2
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.065e-16,0.045,1.12e-14,1.000,-0.089,0.089
YPP,0.8594,0.045,19.018,0.000,0.770,0.949

0,1,2,3
Omnibus:,4.771,Durbin-Watson:,1.748
Prob(Omnibus):,0.092,Jarque-Bera (JB):,4.666
Skew:,0.464,Prob(JB):,0.097
Kurtosis:,2.961,Cond. No.,1.0


Conclusions: Both R-Squared and Adjusted R-Squared decreased slightly (from 0.748 to 0.739 and from 0.742 to 0.737), as is to be expected after removing two of our three variables.

### 3.3 Regression #3: Penalty YPP vs. PPG

In [52]:
# Response variable: scaled points per game.
y = scaled_df['PPG']

# Single predictor (Penalty_YPP) plus an intercept column.
X = sm.add_constant(scaled_df['Penalty_YPP'])

est = sm.OLS(endog=y, exog=X).fit()

est.summary()

0,1,2,3
Dep. Variable:,PPG,R-squared:,0.669
Model:,OLS,Adj. R-squared:,0.667
Method:,Least Squares,F-statistic:,259.0
Date:,"Wed, 04 Sep 2019",Prob (F-statistic):,1.51e-32
Time:,10:49:45,Log-Likelihood:,-112.54
No. Observations:,130,AIC:,229.1
Df Residuals:,128,BIC:,234.8
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.065e-16,0.051,9.97e-15,1.000,-0.101,0.101
Penalty_YPP,0.8181,0.051,16.094,0.000,0.718,0.919

0,1,2,3
Omnibus:,2.398,Durbin-Watson:,1.676
Prob(Omnibus):,0.301,Jarque-Bera (JB):,2.433
Skew:,0.298,Prob(JB):,0.296
Kurtosis:,2.693,Cond. No.,1.0


Conclusions: R-Squared and Adjusted R-Squared dropped off noticeably (to 0.669 and 0.667, from 0.739 and 0.737 for YPP alone), which fits with Penalty YPP being the only variable with a negative coefficient in the multivariate regression.

### 3.4 Regression #4: Advanced YPP vs. PPG

In [53]:
# Response variable: scaled points per game.
y = scaled_df['PPG']

# Single predictor (Advanced_YPP) plus an intercept column.
X = sm.add_constant(scaled_df['Advanced_YPP'])

est = sm.OLS(endog=y, exog=X).fit()

est.summary()

0,1,2,3
Dep. Variable:,PPG,R-squared:,0.719
Model:,OLS,Adj. R-squared:,0.717
Method:,Least Squares,F-statistic:,328.2
Date:,"Wed, 04 Sep 2019",Prob (F-statistic):,3.94e-37
Time:,10:49:45,Log-Likelihood:,-101.86
No. Observations:,130,AIC:,207.7
Df Residuals:,128,BIC:,213.5
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.065e-16,0.047,1.08e-14,1.000,-0.093,0.093
Advanced_YPP,0.8482,0.047,18.115,0.000,0.756,0.941

0,1,2,3
Omnibus:,4.844,Durbin-Watson:,1.704
Prob(Omnibus):,0.089,Jarque-Bera (JB):,4.739
Skew:,0.467,Prob(JB):,0.0935
Kurtosis:,2.964,Cond. No.,1.0


Conclusions: This regression is the most interesting, as R-Squared and Adj R-Squared are noticeably higher for this statistic (0.719 and 0.717), which includes penalty yards and treats each penalty as an additional play, than for the prior one (0.669 and 0.667), which only includes penalty yards but doesn't affect total offensive plays.

### 3.5 Regression #5: Multivariate Regression Between (YPP and Advanced YPP) and PPG

In [54]:
# Response variable: scaled points per game.
y = scaled_df['PPG']

# Two predictors (YPP and Advanced_YPP) plus an intercept column.
X = sm.add_constant(scaled_df[['YPP', 'Advanced_YPP']])

est = sm.OLS(endog=y, exog=X).fit()

est.summary()

0,1,2,3
Dep. Variable:,PPG,R-squared:,0.743
Model:,OLS,Adj. R-squared:,0.739
Method:,Least Squares,F-statistic:,183.9
Date:,"Wed, 04 Sep 2019",Prob (F-statistic):,3.13e-38
Time:,10:49:45,Log-Likelihood:,-96.064
No. Observations:,130,AIC:,198.1
Df Residuals:,127,BIC:,206.7
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.065e-16,0.045,1.13e-14,1.000,-0.089,0.089
YPP,1.5414,0.448,3.441,0.001,0.655,2.428
Advanced_YPP,-0.6854,0.448,-1.530,0.128,-1.572,0.201

0,1,2,3
Omnibus:,4.498,Durbin-Watson:,1.781
Prob(Omnibus):,0.105,Jarque-Bera (JB):,4.369
Skew:,0.449,Prob(JB):,0.113
Kurtosis:,2.966,Cond. No.,19.9


Conclusions: Although the R-Squared and Adj R-Squared (0.743 and 0.739) are slightly lower than in the three-variable multivariate regression (0.748 and 0.742), this model uses one fewer variable and explains nearly as much.

### 3.6 Regression #6: Multivariate: YPP, Pass YPP, Rush YPP vs. PPG

In [55]:
# Response variable: scaled points per game.
y = scaled_df['PPG']

# Overall YPP alongside its pass and rush counterparts, plus an intercept column.
X = sm.add_constant(scaled_df[['YPP', 'Pass_YPP', 'Rush_YPP']])

est = sm.OLS(endog=y, exog=X).fit()

est.summary()

0,1,2,3
Dep. Variable:,PPG,R-squared:,0.744
Model:,OLS,Adj. R-squared:,0.738
Method:,Least Squares,F-statistic:,122.1
Date:,"Wed, 04 Sep 2019",Prob (F-statistic):,3.96e-37
Time:,10:49:45,Log-Likelihood:,-95.863
No. Observations:,130,AIC:,199.7
Df Residuals:,126,BIC:,211.2
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.065e-16,0.045,1.12e-14,1.000,-0.089,0.089
YPP,0.7829,0.112,6.985,0.000,0.561,1.005
Pass_YPP,0.1119,0.084,1.339,0.183,-0.054,0.277
Rush_YPP,-0.0171,0.074,-0.231,0.817,-0.163,0.129

0,1,2,3
Omnibus:,5.744,Durbin-Watson:,1.719
Prob(Omnibus):,0.057,Jarque-Bera (JB):,5.47
Skew:,0.499,Prob(JB):,0.0649
Kurtosis:,3.12,Cond. No.,4.84


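Conclusions: A new multivariate regression of PPG on YPP, Pass YPP, and Rush YPP shows us that YPP is by far the strongest explanatory variable; it is the only one of the three with a statistically significant coefficient (p < 0.001, versus 0.183 for Pass YPP and 0.817 for Rush YPP).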

### 3.7 Regression #7: Pass YPP vs. PPG

In [56]:
# Response variable: scaled points per game.
y = scaled_df['PPG']

# Single predictor (Pass_YPP, selected as a one-column frame) plus an intercept column.
X = sm.add_constant(scaled_df[['Pass_YPP']])

est = sm.OLS(endog=y, exog=X).fit()

est.summary()

0,1,2,3
Dep. Variable:,PPG,R-squared:,0.528
Model:,OLS,Adj. R-squared:,0.524
Method:,Least Squares,F-statistic:,143.2
Date:,"Wed, 04 Sep 2019",Prob (F-statistic):,1.3e-22
Time:,10:49:45,Log-Likelihood:,-135.66
No. Observations:,130,AIC:,275.3
Df Residuals:,128,BIC:,281.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.065e-16,0.061,8.34e-15,1.000,-0.120,0.120
Pass_YPP,0.7267,0.061,11.967,0.000,0.607,0.847

0,1,2,3
Omnibus:,5.109,Durbin-Watson:,1.725
Prob(Omnibus):,0.078,Jarque-Bera (JB):,4.582
Skew:,0.394,Prob(JB):,0.101
Kurtosis:,3.473,Cond. No.,1.0


Conclusions: Passing YPP explains 52.8% of the variance according to R-Squared.

### 3.8 Regression #8: Rush YPP vs. PPG

In [57]:
# Response variable: scaled points per game.
y = scaled_df['PPG']

# Single predictor (Rush_YPP, selected as a one-column frame) plus an intercept column.
X = sm.add_constant(scaled_df[['Rush_YPP']])

est = sm.OLS(endog=y, exog=X).fit()

est.summary()

0,1,2,3
Dep. Variable:,PPG,R-squared:,0.35
Model:,OLS,Adj. R-squared:,0.345
Method:,Least Squares,F-statistic:,68.97
Date:,"Wed, 04 Sep 2019",Prob (F-statistic):,1.23e-13
Time:,10:49:45,Log-Likelihood:,-156.45
No. Observations:,130,AIC:,316.9
Df Residuals:,128,BIC:,322.6
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.065e-16,0.071,7.11e-15,1.000,-0.141,0.141
Rush_YPP,0.5917,0.071,8.305,0.000,0.451,0.733

0,1,2,3
Omnibus:,1.453,Durbin-Watson:,1.918
Prob(Omnibus):,0.484,Jarque-Bera (JB):,1.525
Skew:,0.238,Prob(JB):,0.467
Kurtosis:,2.765,Cond. No.,1.0


Conclusions: Rush YPP explains far less (35.0%) of the variance in PPG than Pass YPP (52.8%).

### 4.0 Examining the Effect of First Downs Per Play: FPP vs. PPG

In [58]:
# Response variable: scaled points per game.
y = scaled_df['PPG']

# Single predictor (first downs per play) plus an intercept column.
X = sm.add_constant(scaled_df['First_Downs_PP'])

est = sm.OLS(endog=y, exog=X).fit()

est.summary()

0,1,2,3
Dep. Variable:,PPG,R-squared:,0.657
Model:,OLS,Adj. R-squared:,0.654
Method:,Least Squares,F-statistic:,245.3
Date:,"Wed, 04 Sep 2019",Prob (F-statistic):,1.55e-31
Time:,10:49:45,Log-Likelihood:,-114.9
No. Observations:,130,AIC:,233.8
Df Residuals:,128,BIC:,239.5
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.065e-16,0.052,9.79e-15,1.000,-0.102,0.102
First_Downs_PP,0.8106,0.052,15.661,0.000,0.708,0.913

0,1,2,3
Omnibus:,10.493,Durbin-Watson:,1.817
Prob(Omnibus):,0.005,Jarque-Bera (JB):,11.068
Skew:,0.583,Prob(JB):,0.00395
Kurtosis:,3.827,Cond. No.,1.0


### 4.1 Combining YPP and FPP

In [60]:
# Response variable: scaled points per game.
y = scaled_df['PPG']

# Two predictors (YPP and first downs per play) plus an intercept column.
X = sm.add_constant(scaled_df[['YPP', 'First_Downs_PP']])

est = sm.OLS(endog=y, exog=X).fit()

est.summary()

0,1,2,3
Dep. Variable:,PPG,R-squared:,0.754
Model:,OLS,Adj. R-squared:,0.75
Method:,Least Squares,F-statistic:,194.5
Date:,"Wed, 04 Sep 2019",Prob (F-statistic):,2.17e-39
Time:,10:51:42,Log-Likelihood:,-93.334
No. Observations:,130,AIC:,192.7
Df Residuals:,127,BIC:,201.3
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.065e-16,0.044,1.15e-14,1.000,-0.087,0.087
YPP,0.6381,0.090,7.068,0.000,0.459,0.817
First_Downs_PP,0.2536,0.090,2.809,0.006,0.075,0.432

0,1,2,3
Omnibus:,7.878,Durbin-Watson:,1.736
Prob(Omnibus):,0.019,Jarque-Bera (JB):,7.56
Skew:,0.559,Prob(JB):,0.0228
Kurtosis:,3.383,Cond. No.,3.84


Conclusions: This was the best regression yet. By combining YPP and FPP we can explain 75.0% (Adj R-Sq) of the variance in the response variable, up from 73.7% (Adj R-Sq) when using just YPP as an explanatory variable.