# Regression and Significance
If the data looks like such what will that say about educational performance and how confident are we of that

## Imports

In [14]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression

## Data Loading

In [3]:
df = pd.read_csv("../data/dataframe.csv").dropna()
fips = pd.read_csv("../data/fips.csv")
df_onehot = pd.read_csv("../data/dataframe_onehot.csv")

## Regressions
Various regressions and corresponding summaries

### Dataset Label Split

In [23]:
X_broadband = df_onehot[["broadband"]]
X = df_onehot[["broadband", "poverty_percentage_0-17", "median_house_income", "unemployment_rate", "urban1", "urban2", "urban3", "urban4", "urban5", "urban6"]]
y = df_onehot["edu_score"]

### Regression of only broadband percentage against education score

In [24]:
X_broadband_const = sm.add_constant(X_broadband)
reg_broadband = sm.OLS(y, X_broadband_const).fit()

In [25]:
reg_broadband.summary()

0,1,2,3
Dep. Variable:,edu_score,R-squared:,0.42
Model:,OLS,Adj. R-squared:,0.42
Method:,Least Squares,F-statistic:,2264.0
Date:,"Tue, 22 Dec 2020",Prob (F-statistic):,0.0
Time:,07:58:12,Log-Likelihood:,831.34
No. Observations:,3127,AIC:,-1659.0
Df Residuals:,3125,BIC:,-1647.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.6097,0.021,76.051,0.000,1.568,1.651
broadband,1.4950,0.031,47.584,0.000,1.433,1.557

0,1,2,3
Omnibus:,207.885,Durbin-Watson:,1.455
Prob(Omnibus):,0.0,Jarque-Bera (JB):,328.299
Skew:,0.529,Prob(JB):,5.14e-72
Kurtosis:,4.183,Cond. No.,13.7


### Full Regression to check other significant statistics

In [26]:
X_const = sm.add_constant(X)
reg = sm.OLS(y, X_const).fit()

In [27]:
reg.summary()

0,1,2,3
Dep. Variable:,edu_score,R-squared:,0.599
Model:,OLS,Adj. R-squared:,0.598
Method:,Least Squares,F-statistic:,518.1
Date:,"Tue, 22 Dec 2020",Prob (F-statistic):,0.0
Time:,07:58:25,Log-Likelihood:,1409.3
No. Observations:,3127,AIC:,-2799.0
Df Residuals:,3117,BIC:,-2738.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.6960,0.034,49.300,0.000,1.629,1.763
broadband,0.6661,0.040,16.635,0.000,0.588,0.745
poverty_percentage_0-17,-0.0054,0.001,-9.356,0.000,-0.007,-0.004
median_house_income,6.622e-06,4e-07,16.559,0.000,5.84e-06,7.41e-06
unemployment_rate,-0.0110,0.002,-4.872,0.000,-0.015,-0.007
urban1,0.3648,0.020,18.706,0.000,0.327,0.403
urban2,0.2067,0.011,18.556,0.000,0.185,0.229
urban3,0.2774,0.010,29.105,0.000,0.259,0.296
urban4,0.2847,0.009,30.796,0.000,0.267,0.303

0,1,2,3
Omnibus:,119.158,Durbin-Watson:,1.604
Prob(Omnibus):,0.0,Jarque-Bera (JB):,263.04
Skew:,0.231,Prob(JB):,7.61e-58
Kurtosis:,4.344,Cond. No.,7.16e+20
