# Cross Validation and Bootstrap

In [159]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model as sk_lm
from sklearn import discriminant_analysis as sk_lda
from sklearn import model_selection as sk_msel
from sklearn import metrics as sk_metrics
from sklearn import preprocessing as sk_pre
from sklearn import neighbors as sk_knn
import statsmodels.api as sm

In [106]:
# Regression analysis on the Auto dataset.
# Load the CSV, force `horsepower` to numeric (entries that cannot be parsed
# become NaN via errors='coerce'), then drop every row containing a NaN.
df_auto = (
    pd.read_csv('data/Auto.csv')
    .assign(horsepower=lambda frame: pd.to_numeric(frame['horsepower'], errors='coerce'))
    .dropna()
)
df_auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


**Fit a linear model predicting mpg from horsepower**

In [154]:
# Ordinary least squares through the statsmodels formula interface.
# `lm` is the (not yet fitted) model specification; fitting happens below.
lm = sm.OLS.from_formula(formula='mpg ~ np.power(horsepower,1)', data=df_auto)

# Fit on the full data set and display the regression summary table
lm_results = lm.fit()
lm_results.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.606
Model:,OLS,Adj. R-squared:,0.605
Method:,Least Squares,F-statistic:,599.7
Date:,"Thu, 12 Mar 2020",Prob (F-statistic):,7.03e-81
Time:,11:16:25,Log-Likelihood:,-1178.7
No. Observations:,392,AIC:,2361.0
Df Residuals:,390,BIC:,2369.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,39.9359,0.717,55.660,0.000,38.525,41.347
"np.power(horsepower, 1)",-0.1578,0.006,-24.489,0.000,-0.171,-0.145

0,1,2,3
Omnibus:,16.432,Durbin-Watson:,0.92
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17.305
Skew:,0.492,Prob(JB):,0.000175
Kurtosis:,3.299,Cond. No.,322.0


**k-fold Cross Validation**

In [158]:
# k-fold cross validation (splits from sklearn KFold, fits from statsmodels).
#
# For each fold the formula model is fitted on the training rows and scored on
# the held-out rows. The value collected per fold is the out-of-fold R^2
# (a score: higher is better), not an error, so it is named accordingly.
degree = 1   # exponent applied to horsepower inside the formula
cv = 10      # number of folds
formula = f"mpg ~ np.power(horsepower,{degree})"

r2_scores = []
# shuffle=True with a fixed random_state keeps the fold assignment reproducible
folds = sk_msel.KFold(n_splits=cv, shuffle=True, random_state=17)
for idx_train, idx_test in folds.split(df_auto):
    df_train = df_auto.iloc[idx_train]
    df_test = df_auto.iloc[idx_test]

    # Fold-local name so the `lm` model defined in the earlier cell is not overwritten
    lm_fold = sm.OLS.from_formula(formula, df_train).fit()

    # The whole held-out frame is passed; the model was built from the formula,
    # which names the column it needs.
    y_pre = lm_fold.predict(df_test)
    r2_scores.append(sk_metrics.r2_score(df_test['mpg'], y_pre))

# Mean R^2 across the folds
np.mean(r2_scores)


0.6003521264214554

**Bootstrap**

In [190]:
# Monte Carlo simulation (bootstrap resampling)
#
# Repeatedly draw len(df_auto) rows from df_auto with replacement, refit the
# linear regression mpg ~ horsepower on each resample, and record the fitted
# intercept and slope.

# Number of resamplings
k = 1000

# One seeded generator shared by all draws: every iteration still gets a
# different resample, but re-running the cell reproduces the same sequence.
rng_boot = np.random.RandomState(17)

# Estimated parameters, one entry per resample
beta0 = []  # intercepts
beta1 = []  # horsepower slopes

for _ in range(k):
    # Resampling (same size as the original data, with replacement)
    df_auto_sample = df_auto.sample(n=len(df_auto.index), replace=True, random_state=rng_boot)

    # Fit parameters
    lm_boot = sk_lm.LinearRegression().fit(df_auto_sample[['horsepower']], df_auto_sample['mpg'])

    beta0.append(lm_boot.intercept_)
    beta1.append(lm_boot.coef_[0])
    

In [185]:
# Average of the intercepts collected over all bootstrap resamples
np.asarray(beta0).mean()

39.95297952526497

In [186]:
# Average of the horsepower slopes collected over all bootstrap resamples
np.asarray(beta1).mean()

-0.15816824536164786

In [192]:
# Bootstrap standard error of the intercept: sample standard deviation of the
# resampled estimates. ddof=1 divides by (number of resamples - 1); np.std's
# default ddof=0 would give the population form instead.
np.std(beta0, ddof=1)

0.8399253847915956

In [191]:
# Bootstrap standard error of the horsepower slope: sample standard deviation
# of the resampled estimates. ddof=1 divides by (number of resamples - 1);
# np.std's default ddof=0 would give the population form instead.
np.std(beta1, ddof=1)

0.007197542519032084