In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy
import pandas as pd 
import math
import random

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.regressionplots import *
from sklearn import datasets, linear_model
import statsmodels.formula.api as smf
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

5.3.1

In [2]:
Auto = pd.read_csv('./Auto.csv', header=0, na_values='?')
Auto = Auto.dropna().reset_index(drop=True)
print(Auto.shape)

(392, 9)


In [3]:
np.random.seed(1)
train = np.random.choice(Auto.shape[0], 196, replace=False)
select = np.in1d(range(Auto.shape[0]), train)

lm = smf.ols ('mpg~horsepower', data = Auto[select]).fit()
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:                    mpg   R-squared:                       0.620
Model:                            OLS   Adj. R-squared:                  0.618
Method:                 Least Squares   F-statistic:                     316.4
Date:                Tue, 10 Nov 2020   Prob (F-statistic):           1.28e-42
Time:                        13:08:16   Log-Likelihood:                -592.07
No. Observations:                 196   AIC:                             1188.
Df Residuals:                     194   BIC:                             1195.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     40.3338      1.023     39.416      0.0

In [4]:
preds = lm.predict(Auto)
square_error = pow((Auto['mpg'] - preds), 2)
print(np.mean(square_error[~select]))

23.361902892587224


In [5]:
lm2 = smf.ols ('mpg~horsepower + I(horsepower ** 2.0)', data=Auto[select]).fit()
preds = lm2.predict(Auto)
square_error = pow((Auto['mpg'] - preds), 2)
print(square_error[~select].mean())

20.252690858350043


In [6]:
lm3 = smf.ols ('mpg~horsepower + I(horsepower ** 2.0) + I(horsepower ** 3.0)', data=Auto[select]).fit()
preds = lm3.predict(Auto)
square_error = pow((Auto['mpg'] - preds), 2)
print(np.mean(square_error[~select]))

20.32560936577359


5.3.2

In [7]:
ols_fit = smf.ols ('mpg~horsepower', data=Auto).fit()
print(ols_fit.params)

Intercept     39.935861
horsepower    -0.157845
dtype: float64


In [8]:
glm_fit = sm.GLM.from_formula('mpg~horsepower', data=Auto).fit()
print(glm_fit.params)

Intercept     39.935861
horsepower    -0.157845
dtype: float64


In [9]:
x = pd.DataFrame(Auto.horsepower)
y = Auto.mpg

model = LinearRegression()
model.fit(x, y)
print(model.intercept_, model.coef_)

39.93586102117047 [-0.15784473]


In [10]:
k_fold = KFold(n_splits=x.shape[0])
test = cross_val_score(model, x, y, cv=k_fold, scoring='neg_mean_squared_error', n_jobs=-1)
print(np.mean(-test))

24.231513517929226


In [11]:
A = []
for porder in range(1, 6):
    model = Pipeline([('poly', PolynomialFeatures(degree=porder)), ('linear', LinearRegression())])
    k_fold = KFold(n_splits=x.shape[0])
    test = cross_val_score(model, x, y, cv=k_fold, scoring='neg_mean_squared_error', n_jobs=-1)
    A.append(np.mean(-test))
print(A)

[24.231513517929226, 19.248213124489745, 19.33498406411498, 19.424430307079398, 19.033198669299846]


5.3.3

In [12]:
np.random.seed(2)
A = []
for porder in range(1, 11):
    model = Pipeline([('poly', PolynomialFeatures(degree=porder)), ('linear', LinearRegression())])
    k_fold = KFold(n_splits=10) 
    test = cross_val_score(model, x, y, cv=k_fold, scoring='neg_mean_squared_error', n_jobs=-1)
    A.append(np.mean(-test))
print(A)

[27.439933652339864, 21.235840055801567, 21.336606183382038, 21.353886969306874, 20.905558736691837, 20.780544653507675, 20.952970598113758, 21.077108146457544, 21.035590598116325, 20.978001582517084]


5.3.4

In [13]:
Portfolio = pd.read_csv('./Portfolio.csv', header=0)

def alpha_fn(data, index):
    X = data.X[index]
    Y = data.Y[index]
    return (np.var(Y) - np.cov(X,Y)[0,1]) / (np.var(X) + np.var(Y) - 2 * np.cov(X, Y)[0,1])

print(alpha_fn(Portfolio, range(0, 100)))

0.5766511516104116


In [14]:
print(np.sort(np.random.choice(range(0, 100), size=100, replace=True)))

[ 1  4  4  7  8  8  8  9 10 15 15 16 17 18 19 20 21 22 22 22 26 31 31 32
 33 34 34 37 38 39 39 40 40 40 42 43 43 43 43 46 46 47 49 49 50 50 51 52
 52 55 56 57 58 60 61 62 63 63 66 67 67 68 68 69 70 70 70 70 72 72 73 74
 75 75 76 76 79 80 81 82 82 83 83 84 85 86 87 88 90 90 90 90 90 91 95 95
 96 96 97 99]


In [15]:
print(alpha_fn(Portfolio, np.random.choice(range(0, 100), size=100, replace=True)))

0.6323275807980026


In [16]:
def bootstrap(data, input_fun, iteration):
    n = Portfolio.shape[0]
    idx = np.random.randint(0, n, (iteration, n))
    stat = np.zeros(iteration)
    for i in range(len(idx)):
        stat[i] = input_fun(data, idx[i])
    
    return {'Mean': np.mean(stat), 'STD': np.std(stat)}
print(bootstrap(Portfolio, alpha_fn, 1000))

{'Mean': 0.5811900883897445, 'STD': 0.09408713019844589}
