In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [12]:
import statsmodels.api as sm
import sklearn.model_selection as skm
import sklearn.linear_model as skl
from sklearn.preprocessing import StandardScaler
from ISLP import load_data
from ISLP.models import ModelSpec as MS
from functools import partial
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from ISLP.models import \
    (Stepwise,
     sklearn_selected,
     sklearn_selection_path)
from l0bnb import fit_path
from sklearn.model_selection import train_test_split

In [3]:
### From the ISLP book ###
def nCp(sigma2, estimator, X, Y): #Cp is negative, since sklearn tries to maximize the score, but we want to minimize Cp
    "Negative Cp Statistic"
    n, p = X.shape
    Yhat = estimator.predict(X)
    RSS = np.sum((Y - Yhat)**2)
    return -(RSS + 2 * p * sigma2) / n
    

In [4]:
Df = pd.read_csv('Faellesdata_cleaned.csv')

In [14]:
x_train, x_test, y_train, y_test = train_test_split(Df.drop('IsMigratorInt', axis=1), Df['IsMigratorInt'], test_size=0.2, random_state=42)

In [15]:
### From the ISLP book ###
design = MS(x_train).fit(x_train, y_train)
Y = np.array(y_train)
X = design.transform(x_train, y_train)
sigma2 = sm.GLM(Y, X, family = sm.families.Binomial()).fit().scale
neg_Cp = partial(nCp, sigma2)

strategy = Stepwise.first_peak(design,
                               direction = 'forward',
                               max_terms = len(design.terms))

In [17]:
#Forward selection using MSE
Y_MSE = sklearn_selected(sm.GLM, strategy)
Y_MSE.fit(X, Y)
Y_MSE.selected_state_

('Rcyl',
 'age',
 'cfe',
 'ch',
 'feh',
 'mass',
 'mgfe',
 'mgh',
 'ofe',
 'oh',
 'phi',
 'vRcyl',
 'vphi',
 'vz',
 'z')

In [18]:
#Forward selection using Cp
Y_Cp = sklearn_selected(sm.GLM,
                        strategy,
                        scoring = neg_Cp)
Y_Cp.fit(X, Y)
Y_Cp.selected_state_

('Rcyl', 'mgfe', 'vphi')

In [19]:
#We repeat the same, but in the backwards direction
strategy = Stepwise.first_peak(design,
                                 direction = 'backwards',
                                 max_terms = len(design.terms))

In [21]:
Y_MSE = sklearn_selected(sm.GLM, strategy)
Y_MSE.fit(X, Y)
Y_MSE.selected_state_

('Rcyl',
 'age',
 'cfe',
 'ch',
 'feh',
 'mass',
 'mgfe',
 'mgh',
 'ofe',
 'oh',
 'phi',
 'vRcyl',
 'vphi',
 'vz',
 'z')

In [23]:
#Backwards selection using Cp
Y_Cp = sklearn_selected(sm.GLM,
                        strategy,
                        scoring = neg_Cp)
Y_Cp.fit(X, Y)
Y_Cp.selected_state_

('Rcyl', 'mgfe', 'vphi')