In [30]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [31]:
import statsmodels.api as sm
import sklearn.model_selection as skm
import sklearn.linear_model as skl
from sklearn.preprocessing import StandardScaler
from ISLP import load_data
from ISLP.models import ModelSpec as MS
from functools import partial
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from ISLP.models import \
    (Stepwise,
     sklearn_selected,
     sklearn_selection_path)
from l0bnb import fit_path
from sklearn.model_selection import train_test_split

In [32]:
### From the ISLP book ###
def CE(estimator, X, Y): #Cp is negative, since sklearn tries to maximize the score, but we want to minimize Cp
    "Cross Entropy"
    Yhat = estimator.predict(X)
    maskIsMigrator = Y == 1
    maskIsNotMigrator = Y == 0
    CrossEntropy = np.mean(
        Y[maskIsMigrator]*np.log(Yhat[maskIsMigrator])
        ) + np.mean(
            (1-Y[maskIsNotMigrator])*np.log(Yhat[maskIsNotMigrator])
            )
    return -CrossEntropy

In [33]:
Df = pd.read_csv('Faellesdata_cleaned.csv')


In [34]:
x_train, x_test, y_train, y_test = train_test_split(Df.drop('IsMigratorInt', axis=1), Df['IsMigratorInt'], test_size=0.2, random_state=42)

In [35]:
### From the ISLP book ###
design = MS(x_train).fit(x_train, y_train)
Y = np.array(y_train)
X = design.transform(x_train, y_train)
# sigma2 = sm.GLM(Y, X, family = sm.families.Binomial()).fit().scale
Cross_Entropy = partial(CE)

strategy = Stepwise.first_peak(design,
                               direction = 'forward',
                               max_terms = len(design.terms))

In [36]:
from sklearn.metrics import accuracy_score

In [37]:
#Forward selection using Cross Entropy'
fit = partial(sm.GLM, family = sm.families.Binomial())
Y_CE = sklearn_selected(fit,
                        strategy,
                        scoring = CE)
Y_CE.fit(X, Y)
forward_accuracy = accuracy_score(y_test, Y_CE.predict(x_test) > 0.5)
forward = Y_CE.selected_state_
forward

('Rcyl', 'age', 'cfe', 'mass', 'mgfe', 'phi', 'vRcyl', 'vphi', 'vz')

In [38]:
#We repeat the same, but in the backwards direction
strategy = Stepwise.first_peak(design,
                                 direction = 'backwards',
                                 max_terms = len(design.terms))

In [39]:
#Backwards selection using Cp
fit = partial(sm.GLM, family = sm.families.Binomial())
Y_CE = sklearn_selected(fit,
                        strategy,
                        scoring = CE)
Y_CE.fit(X, Y)
backward_accuracy = accuracy_score(y_test, Y_CE.predict(x_test) > 0.5)
backward = Y_CE.selected_state_
backward

('age', 'cfe', 'mgfe', 'phi', 'vRcyl', 'vphi', 'vz')

In [40]:
%%capture output
#We do it all again, but for absolute value of vz

# Save the captured output to a text file
with open('Forward_backward.txt', 'w') as f:
    f.write(output.stdout)


Df['vz'] = abs(Df['vz'])
x_train, x_test, y_train, y_test = train_test_split(Df.drop('IsMigratorInt', axis=1), Df['IsMigratorInt'], test_size=0.2, random_state=42)
design = MS(x_train).fit(x_train, y_train)
Y = np.array(y_train)
X = design.transform(x_train, y_train)
Cross_Entropy = partial(CE)

strategy = Stepwise.first_peak(design,
                               direction = 'forward',
                               max_terms = len(design.terms))
fit = partial(sm.GLM, family = sm.families.Binomial())
Y_CE = sklearn_selected(fit,
                        strategy,
                        scoring = CE)
Y_CE.fit(X, Y)
print('\n')
print('Forward:          ', forward)
print('Accuracy: ', forward_accuracy, '\n')
print('Forward (abs vz): ', Y_CE.selected_state_)
print('Accuracy: ', accuracy_score(y_test, Y_CE.predict(x_test) > 0.5), '\n')

strategy = Stepwise.first_peak(design,
                                 direction = 'backwards',
                                 max_terms = len(design.terms))
fit = partial(sm.GLM, family = sm.families.Binomial())
Y_CE = sklearn_selected(fit,
                        strategy,
                        scoring = CE)
Y_CE.fit(X, Y)
print('Backwards:          ', backward)
print('Accuracy: ', backward_accuracy, '\n')
print('Backwards (abs vz): ', Y_CE.selected_state_)
print('Accuracy: ', accuracy_score(y_test, Y_CE.predict(x_test) > 0.5), '\n')

In [41]:
print(np.genfromtxt('Forward_backward.txt', delimiter='\n', dtype=str))

["Forward:           ('Rcyl', 'age', 'cfe', 'mass', 'mgfe', 'phi', 'vRcyl', 'vphi', 'vz')"
 'Accuracy:  0.8455'
 "Forward (abs vz):  ('Rcyl', 'age', 'cfe', 'mass', 'mgfe', 'ofe', 'phi', 'vRcyl', 'vphi', 'vz')"
 'Accuracy:  0.842'
 "Backwards:           ('age', 'cfe', 'mgfe', 'phi', 'vRcyl', 'vphi', 'vz')"
 'Accuracy:  0.841'
 "Backwards (abs vz):  ('age', 'cfe', 'mgfe', 'phi', 'vRcyl', 'vphi', 'vz', 'z')"
 'Accuracy:  0.84']
