In [1286]:
from sklearn import datasets, preprocessing, feature_selection
from sklearn.datasets import make_classification 
from itertools import compress

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Dataset

In [1287]:
x, y = make_classification (
    n_samples = 1_000,
    n_features = 20,
    n_informative = 5,
    n_redundant = 5,
    n_repeated = 5,
    shuffle=False,
    n_classes = 2
)

In [1288]:
dataset = pd.DataFrame(x)
dataset.columns = ["informative_" + str(i) for i in range(5)] + ["redundant_" + str(i) for i in range(5)] +  ["repeated_" + str(i) for i in range(5)] + ["unknown_" + str(i) for i in range(5)]
# dataset['label'] = y

dataset

Unnamed: 0,informative_0,informative_1,informative_2,informative_3,informative_4,redundant_0,redundant_1,redundant_2,redundant_3,redundant_4,repeated_0,repeated_1,repeated_2,repeated_3,repeated_4,unknown_0,unknown_1,unknown_2,unknown_3,unknown_4
0,-1.024864,-2.287024,0.103913,2.079637,-2.575452,0.239404,-1.506705,3.759136,-0.206087,-0.500401,-0.206087,0.239404,-1.506705,-0.206087,0.239404,0.186047,-0.200794,-0.023115,0.406541,0.477367
1,0.507033,-1.350490,-0.580011,2.901171,-2.145666,-0.926561,-1.952269,2.073114,-0.413497,0.949133,-0.413497,-0.926561,-1.952269,-0.413497,-0.926561,-0.449376,-0.632102,1.351918,-1.688164,-0.513447
2,-0.604543,0.874251,-3.022116,1.187503,-0.506204,1.645873,-1.395468,-0.499539,0.131303,1.272753,0.131303,1.645873,-1.395468,0.131303,1.645873,-0.336753,0.510469,-0.918677,-1.539373,0.700301
3,0.715362,-1.396845,-2.458008,1.932827,-1.952807,0.459026,-1.137908,1.015858,0.299587,2.648792,0.299587,0.459026,-1.137908,0.299587,0.459026,1.198523,0.413854,0.942785,-0.219137,0.081458
4,0.314306,-1.952465,-0.540728,-0.296541,-0.282001,0.973085,0.956969,1.147558,0.118327,1.943984,0.118327,0.973085,0.956969,0.118327,0.973085,-0.103424,-0.836866,1.082778,0.033778,-0.921523
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.219353,0.080834,-1.740572,1.120886,1.037645,1.626906,-0.550176,0.132838,-0.994260,2.218272,-0.994260,1.626906,-0.550176,-0.994260,1.626906,-1.218280,0.893218,0.515511,0.902253,1.101339
996,-2.738825,-2.499850,-0.194704,2.486101,2.568922,4.392141,-0.691258,5.089171,-3.694893,2.489984,-3.694893,4.392141,-0.691258,-3.694893,4.392141,-0.925407,1.700940,-0.186803,-1.195615,-0.092721
997,0.595267,-0.006641,-2.932358,1.004543,-0.858010,0.901687,-0.750242,-0.667507,0.458942,2.475731,0.458942,0.901687,-0.750242,0.458942,0.901687,-0.855048,0.921531,0.313968,-0.268816,0.751650
998,0.319569,-0.359971,-0.058148,-1.424569,-1.777482,-0.669629,0.712220,-0.548373,1.818500,-1.025214,1.818500,-0.669629,0.712220,1.818500,-0.669629,0.283275,0.527573,0.539470,-0.836228,-0.744267


# Feature selection


In [1289]:
def get_selected_data(selector, dataset):
    cols_idx = selector.get_support(indices=True)
    return dataset.iloc[:, cols_idx]

## Low variance

In [1290]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
dataset = pd.DataFrame(scaler.fit_transform(dataset), columns=dataset.columns)
# print(dataset)

varThreshold = VarianceThreshold(threshold=0)
varThreshold.fit(dataset)

# dataset_removed_low_var = get_selected_data(varThreshold, dataset)
# dataset_removed_low_var

print(varThreshold.variances_)
print(varThreshold.get_support())
dataset

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True]


Unnamed: 0,informative_0,informative_1,informative_2,informative_3,informative_4,redundant_0,redundant_1,redundant_2,redundant_3,redundant_4,repeated_0,repeated_1,repeated_2,repeated_3,repeated_4,unknown_0,unknown_1,unknown_2,unknown_3,unknown_4
0,-0.341967,-0.981842,0.365063,1.067249,-1.425991,-0.498216,-1.024373,1.356164,0.228348,-0.611131,0.228348,-0.498216,-1.024373,0.228348,-0.498216,0.186874,-0.206190,0.001645,0.400399,0.410240
1,0.570062,-0.249358,-0.055735,1.631695,-1.194724,-1.142567,-1.336866,0.422832,0.072637,-0.038948,0.072637,-1.142567,-1.336866,0.072637,-1.142567,-0.457708,-0.630377,1.382502,-1.615496,-0.559772
2,-0.091725,1.490662,-1.558289,0.454295,-0.312529,0.279045,-0.946358,-1.001313,0.481639,0.088797,0.481639,0.279045,-0.946358,0.481639,0.279045,-0.343461,0.493327,-0.897710,-1.472303,0.628493
3,0.694092,-0.285613,-1.211210,0.966381,-1.090946,-0.376846,-0.765720,-0.162434,0.607976,0.631970,0.607976,-0.376846,-0.765720,0.607976,-0.376846,1.213944,0.398307,0.971636,-0.201738,0.022643
4,0.455320,-0.720176,-0.031565,-0.565338,-0.191886,-0.092760,0.703505,-0.089529,0.471898,0.353756,0.471898,-0.092760,0.703505,0.471898,-0.092760,-0.106769,-0.831760,1.112222,0.041661,-0.959280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.137601,0.870113,-0.769794,0.408525,0.518216,0.268564,-0.353520,-0.651247,-0.363363,0.462028,-0.363363,0.268564,-0.353520,-0.363363,0.268564,-1.237694,0.869756,0.542553,0.877461,1.021111
996,-1.362390,-1.148298,0.181333,1.346515,1.342196,1.796725,-0.452466,2.092432,-2.390828,0.569282,-2.390828,1.796725,-0.452466,-2.390828,1.796725,-0.940599,1.664140,-0.162737,-1.141478,-0.147879
997,0.622593,0.801697,-1.503063,0.328590,-0.501836,-0.132217,-0.493834,-1.094295,0.727610,0.563656,0.727610,-0.132217,-0.493834,0.727610,-0.132217,-0.869226,0.897601,0.340155,-0.249548,0.678763
998,0.458453,0.525350,0.265352,-1.340366,-0.996604,-1.000578,0.531852,-1.028346,1.748281,-0.818294,1.748281,-1.000578,0.531852,1.748281,-1.000578,0.285504,0.510148,0.566612,-0.795612,-0.785745


## Linear regression

In [1291]:
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, mutual_info_classif

selectKBest = SelectKBest(f_regression, k=5)
selectKBest.fit(dataset, y)

dataset_linear_reg = get_selected_data(selectKBest, dataset)
dataset_linear_reg

Unnamed: 0,informative_0,informative_2,informative_3,redundant_4,repeated_2
0,-0.341967,0.365063,1.067249,-0.611131,-1.024373
1,0.570062,-0.055735,1.631695,-0.038948,-1.336866
2,-0.091725,-1.558289,0.454295,0.088797,-0.946358
3,0.694092,-1.211210,0.966381,0.631970,-0.765720
4,0.455320,-0.031565,-0.565338,0.353756,0.703505
...,...,...,...,...,...
995,0.137601,-0.769794,0.408525,0.462028,-0.353520
996,-1.362390,0.181333,1.346515,0.569282,-0.452466
997,0.622593,-1.503063,0.328590,0.563656,-0.493834
998,0.458453,0.265352,-1.340366,-0.818294,0.531852


## Linear regression and regularization

In [1292]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

lasso = LassoCV(cv=10, tol=0.05, random_state=10)
lasso.fit(dataset, y)
print("Coefficients:", lasso.coef_)

Coefficients: [ 0.11338362  0.00881932 -0.11826149  0.13985278  0.03530205 -0.00388922
 -0.         -0.00405105 -0.01576062  0.00039553 -0.         -0.0071693
 -0.         -0.00411473 -0.00242912  0.          0.00522297 -0.
 -0.00241527  0.        ]


In [1293]:
sfm = SelectFromModel(lasso, threshold=np.median(np.abs(lasso.coef_)))
sfm.fit(dataset, y)
n_features = sfm.transform(dataset).shape[1]
# print(n_rows, n_features)

max_iteration = 1000
iteration = 0
# sometimes there is less than 5 features when coef of all but n features = 0
while n_features > 5 and max_iteration > iteration:
    sfm.threshold += 0.0001
    dataset_new = sfm.transform(dataset)
    n_features = dataset_new.shape[1]
    # feature_names = compress(dataset.columns, sfm.get_support())
    # print(f'Threshold={sfm.threshold} Num features: {n_features}, {list(feature_names)}')
    iteration += 1

# print(iteration)
# print(sfm.get_support())
feature_names = compress(dataset.columns, sfm.get_support())
pd.DataFrame(dataset_new, columns=list(feature_names))

Unnamed: 0,informative_0,informative_2,informative_3,informative_4,redundant_3
0,-0.341967,0.365063,1.067249,-1.425991,0.228348
1,0.570062,-0.055735,1.631695,-1.194724,0.072637
2,-0.091725,-1.558289,0.454295,-0.312529,0.481639
3,0.694092,-1.211210,0.966381,-1.090946,0.607976
4,0.455320,-0.031565,-0.565338,-0.191886,0.471898
...,...,...,...,...,...
995,0.137601,-0.769794,0.408525,0.518216,-0.363363
996,-1.362390,0.181333,1.346515,1.342196,-2.390828
997,0.622593,-1.503063,0.328590,-0.501836,0.727610
998,0.458453,0.265352,-1.340366,-0.996604,1.748281


## Recursive feature selection

In [1294]:
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
# from sklearn.linear_model import LinearRegression

estimator = DecisionTreeRegressor()
selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(dataset, y)

# print("Accuracy: ", np.mean(cross_val_score(estimator, dataset, y, scoring="accuracy")))

# print(selector.support_)
print(selector.ranking_)

dataset_resursive = get_selected_data(selector, dataset)
dataset_resursive

[ 8  6  1  1  1  1  4  3 10  2  1  7 11  5 16 13 14  9 12 15]


Unnamed: 0,informative_2,informative_3,informative_4,redundant_0,repeated_0
0,0.365063,1.067249,-1.425991,-0.498216,0.228348
1,-0.055735,1.631695,-1.194724,-1.142567,0.072637
2,-1.558289,0.454295,-0.312529,0.279045,0.481639
3,-1.211210,0.966381,-1.090946,-0.376846,0.607976
4,-0.031565,-0.565338,-0.191886,-0.092760,0.471898
...,...,...,...,...,...
995,-0.769794,0.408525,0.518216,0.268564,-0.363363
996,0.181333,1.346515,1.342196,1.796725,-2.390828
997,-1.503063,0.328590,-0.501836,-0.132217,0.727610
998,0.265352,-1.340366,-0.996604,-1.000578,1.748281


### SVC 

In [1295]:
from sklearn.svm import SVC

estimator = SVC(kernel="linear")
print("Accuracy: ", np.mean(cross_val_score(estimator, dataset, y, scoring="accuracy")))

selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(dataset, y)

print(selector.support_)
print(selector.ranking_)

dataset_svc = get_selected_data(selector, dataset)
dataset_svc

Accuracy:  0.657
[ True False  True  True False False False  True False False False False
 False  True False False False False False False]
[ 1  7  1  1 14 11  4  1  6  2  5  8  3  1 13 12 10 16  9 15]


Unnamed: 0,informative_0,informative_2,informative_3,redundant_2,repeated_3
0,-0.341967,0.365063,1.067249,1.356164,0.228348
1,0.570062,-0.055735,1.631695,0.422832,0.072637
2,-0.091725,-1.558289,0.454295,-1.001313,0.481639
3,0.694092,-1.211210,0.966381,-0.162434,0.607976
4,0.455320,-0.031565,-0.565338,-0.089529,0.471898
...,...,...,...,...,...
995,0.137601,-0.769794,0.408525,-0.651247,-0.363363
996,-1.362390,0.181333,1.346515,2.092432,-2.390828
997,0.622593,-1.503063,0.328590,-1.094295,0.727610
998,0.458453,0.265352,-1.340366,-1.028346,1.748281
