In [1]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

# Number of features requested from every selector below
num_features=250

# Load the training features, the test features and the training labels,
# each indexed by its 'id' column
X = pd.read_csv('x_train__wo_outlier_KNN.csv', sep=',', index_col='id')
X_test = pd.read_csv('x_test_KNN.csv', sep=',', index_col='id')
ydata = pd.read_csv('y_train_wo_outlier.csv', sep=',', index_col='id')

# Target vector: the 'y' column of the label file
y = ydata.loc[:, 'y']

In [2]:
#Pearson Correlation
def Cor_selector(X, y,num_features):
    """Select the features most correlated (in absolute value) with the target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one column per candidate feature.
    y : array-like
        Target values, matched positionally with the rows of X.
    num_features : int
        How many top-ranked features to keep. Values <= 0 select nothing;
        values above the column count select every column.

    Returns
    -------
    cor_support : list of bool
        One flag per column of X (in column order), True when selected.
    cor_feature : list
        Names of the selected columns, ordered from the weakest to the
        strongest absolute correlation.
    """
    print("Cor")
    feature_name = X.columns.tolist()

    # Zero-variance columns make np.corrcoef divide by zero and return NaN;
    # the resulting RuntimeWarning is silenced because NaN is mapped to 0 below.
    with np.errstate(divide='ignore', invalid='ignore'):
        cor_list = [np.corrcoef(X[name], y)[0, 1] for name in feature_name]
    # replace NaN with 0 so undefined correlations rank last
    cor_list = [0 if np.isnan(c) else c for c in cor_list]

    # Clamp the request: a plain [-num_features:] slice would return *every*
    # feature when num_features is 0, because -0 == 0.
    keep = max(0, min(int(num_features), len(feature_name)))
    ranked = np.argsort(np.abs(cor_list))
    top_idx = ranked[len(ranked) - keep:]
    cor_feature = [feature_name[i] for i in top_idx]

    # Set lookup keeps building the mask linear in the number of features
    selected = set(cor_feature)
    cor_support = [name in selected for name in feature_name]
    return cor_support, cor_feature

In [3]:
#Chi squared test
def Chi_selector(X,y,num_features):
    """Keep the num_features columns of X ranked best by SelectKBest with chi2.

    Returns a tuple (chi_support, chi_feature): the boolean mask reported by
    the fitted selector over X's columns, and the matching column names.
    """
    print("Chi")
    # The chi2 test requires normalized values, so rescale the columns first
    scaled = MinMaxScaler().fit_transform(X)
    selector = SelectKBest(score_func=chi2, k=num_features).fit(scaled, y)
    chi_support = selector.get_support()
    # Index the column labels directly with the mask to get the kept names
    chi_feature = X.columns[chi_support].tolist()
    return chi_support, chi_feature

In [4]:
#Recursive Feature Elimination
def RFE_selector(X,y,num_features):
    """Run recursive feature elimination with a logistic-regression estimator.

    The features are min-max scaled, then RFE removes 10 features per round
    (step=10) until num_features remain, logging progress (verbose=5).

    Returns a tuple (rfe_support, rfe_feature): the boolean mask over X's
    columns and the names of the surviving columns.
    """
    scaled = MinMaxScaler().fit_transform(X)
    # max_iter raised to 2000 because the library default was too small to converge
    base_model = LogisticRegression(max_iter=2000)
    eliminator = RFE(estimator=base_model, n_features_to_select=num_features, step=10, verbose=5)
    eliminator.fit(scaled, y)
    rfe_support = eliminator.get_support()
    rfe_feature = X.columns[rfe_support].tolist()
    return rfe_support, rfe_feature

In [5]:
#Lasso : SelectFromModel (L1 norm)
def LassoModel(X,y,num_features):
    """Select features through an L1-penalised logistic regression.

    The features are min-max scaled, an L1 (liblinear) logistic regression is
    wrapped in SelectFromModel, and at most num_features columns are kept.

    Returns a tuple (embeded_lr_support, embeded_lr_feature): the boolean mask
    over X's columns and the names of the selected columns.
    """
    print("Lasso")
    scaled = MinMaxScaler().fit_transform(X)
    l1_logit = LogisticRegression(penalty="l1",solver='liblinear')
    selector = SelectFromModel(l1_logit, max_features=num_features)
    selector.fit(scaled, y)
    embeded_lr_support = selector.get_support()
    embeded_lr_feature = X.columns[embeded_lr_support].tolist()
    return embeded_lr_support, embeded_lr_feature

In [6]:
#Tree-based : SelectFromModel
def TreebasedModel(X,y,num_features, random_state=42):
    """Select features by random-forest importance via SelectFromModel.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one column per candidate feature.
    y : array-like
        Class labels for the rows of X.
    num_features : int
        Upper bound on the number of features to keep (max_features).
    random_state : int or None, default 42
        Seed forwarded to RandomForestClassifier so the selected set is the
        same on every run; pass None to restore unseeded behaviour.

    Returns
    -------
    embeded_rf_support : numpy.ndarray of bool
        Mask over the columns of X, True for selected features.
    embeded_rf_feature : list
        Names of the selected columns, in column order.
    """
    print("TBM")
    # Seed the forest: unseeded forests give different importances, and hence
    # a different feature selection, each time the notebook is re-run.
    forest = RandomForestClassifier(n_estimators=100, random_state=random_state)
    # NOTE(review): SelectFromModel's default importance threshold still
    # applies, so fewer than num_features columns may be selected - confirm
    # this is intended (threshold=-np.inf would select purely by max_features).
    embeded_rf_selector = SelectFromModel(forest, max_features=num_features)
    embeded_rf_selector.fit(X, y)
    embeded_rf_support = embeded_rf_selector.get_support()
    embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
    return embeded_rf_support, embeded_rf_feature

In [7]:
#LightGBM
def LightGBM(X,y,num_features):
    """Select features by LightGBM importance via SelectFromModel.

    An LGBMClassifier with the fixed hyper-parameters below is wrapped in
    SelectFromModel, which keeps at most num_features columns of X.

    Returns a tuple (embeded_lgb_support, embeded_lgb_feature): the boolean
    mask over X's columns and the names of the selected columns.
    """
    print("GBM")
    # Hyper-parameters of the gradient-boosting model used for ranking
    booster_params = dict(
        n_estimators=500,
        learning_rate=0.05,
        num_leaves=32,
        colsample_bytree=0.2,
        reg_alpha=3,
        reg_lambda=1,
        min_split_gain=0.01,
        min_child_weight=40,
    )
    booster = LGBMClassifier(**booster_params)
    selector = SelectFromModel(booster, max_features=num_features)
    selector.fit(X, y)
    embeded_lgb_support = selector.get_support()
    embeded_lgb_feature = X.columns[embeded_lgb_support].tolist()
    return embeded_lgb_support, embeded_lgb_feature

In [8]:
#MAIN
pd.set_option('display.max_rows', 250)
feature_name = X.columns.tolist()

# Run every selector; each returns (boolean mask over X's columns, selected names).
# Only the masks are needed to build the vote table.
cor_support,_ = Cor_selector(X, y,num_features)
chi_support,_ = Chi_selector(X,y,num_features)
rfe_support,_ = RFE_selector(X,y,num_features)
embeded_lr_support,_ = LassoModel(X,y,num_features)
embeded_rf_support,_ = TreebasedModel(X,y,num_features)
embeded_lgb_support,_ = LightGBM(X,y,num_features)

# put all selection together: one row per feature, one boolean vote column per method
feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embeded_lr_support,
                                    'Random Forest':embeded_rf_support, 'LightGBM':embeded_lgb_support})

# count the selected times for each feature.
# Sum only the boolean vote columns: reducing over the whole frame would also
# include the string 'Feature' column, which older pandas silently skipped and
# newer pandas refuses to add to booleans.
vote_columns = ['Pearson', 'Chi-2', 'RFE', 'Logistics', 'Random Forest', 'LightGBM']
feature_selection_df['Total'] = feature_selection_df[vote_columns].sum(axis=1)

# rank features by number of votes (ties broken by name, descending) and
# renumber the rows from 1 so the index reads as a rank
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)

# display the top num_features rows of the ranking
feature_selection_df.head(num_features)

Cor
Chi
Fitting estimator with 832 features.


  c /= stddev[:, None]
  c /= stddev[None, :]


Fitting estimator with 822 features.
Fitting estimator with 812 features.
Fitting estimator with 802 features.
Fitting estimator with 792 features.
Fitting estimator with 782 features.
Fitting estimator with 772 features.
Fitting estimator with 762 features.
Fitting estimator with 752 features.
Fitting estimator with 742 features.
Fitting estimator with 732 features.
Fitting estimator with 722 features.
Fitting estimator with 712 features.
Fitting estimator with 702 features.
Fitting estimator with 692 features.
Fitting estimator with 682 features.
Fitting estimator with 672 features.
Fitting estimator with 662 features.
Fitting estimator with 652 features.
Fitting estimator with 642 features.
Fitting estimator with 632 features.
Fitting estimator with 622 features.
Fitting estimator with 612 features.
Fitting estimator with 602 features.
Fitting estimator with 592 features.
Fitting estimator with 582 features.
Fitting estimator with 572 features.
Fitting estimator with 562 features.
F

Unnamed: 0,Feature,Pearson,Chi-2,RFE,Logistics,Random Forest,LightGBM,Total
1,x85,True,True,True,True,True,True,6
2,x181,True,True,True,True,True,True,6
3,x87,False,True,True,True,True,True,5
4,x84,True,True,True,True,False,True,5
5,x785,True,True,True,True,True,False,5
6,x77,False,True,True,True,True,True,5
7,x668,True,True,True,True,True,False,5
8,x53,False,True,True,True,True,True,5
9,x506,True,True,True,True,True,False,5
10,x482,True,True,True,True,True,False,5


In [12]:
#Creates the csv file with removed features for train data

top_features = feature_selection_df.copy()

# Every row after the first num_features of the ranked table is a feature to
# discard. Slicing with iloc (instead of tail(len - num_features)) stays empty
# when num_features exceeds the number of features; tail() with a negative
# count would instead return rows and cause wanted features to be dropped.
last_features = top_features.iloc[num_features:]

# Drop all discarded columns in a single call rather than copying the whole
# frame once per removed column; X itself is left untouched.
filtered_X = X.drop(columns=last_features['Feature'].tolist())

# NOTE(review): index=False leaves the 'id' index (set via index_col='id' when
# X was loaded) out of the written file - confirm downstream code does not need it.
filtered_X.to_csv('x_train_wo_outliers_dataval.csv', index=False)

NameError: name 'xdata' is not defined

In [11]:
#Creates the csv file with removed features for test data

top_features = feature_selection_df.copy()

# Every row after the first num_features of the ranked table is a feature to
# discard. Slicing with iloc (instead of tail(len - num_features)) stays empty
# when num_features exceeds the number of features; tail() with a negative
# count would instead return rows and cause wanted features to be dropped.
last_features = top_features.iloc[num_features:]

# Drop all discarded columns in a single call rather than copying the whole
# frame once per removed column; X_test itself is left untouched.
filtered_X_test = X_test.drop(columns=last_features['Feature'].tolist())

# NOTE(review): index=False leaves the 'id' index (set via index_col='id' when
# X_test was loaded) out of the written file - confirm downstream code does not need it.
filtered_X_test.to_csv('x_test_dataval.csv', index=False)