In [2]:
# libraries
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import warnings
import time

warnings.filterwarnings('ignore')
warnings.filterwarnings(action="ignore", module="sklearn", message="^n_iter")

from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LogisticRegressionCV, SGDClassifier, LogisticRegression
from sklearn.utils import resample
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif, RFECV, SelectKBest, SelectFromModel
from sklearn.model_selection import train_test_split, KFold, cross_val_score

from sklearn.metrics import classification_report, roc_curve, confusion_matrix, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, cross_validate, learning_curve

from sklearn.kernel_approximation import Nystroem

In [3]:
data = pd.read_csv("data.csv")
data.drop("Unnamed: 0", axis = 1, inplace = True)

In [4]:
# Sample for logistic and SVM
data_log = data.copy()
data_svm = data.copy()

In [4]:
var = data_log.select_dtypes(exclude = "object")
vif_data = pd.DataFrame()
vif_data["feature"] = var.columns
vif_data["VIF"] = [variance_inflation_factor(var.values, i)for i in range(len(var.columns))]

In [None]:
vif_data[vif_data["VIF"]> 10].feature

In [50]:
# Processing for Logistic
# Removing Multicolinearity
high_vif = ["fico_range_high", "sec_app_fico_range_high", "fico_range_low", "sec_app_fico_range_low",
               "loan_amnt", "open_acc", "tot_hi_cred_lim","num_rev_tl_bal_gt_0", "issue_d_year","total_acc",
               "num_op_rev_tl","pct_tl_nvr_dlq","total_rev_hi_lim","num_sats","num_rev_accts","total_bc_limit",
               "bc_util","sec_app_open_acc","num_actv_rev_tl","num_bc_sats","total_bal_ex_mort","revol_util","term",
                "acc_open_past_24mths","all_util","pub_rec", "tot_cur_bal"]
data_log.drop(high_vif, axis = 1, inplace = True)

In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from scipy.sparse import csr_matrix

class oneHotEncodeCategorical(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        self.mycategories = []
        for column in X.columns.values.tolist():
            self.mycategories.append(X[column].unique())
        self.ohe = OneHotEncoder(categories = self.mycategories).fit(X)
        return self

    def transform(self, X, y=None):
        index = X.index
        X = self.ohe.transform(X)
        columns = self.ohe.get_feature_names()
        X = csr_matrix(X)
        X = pd.SparseDataFrame(data=X, index=index, columns=columns, default_fill_value=0, dtype=np.int64)
        return X

In [13]:
oneHotEncodeCategorical()

oneHotEncodeCategorical()

In [6]:
# # Upsampling for imbalance
# def binary_upsample(data, label):
#     value_count = data[label].value_counts()
#     frqs = data[label].value_counts().values
#     idc = data[label].value_counts().index
#     minorfrq = value_count[frqs == min(frqs)].index[0] 
#     n_sample = max(frqs) - min(frqs)
#     minorindex = data[label][data[label] == minorfrq].index.values
#     resampled = resample(minorindex, n_samples = n_sample)
#     _ = pd.DataFrame(list(data.index) + list(resampled), columns =["Index"])
#     balanced = pd.merge(_, data, how ="left", left_on= "Index", right_on= data.index)
#     return balanced

# dum_log = binary_upsample(dum_log, "default").reindex()
# dum_svm = binary_upsample(dum_svm, "default").reindex()
# dum_log.drop("Index", axis = 1, inplace = True)
# dum_svm.drop("Index", axis = 1, inplace = True)

In [None]:
# Processing done !!!
# dum_log.default.mean()

In [18]:
# SVM Data 
target = dum_svm["default"]
features = dum_svm.drop("default", axis = 1)
svm_train, svm_test, Y_SVM_train, Y_SVM_test = train_test_split(features, target, test_size=0.3, random_state = 42)
print(svm_train.shape, Y_SVM_train.shape) 

(144856, 94436) (144856,)


In [19]:
# Splitting for training and testing Logistic
target = dum_log["default"]
features = dum_log.drop("default", axis = 1)
X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size=0.3, random_state = 42)
print(X_train.shape, Y_train.shape) 

(144856, 94436) (144856,)


### Logistic Regression

In [21]:
# Setup 
KF = KFold(5, shuffle = True, random_state= 42)

def SGD_LOG(X_train, Y_train, X_test, Y_test, params ):
    start = time.time()
    X1 = MinMaxScaler().fit_transform(X_train)
    X2 = MinMaxScaler().fit_transform(X_test)
    sgd = SGDClassifier(loss='log', shuffle= True, n_jobs = -1, warm_start = True, class_weight = "balanced", 
                        alpha= params[0] ,eta0 = params[1], 
                        l1_ratio = params[2], max_iter = params[3],
                        learning_rate= params[4], penalty = params[5]
                       ).fit(X1, Y_train)                     
    stop = time.time()
    Time = stop - start
    TrainCV = cross_val_score(sgd, X1, Y_train, cv=KF, scoring = "f1", n_jobs = -1)
    train_F1Score = f1_score(Y_train, sgd.predict(X1))
    train_ac = sgd.score(X1, Y_train)
    Train_CM = confusion_matrix(Y_train, sgd.predict(X1))
    
    TestCV = cross_val_score(sgd, X2, Y_test, cv=KF, scoring = "f1", n_jobs = -1)
    test_F1Score = f1_score(Y_test, sgd.predict(X2))
    tes_ac = sgd.score(X2, Y_test) 
    Test_CM = confusion_matrix(Y_test, sgd.predict(X2))
    
    return TrainCV, train_F1Score , train_ac, Train_CM, TestCV, test_F1Score, tes_ac, Test_CM, Time

In [None]:
def scaler(x):
    X = MinMaxScaler().fit_transform(x)
    return X

In [None]:
# Logistic Regression SGD - GridSearch
pipeline = Pipeline([('model',SGDClassifier(loss='log', shuffle=True, n_jobs = -1, warm_start= True,class_weight = "balanced",
                                            random_state = 42, eta0 = 0.0001))])
X= scaler(X_train)
# pipeline.get_params().keys()
param_grid = {'model__l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8 ],
                  "model__max_iter": [1000, 2000, 3000, 4000],
                  "model__penalty": ["elasticnet", "l1", "l2"],
                  "model__alpha" : [0.1, 0.01, 0.001, 0.0001,0.5, 0.005, 0.00005],
                  "model__warm_start": [True, False],
                  "model__eta0": [0.1, 0.001, 0.01, 0.0001, 0.5, 0.005, 0.00005],
                 "model__learning_rate" : ["constant", "optimal", "invscaling", "adaptive"]},

search = RandomizedSearchCV(
    estimator = pipeline,
    param_distributions = param_grid,
    n_iter = 100, 
    return_train_score = True,
    n_jobs= -1,
    scoring="f1",
    cv = KF,
    verbose=3)
search = search.fit(X, Y_train)
print(search.best_params_, search.best_score_)

In [None]:
SGD_LOG(svm_train, Y_SVM_train,svm_test, Y_SVM_test,  params = [0.0001, 0.1, 0.1, 1000, "optimal", "elasticnet"])

In [180]:
# SelectK - Chi2 
pipeline = Pipeline([('selector',SelectKBest(chi2)),
                     ('model',SGDClassifier(loss='log', shuffle=True,n_jobs = -1, 
                                            random_state = 42, eta0 = 0.0001, warm_start = True))])  
                                                                                    
# pipeline.get_params().keys()
X= scaler(X_train)
param_grid = {'selector__k': range(50, 129, 10),
              'model__l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.7],
              "model__max_iter": [1000, 2000, 10000],
              "model__penalty": ["elasticnet", "l1", "l2"],
              "model__alpha" : [0.1, 0.01, 0.001, 0.0001],
              "model__warm_start": [True, False],
              "model__eta0": [0.1, 0.001, 0.0001],
              "model__learning_rate" : ["constant", "optimal", "invscaling", "adaptive"]},

search = RandomizedSearchCV(
    estimator = pipeline,
    param_distributions = param_grid,
    n_iter = 500, 
    return_train_score = True,
    n_jobs= -1,
    scoring="f1",
    cv = KF,
    verbose=3)
search = search.fit(X, Y_train)
print(search.best_params_, search.best_score_)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
{'selector__k': 110, 'model__warm_start': True, 'model__penalty': 'elasticnet', 'model__max_iter': 2000, 'model__learning_rate': 'constant', 'model__l1_ratio': 0.3, 'model__eta0': 0.001, 'model__alpha': 0.01} 0.8344969718972861


In [202]:
chi_train = SelectKBest(chi2, k = 110).fit_transform(X_train, Y_train)
chi_test = SelectKBest(chi2, k = 110).fit_transform(X_test, Y_test)
SGD_LOG(chi_train, Y_train, chi_test, Y_test, params = [0.1, 0.001, 0.3, 2000, "constant", "elasticnet"])
# Overfitting

(array([0.8329343 , 0.80411774, 0.8166582 , 0.78063273, 0.76174656]),
 0.8360889448445686,
 0.8298557947942625,
 array([[61801, 16053],
        [10506, 67737]]),
 array([0.8481305 , 0.8119106 , 0.77785641, 0.78343506, 0.81221601]),
 0.8369290629809228,
 0.8321051136788293,
 array([[26844,  6800],
        [ 4432, 28823]]),
 1.0061140060424805)

In [182]:
# SelectK - F_class
pipeline = Pipeline([('selector',SelectKBest(f_classif)),
                     ('model',SGDClassifier(loss='log', shuffle=True,n_jobs = -1, 
                                            random_state = 42, eta0 = 0.0001, warm_start = True))])  
                                                                                    
# pipeline.get_params().keys()
X= scaler(X_train)
param_grid = {'selector__k': range(50, 129, 10),
              'model__l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.7],
              "model__max_iter": [1000, 2000, 10000],
              "model__penalty": ["elasticnet", "l1", "l2"],
              "model__alpha" : [0.1, 0.01, 0.001, 0.0001],
              "model__warm_start": [True, False],
              "model__eta0": [0.1, 0.001, 0.0001],
              "model__learning_rate" : ["constant", "optimal", "invscaling", "adaptive"]},

search = RandomizedSearchCV(
    estimator = pipeline,
    param_distributions = param_grid,
    n_iter = 500, 
    return_train_score = True,
    n_jobs= -1,
    scoring="f1",
    cv = KF,
    verbose=3)
search = search.fit(X, Y_train)
print(search.best_params_, search.best_score_)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
{'selector__k': 100, 'model__warm_start': True, 'model__penalty': 'elasticnet', 'model__max_iter': 2000, 'model__learning_rate': 'constant', 'model__l1_ratio': 0.2, 'model__eta0': 0.1, 'model__alpha': 0.0001} 0.8368543823616132


In [200]:
f_train, f_test = SelectKBest(f_classif, k = 100).fit_transform(X_train, Y_train), SelectKBest(f_classif, k = 100).fit_transform(X_test, Y_test)
SGD_LOG(f_train, Y_train, f_test, Y_test, params = [0.01, 0.1, 0.2, 2000, "constant", "elasticnet"])

(array([0.83661399, 0.80812802, 0.8359965 , 0.79868352, 0.78101872]),
 0.8432354780584159,
 0.8378123858882618,
 array([[62690, 15164],
        [10153, 68090]]),
 array([0.74530605, 0.7943662 , 0.85370242, 0.7984639 , 0.86133812]),
 0.8364089045108378,
 0.833031883884662,
 array([[27174,  6470],
        [ 4700, 28555]]),
 0.9215970039367676)

In [203]:
# SelectK - MI
pipeline = Pipeline([('selector',SelectKBest(mutual_info_classif)),
                     ('model',SGDClassifier(loss='log', shuffle=True,n_jobs = -1, 
                                            random_state = 42, eta0 = 0.0001, warm_start = True))])  
                                                                                    
# pipeline.get_params().keys()
X= scaler(X_train)
param_grid = {'selector__k': range(50, 129, 10),
              'model__l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.7],
              "model__max_iter": [1000, 2000, 10000],
              "model__penalty": ["elasticnet", "l1", "l2"],
              "model__alpha" : [0.1, 0.01, 0.001, 0.0001],
              "model__warm_start": [True, False],
              "model__eta0": [0.1, 0.001, 0.0001],
              "model__learning_rate" : ["constant", "optimal", "invscaling", "adaptive"]},

search = RandomizedSearchCV(
    estimator = pipeline,
    param_distributions = param_grid,
    n_iter = 10, 
    return_train_score = True,
    n_jobs= -1,
    scoring="f1",
    cv = KF,
    verbose=3)
search = search.fit(X, Y_train)
print(search.best_params_, search.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'selector__k': 120, 'model__warm_start': True, 'model__penalty': 'l1', 'model__max_iter': 1000, 'model__learning_rate': 'constant', 'model__l1_ratio': 0.1, 'model__eta0': 0.0001, 'model__alpha': 0.01} 0.889971955405548


In [204]:
MI_train, MI_test = SelectKBest(mutual_info_classif, k = 120).fit_transform(X_train, Y_train), SelectKBest(mutual_info_classif, k = 120).fit_transform(X_test, Y_test)
SGD_LOG(MI_train, Y_train, MI_test, Y_test, params = [0.01, 0.0001, 0.1, 1000, "constant", "l1"])

(array([0.88919756, 0.88890813, 0.88985031, 0.89083416, 0.89106962]),
 0.8900444567295511,
 0.877678622907551,
 array([[59724, 18130],
        [  964, 77279]]),
 array([0.8852323 , 0.88261253, 0.88681447, 0.87964626, 0.88807588]),
 0.8897381688079361,
 0.8783838323442802,
 array([[25937,  7707],
        [  429, 32826]]),
 3.9936108589172363)

In [185]:
# RFE - Orginal model
estimator = SGDClassifier(loss='log', shuffle=True, penalty = "l1", n_jobs = -1, random_state = 42, alpha = 0.0001,
                    learning_rate="optimal", warm_start = True, eta0 = 0.1, l1_ratio= 0.1, max_iter= 1000)
selector = RFECV(estimator, step = 1, cv = KF, n_jobs = -1, scoring = "f1").fit(scaler(X_train), Y_train)
print(selector.n_features_, selector.ranking_)

27 [  1   1  25   1  15  19  17   1   8  26  10  31  30  14  32  38  40  42
  16  46  33  53  52  50   1  61   1  60  18  64  20   1  24  72  45  58
  59  65  49   1   1  34  66  94  68   1  74  76  78  80  82  73   3  95
  86  88  92  93  91  90  96  98 100 102 104 105   1   1   1   1   1   1
   1  27  28   1  11  21   1   1   1   1  22   6   4   5  35  44  23   1
  51  12  55  37  39   1  43  48  75  69   1  56  41  70  47  77   9  67
  54  13  62   1   1  79  81  83  84  63  71  87  85  89  57   2  29  36
   7  97  99 101 103]


In [205]:
RFE = pd.DataFrame(selector.support_, index = X_train.columns).rename(columns = {0:"selected"})  
RFE_train, RFE_test = X_train[RFE[RFE.selected == True].index], X_test[RFE[RFE.selected == True].index]
SGD_LOG(RFE_train, Y_train, RFE_test, Y_test, params = [0.0001, 0.1, 0.1, 1000, "optimal", "l1"])

(array([0.89069626, 0.89233859, 0.89336479, 0.8933428 , 0.8940521 ]),
 0.8926145912470934,
 0.8804717579453801,
 array([[59894, 17960],
        [  698, 77545]]),
 array([0.89227182, 0.88948319, 0.89623087, 0.88881328, 0.89324824]),
 0.8920757990620001,
 0.8809847680832299,
 array([[26031,  7613],
        [  349, 32906]]),
 1.1994130611419678)

In [186]:
# Select from moel - logsitic
Model_sel = SelectFromModel(estimator = estimator).fit(scaler(X_train), Y_train)
Model_selected = pd.DataFrame(Model_sel.get_support(), index = X_train.columns).rename(columns = {0: "selected"})
ms_train, ms_test = X_train[Model_selected[Model_selected.selected == True].index], X_test[Model_selected[Model_selected.selected == True].index]

In [206]:
SGD_LOG(ms_train, Y_train, ms_test, Y_test, params = [0.0001, 0.1, 0.1, 1000, "optimal", "l1"])

(array([0.89160819, 0.8914523 , 0.89385507, 0.89474278, 0.89409667]),
 0.8930819057261525,
 0.8809330096030032,
 array([[59887, 17967],
        [  619, 77624]]),
 array([0.89559731, 0.8898702 , 0.89617412, 0.89087463, 0.89361416]),
 0.8930240117011335,
 0.8819264861956083,
 array([[26030,  7614],
        [  285, 32970]]),
 0.6216330528259277)

In [15]:
59887/(59887 + 17967)

0.7692218768464049

### SVM

In [58]:
from sklearn.kernel_approximation import Nystroem
def Kernel_tranf(data):
    num_features = data.shape[1]
    data = MinMaxScaler().fit_transform(data)
    NEW = Nystroem(random_state = 42, n_jobs = -1, n_components = num_features, gamma = (1/num_features)).fit_transform(data)
    return NEW

In [59]:
# Setup 
KF = KFold(5, shuffle = True, random_state= 42)

def SGD_SVM(X_train, Y_train, X_test, Y_test, params = [0.2,0.1, 0.55, 5000, "elasticnet"]):
    start = time.time()
    X1 = Kernel_tranf(X_train)
    X2 = Kernel_tranf(X_test)
    sgd = SGDClassifier(loss='hinge', shuffle= True, n_jobs = -1, random_state = 42, warm_start = True,  
                        alpha= params[0] ,eta0 = params[1], 
                        l1_ratio = params[2], max_iter = params[3],
                        learning_rate= params[4], penalty = params[5]
                       ).fit(X1, Y_train)                     
    stop = time.time()
    Time = stop - start
    TrainCV = cross_val_score(sgd, X1, Y_train, cv=KF, scoring = "f1", n_jobs = -1)
    train_F1Score = f1_score(Y_train, sgd.predict(X1))
    train_ac = sgd.score(X1, Y_train)
    Train_CM = confusion_matrix(Y_train, sgd.predict(X1))
    
    TestCV = cross_val_score(sgd, X2, Y_test, cv=KF, scoring = "f1", n_jobs = -1)
    test_F1Score = f1_score(Y_test, sgd.predict(X2))
    tes_ac = sgd.score(X2, Y_test) 
    Test_CM = confusion_matrix(Y_test, sgd.predict(X2))
    
    return TrainCV, train_F1Score, train_ac, Train_CM, TestCV, test_F1Score, tes_ac, Test_CM, Time

In [67]:
# SVM SGD - GridSearch
# svm_train, svm_test, Y_SVM_train, Y_SVM_test


pipeline = Pipeline([('model', LinearSVC(random_state = 42, dual))])

X = scaler(svm_train)
# pipeline.get_params().keys()
param_grid = {'model__C': [0.01, 0.01, 1, 10],
              "model__max_iter": [1000, 2000, 5000],
              "model__penalty": ["l1", "l2"],
             },

search = RandomizedSearchCV(
    estimator = pipeline,
    param_distributions = param_grid,
    n_iter = 50, 
    return_train_score = True,
    n_jobs= -1,
    scoring="f1",
    cv = KF,
    verbose=3)
search = search.fit(X, Y_SVM_train)
print(search.best_params_, search.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
{'model__max_iter': 1000, 'model__C': 10} 0.6450826467355524


In [60]:
SGD_SVM(svm_train, Y_SVM_train, svm_test, Y_SVM_test, params = [1, 0.001, 0.1, 1000, "invscaling", "l2"])

(array([0.6654983 , 0.66609699, 0.666567  , 0.68027867, 0.66628217]),
 0.6693772645024864,
 0.5071974477408279,
 array([[ 1301, 76553],
        [  372, 77871]]),
 array([0., 0., 0., 0., 0.]),
 0.6640773209257743,
 0.4970926321768636,
 array([[    0, 33644],
        [    0, 33255]]),
 1.8931188583374023)

In [17]:
1301/(1301+76553)

0.01671076630616282

In [226]:
# SelectK - Chi2 
pipeline = Pipeline([('selector',SelectKBest(chi2)),
                     ('model',SGDClassifier(loss='hinge', shuffle=True,n_jobs = -1, 
                                            random_state = 42, eta0 = 0.0001, warm_start = True))])  
                                                                                    
# pipeline.get_params().keys()
X= scaler(svm_train)
param_grid = {'selector__k': range(50, 129, 10),
              'model__l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.7],
              "model__max_iter": [1000, 2000, 10000],
              "model__penalty": ["elasticnet", "l1", "l2"],
              "model__alpha" : [0.1, 0.01, 0.001, 0.0001],
              "model__warm_start": [True, False],
              "model__eta0": [0.1, 0.001, 0.0001],
              "model__learning_rate" : ["constant", "optimal", "invscaling", "adaptive"]},

search = RandomizedSearchCV(
    estimator = pipeline,
    param_distributions = param_grid,
    n_iter = 500, 
    return_train_score = True,
    n_jobs= -1,
    scoring="f1",
    cv = KF,
    verbose=3)
search = search.fit(X, Y_SVM_train)
print(search.best_params_, search.best_score_)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
{'selector__k': 110, 'model__warm_start': True, 'model__penalty': 'l1', 'model__max_iter': 10000, 'model__learning_rate': 'invscaling', 'model__l1_ratio': 0.1, 'model__eta0': 0.0001, 'model__alpha': 0.1} 0.6677650108854161


In [232]:
# svm_train, Y_SVM_train, svm_test, Y_SVM_test
chi_svm = SelectKBest(chi2, k = 110).fit_transform(svm_train, Y_SVM_train)
chi_svm_test = SelectKBest(chi2, k = 110).fit_transform(svm_test, Y_SVM_test)
SGD_SVM(chi_svm, Y_SVM_train, chi_svm_test, Y_SVM_test, params = [0.1, 0.0001, 0.1, 10000, "invscaling", "l1"])

(array([0.6654983 , 0.66609699, 0.666567  , 0.67438059, 0.66628217]),
 0.6677733208159086,
 0.5012460201028848,
 array([[    0, 77854],
        [    0, 78243]]),
 array([0., 0., 0., 0., 0.]),
 0.6640773209257743,
 0.4970926321768636,
 array([[    0, 33644],
        [    0, 33255]]),
 0.9616677761077881)

In [227]:
# SelectK - F_class
pipeline = Pipeline([('selector',SelectKBest(f_classif)),
                     ('model',SGDClassifier(loss='hinge', shuffle=True,n_jobs = -1, 
                                            random_state = 42, eta0 = 0.0001, warm_start = True))])  
                                                                                    
# pipeline.get_params().keys()
X= scaler(svm_train)
param_grid = {'selector__k': range(50, 129, 10),
              'model__l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.7],
              "model__max_iter": [1000, 2000, 10000],
              "model__penalty": ["elasticnet", "l1", "l2"],
              "model__alpha" : [0.1, 0.01, 0.001, 0.0001],
              "model__warm_start": [True, False],
              "model__eta0": [0.1, 0.001, 0.0001],
              "model__learning_rate" : ["constant", "optimal", "invscaling", "adaptive"]},

search = RandomizedSearchCV(
    estimator = pipeline,
    param_distributions = param_grid,
    n_iter = 500, 
    return_train_score = True,
    n_jobs= -1,
    scoring="f1",
    cv = KF,
    verbose=3)
search = search.fit(X, Y_SVM_train)
print(search.best_params_, search.best_score_)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
{'selector__k': 50, 'model__warm_start': False, 'model__penalty': 'l1', 'model__max_iter': 1000, 'model__learning_rate': 'invscaling', 'model__l1_ratio': 0.7, 'model__eta0': 0.1, 'model__alpha': 0.1} 0.6677650108854161


In [None]:
f_train, f_test = SelectKBest(f_classif, k = 50).fit_transform(svm_train, Y_SVM_train), SelectKBest(f_classif, k = 50).fit_transform(svm_test, Y_SVM_test)
SGD_SVM(f_train, Y_SVM_train, f_test, Y_SVM_test, params = [0.1, 0.1, 0.7, 2000, "invscaling", "l1"])

In [228]:
# SelectK - MI
pipeline = Pipeline([('selector',SelectKBest(mutual_info_classif)),
                     ('model',SGDClassifier(loss='hinge', shuffle=True,n_jobs = -1, 
                                            random_state = 42, eta0 = 0.0001, warm_start = True))])  
                                                                                    
# pipeline.get_params().keys()
X= scaler(svm_train)
param_grid = {'selector__k': range(50, 129, 10),
              'model__l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.7],
              "model__max_iter": [1000, 2000, 10000],
              "model__penalty": ["elasticnet", "l1", "l2"],
              "model__alpha" : [0.1, 0.01, 0.001, 0.0001],
              "model__warm_start": [True, False],
              "model__eta0": [0.1, 0.001, 0.0001],
              "model__learning_rate" : ["constant", "optimal", "invscaling", "adaptive"]},

search = RandomizedSearchCV(
    estimator = pipeline,
    param_distributions = param_grid,
    n_iter = 10, 
    return_train_score = True,
    n_jobs= -1,
    scoring="f1",
    cv = KF,
    verbose=3)
search = search.fit(X, Y_SVM_train)
print(search.best_params_, search.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'selector__k': 50, 'model__warm_start': True, 'model__penalty': 'l1', 'model__max_iter': 1000, 'model__learning_rate': 'optimal', 'model__l1_ratio': 0.4, 'model__eta0': 0.0001, 'model__alpha': 0.001} 0.6590673329805824


In [49]:
MI_train, MI_test = SelectKBest(mutual_info_classif, k = 50).fit_transform(svm_train, Y_SVM_train), SelectKBest(mutual_info_classif, k = 50).fit_transform(svm_test, Y_SVM_test)
SGD_LOG(MI_train, Y__SVM_train, MI_test, Y__SVM_test, params = [0.001, 0.0001, 0.4, 1000, "optimal", "l1"])

In [246]:
# RFE - Orginal model
estimator = SGDClassifier(loss='hinge', shuffle=True, penalty = "l1", n_jobs = -1, random_state = 42, alpha = 0.0001,
                    learning_rate="optimal", warm_start = True, eta0 = 0.1, l1_ratio= 0.1, max_iter= 1000)
selector = RFECV(estimator, step = 1, cv = KF, n_jobs = -1, scoring = "f1").fit(scaler(svm_train), Y_SVM_train)
print(selector.n_features_, selector.ranking_)

3 [154  15  48  18  47  46   5  10   8  30  29  53  52  55  59  54  72  17
  74  73  16  82  83  85  86  66  27  87  12  24  99 102 105 106 104  13
 108 110  56 112   7  50 113  22  25 120  49  35 119 117   9  14  63 130
  65 143 129  19 135 136 141 142 138 139  67  76  88  90  92  94  23 122
 124   6 146  20  11 152 153 151  21  78  77  96  93  97  98  75 103 101
 100  80   1   2   3   1   1  40 107  41 115  84  79 116  69  36   4  57
  34  31  28  45 140  51 132  38 123 144  81  42  44  91 109  95 133  43
  89 148 114 118 131  39 147 145  70  26 121 111  32  33 134  58  60  61
  62  64  68  71  37 125 126 127 128 137 149 150]


In [248]:
RFE = pd.DataFrame(selector.support_, index = svm_train.columns).rename(columns = {0:"selected"})  
RFE_train, RFE_test = svm_train[RFE[RFE.selected == True].index], svm_test[RFE[RFE.selected == True].index]
SGD_SVM(RFE_train, Y_SVM_train, RFE_test, Y_SVM_test, params = [0.001, 0.0001, 0.4, 1000, "optimal", "l1"])

(array([0.60842158, 0.60805373, 0.60562561, 0.61390492, 0.60021091]),
 0.6072568908367504,
 0.5453275847710077,
 array([[30255, 47599],
        [23374, 54869]]),
 array([0.60195185, 0.60874049, 0.61277419, 0.59785696, 0.61006733]),
 0.6062913864542862,
 0.5440739024499619,
 array([[12913, 20731],
        [ 9770, 23485]]),
 0.14113712310791016)