In [1]:
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import datasets

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import ( 
  BaggingRegressor, BaggingClassifier, RandomForestRegressor, RandomForestClassifier, AdaBoostRegressor, AdaBoostClassifier
)

import xgboost as xgb
import lightgbm as lgb
# import catboost as cb

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score, f1_score, roc_curve, roc_auc_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import RFE, RFECV

import eli5
from eli5.sklearn import PermutationImportance
import shap


In [2]:
train = pd.read_csv('C:\\Users\\Master\\Desktop\\snb edu_codes\\5. SVM\\train.csv')
train

Unnamed: 0,id,target,0,1,2,3,4,5,6,7,...,290,291,292,293,294,295,296,297,298,299
0,0,1.0,-1.067,-1.114,-0.616,0.376,1.090,0.467,-0.422,0.460,...,0.220,-0.339,0.254,-0.179,0.352,0.125,0.347,0.436,0.958,-0.824
1,1,0.0,-0.831,0.271,1.716,1.096,1.731,-0.197,1.904,-0.265,...,-0.765,-0.735,-1.158,2.554,0.856,-1.506,0.462,-0.029,-1.932,-0.343
2,2,0.0,0.099,1.390,-0.732,-1.065,0.005,-0.081,-1.450,0.317,...,-1.311,0.799,-1.001,1.544,0.575,-0.309,-0.339,-0.148,-0.646,0.725
3,3,1.0,-0.989,-0.916,-1.343,0.145,0.543,0.636,1.127,0.189,...,-1.370,1.093,0.596,-0.589,-0.649,-0.163,-0.958,-1.081,0.805,3.401
4,4,0.0,0.811,-1.509,0.522,-0.360,-0.220,-0.959,0.334,-0.566,...,-0.178,0.718,-1.017,1.249,-0.596,-0.445,1.751,1.442,-0.393,-0.643
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,245,1.0,-0.068,-0.184,-1.153,0.610,0.414,1.557,-0.234,0.950,...,1.492,1.430,-0.333,-0.200,-1.073,0.797,1.980,1.191,1.032,-0.402
246,246,0.0,-0.234,-1.373,-2.050,-0.408,-0.255,0.784,0.986,-0.891,...,-0.996,0.678,1.395,0.714,0.215,-0.537,-1.267,-1.021,0.747,0.128
247,247,0.0,-2.327,-1.834,-0.762,0.660,-0.858,-2.764,-0.539,-0.065,...,-1.237,-0.620,0.670,-2.010,0.438,1.972,-0.379,0.676,-1.220,-0.855
248,248,1.0,-0.451,-0.204,-0.762,0.261,0.022,-1.487,-1.122,0.141,...,0.729,0.411,2.366,-0.021,0.160,0.045,0.208,-2.117,-0.546,-0.093


In [3]:
features = train.loc[:, '0':'299']
target = train['target']

x_train, x_test, y_train, y_test = train_test_split( features, target, test_size=0.2, shuffle=True)

In [4]:
pipeline = Pipeline([
  ('logit', LogisticRegression())
])

pipeline.fit(x_train, y_train)
yhat = pipeline.predict(x_test)
display( roc_auc_score(y_test, yhat) )

0.6448412698412699

In [5]:
pipeline = Pipeline([
  ('scale', StandardScaler()),
  ('logit', LogisticRegression())
])

pipeline.fit(x_train, y_train)
yhat = pipeline.predict(x_test)
display( roc_auc_score(y_test, yhat) )

0.6448412698412699

In [6]:
pipeline = Pipeline([
  ('scale', StandardScaler()),
  ('logit', LogisticRegression())
])

folds = RepeatedStratifiedKFold(n_splits=20, n_repeats=5)
scores = cross_val_score(pipeline, x_train, y_train, cv=folds, scoring='roc_auc')
display( scores.mean() )

pipeline.fit(x_train, y_train)
yhat = pipeline.predict(x_test)
display( roc_auc_score(y_test, yhat) )

0.6373214285714285

0.6448412698412699

In [7]:
folds = StratifiedKFold(n_splits=20, shuffle=True)

In [8]:
model = LogisticRegression(class_weight='balanced', solver='liblinear')
feature_selector = RFECV(model, min_features_to_select=20, cv=folds)
feature_selector = feature_selector.fit(x_train, y_train)

In [9]:
pipeline = Pipeline([
  ('scale', StandardScaler()),
  ('logit', LogisticRegression(class_weight='balanced', solver='liblinear'))
])

x_train_selected = x_train.loc[:, feature_selector.support_ ]
x_test_selected = x_test.loc[:, feature_selector.support_ ]

scores = cross_val_score(pipeline, x_train_selected, y_train, cv=folds, scoring='roc_auc')
display( scores.mean() )

pipeline.fit(x_train_selected, y_train)
yhat = pipeline.predict(x_test_selected)
display( roc_auc_score(y_test, yhat) )

0.7949404761904761

0.7103174603174602

In [10]:
pipeline = Pipeline([
  ('scale', StandardScaler()),
  ('logit', LogisticRegression())
])

x_train_selected = x_train.loc[:, feature_selector.support_ ]
x_test_selected = x_test.loc[:, feature_selector.support_ ]

params = {
    'logit__penalty': ['l1', 'l2'],
    'logit__class_weight': ['balanced', None],
    'logit__C': [ 0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'logit__solver':['liblinear']
}

search = GridSearchCV(pipeline, param_grid=params, scoring='roc_auc', cv=folds)
results = search.fit(x_train_selected, y_train)
display( results.best_params_ )
display( results.best_score_ )

{'logit__C': 1000,
 'logit__class_weight': None,
 'logit__penalty': 'l2',
 'logit__solver': 'liblinear'}

0.8241071428571429

In [11]:
pipeline.set_params(logit__penalty='l2', logit__C=10, logit__class_weight=None, logit__solver='liblinear')
pipeline.fit( x_train_selected, y_train )
yhat = pipeline.predict( x_test_selected )
display( roc_auc_score( y_test, yhat) )

0.6170634920634921