In [1]:
# Data Wrangling
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# Visualization
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
import seaborn as sns
%matplotlib inline

# EDA


# Preprocessing & Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectPercentile
from sklearn import base
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.experimental import enable_iterative_imputer  # still experimental 
from sklearn.impute import IterativeImputer
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel

# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import optuna

# Modeling
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier


# Evaluation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss

# Utility
import os
import time
import random
import sys, warnings
if not sys.warnoptions: warnings.simplefilter("ignore")
from IPython.display import Image

# import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean
from tensorflow import keras

# from bayes_opt import BayesianOptimization

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [2]:
# Load the pre-computed feature tables, one train/test pair per feature family.
# The numeric tables are read with pandas' default encoding; the one-hot and
# w2v tables are read as cp949.
num_features_train, num_features_test = (
    pd.read_csv('../input/hwang_num_features_train.csv'),
    pd.read_csv('../input/hwang_num_features_test.csv'),
)
onehot_features_train, onehot_features_test = (
    pd.read_csv('../input/hwang_onehot_features_train.csv', encoding='cp949'),
    pd.read_csv('../input/hwang_onehot_features_test.csv', encoding='cp949'),
)
w2v_features_train, w2v_features_test = (
    pd.read_csv('../input/hwang_w2v_features_train.csv', encoding='cp949'),
    pd.read_csv('../input/hwang_w2v_features_test.csv', encoding='cp949'),
)

In [3]:
# Raw test table (cp949-encoded); the next cell reads its 'custid' column.
x_te = pd.read_csv('../input/X_test.csv', encoding='cp949')

In [4]:
# Distinct customer ids found in the raw test table, in order of first appearance.
tst_id = pd.unique(x_te['custid'])

In [5]:
# Training labels: the 'group' column of y_train.csv, kept as a Series.
target = pd.read_csv('../input/y_train.csv', encoding='cp949')['group']

In [6]:
# Display (rows, columns) of the numeric training feature table.
num_features_train.shape

(21587, 3293)

In [7]:
# Display (rows, columns) of the one-hot training feature table.
onehot_features_train.shape

(21587, 2309)

In [8]:
# Display (rows, columns) of the w2v training feature table.
w2v_features_train.shape

(21587, 1950)

# Merge All Features

In [9]:
# Place the three feature families side by side (column-wise), in the same
# order for train and test. Column-wise concat aligns rows on the index.
all_features_train = pd.concat(
    objs=[num_features_train, onehot_features_train, w2v_features_train],
    axis='columns',
)
all_features_test = pd.concat(
    objs=[num_features_test, onehot_features_test, w2v_features_test],
    axis='columns',
)

In [10]:
import re

# Swap every comma in a column label for a slash, on both frames, in place.
# NOTE(review): presumably needed because LightGBM rejects feature names that
# contain special JSON characters such as ',' — confirm.
all_features_train.columns = [re.sub(',', '/', label) for label in all_features_train.columns]
all_features_test.columns = [re.sub(',', '/', label) for label in all_features_test.columns]



In [11]:
# Drop the first positional column of each combined frame.
# NOTE(review): presumably this is an id/index column carried over from the
# numeric table — confirm. Only the leading column of the concatenated frame
# is removed; check whether the one-hot and w2v tables carry a similar leading
# column that stays in the feature set.
# NOTE(review): this cell is not idempotent — every re-run without re-running
# the concat cell strips one more leading column.
all_features_train = all_features_train.iloc[:, 1:]
all_features_test = all_features_test.iloc[:, 1:]

In [12]:
# Sanity check: train/test should share a column count and the train row
# count should match the number of labels.
print(*(obj.shape for obj in (all_features_train, all_features_test, target)))

(21587, 7551) (14380, 7551) (21587,)


# Feature Selection

### LGBM

In [13]:
# LightGBM classifier with default hyperparameters and a fixed seed; the same
# object is handed to SelectFromModel and to cross_val_score in later cells.
clf = LGBMClassifier(random_state = 0)

In [14]:
# Feature-selection inputs. These are aliases of the combined frames, not
# copies, so in-place changes to either name affect both.
all_features_train_fs, all_features_test_fs = all_features_train, all_features_test

In [15]:
# Fit the importance-based selector on the full training set. The string
# threshold '3.0*mean' keeps features whose importance is at least three times
# the mean importance.
smf = SelectFromModel(estimator=clf, threshold='3.0*mean').fit(all_features_train_fs, target)
smf

SelectFromModel(estimator=LGBMClassifier(random_state=0), threshold='3.0*mean')

In [16]:
# Reduce train and test to the columns kept by the fitted selector.
X_new, X_te_new = (
    smf.transform(frame) for frame in (all_features_train_fs, all_features_test_fs)
)

In [17]:
# Boolean mask over the input columns (True = kept by the selector), and the
# labels of the kept columns taken from the training frame.
feature_selection_idx = smf.get_support(indices=False)
feature_selection_name = all_features_train_fs.columns[feature_selection_idx]

In [18]:
# Wrap the selected matrices in DataFrames and restore the selected feature
# names as column labels (same labels for train and test).
X_new = pd.DataFrame(X_new).set_axis(feature_selection_name, axis='columns')
X_te_new = pd.DataFrame(X_te_new).set_axis(feature_selection_name, axis='columns')

In [19]:
# 5-fold cross-validation of the classifier on the selected features.
# scoring='neg_log_loss' yields the NEGATED log loss per fold (higher is
# better), so the printed labels name that metric rather than accuracy.
# NOTE(review): the selector was fitted on the full training set before this
# cross-validation, so each validation fold already influenced which features
# were kept; the reported loss is likely optimistic.
scores = cross_val_score(clf, X_new, target, scoring='neg_log_loss', cv=5)
print('교차 검증별 neg_log_loss:', np.round(scores, 4))
print('평균 검증 neg_log_loss:', np.round(np.mean(scores), 4))
# Flip the sign to report the conventional (positive) log loss.
print('log_loss:', -np.mean(scores))

교차 검증별 정확도: [-1.5497 -1.5727 -1.558  -1.5494 -1.5672]
평균 검증 정확도: -1.5594
log_loss: 1.559393426848895


In [20]:
# Persist the selected feature tables to the working directory, without the
# row index.
# NOTE(review): several inputs were read as cp949, while this writes with
# pandas' default encoding — downstream readers should not assume cp949.
X_new.to_csv(path_or_buf='hwang_selected_features_train.csv', index=False)
X_te_new.to_csv(path_or_buf='hwang_selected_features_test.csv', index=False)

***