In [1]:
# Data Wrangling
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# Visualization
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
import seaborn as sns
%matplotlib inline

# EDA
# import klib

# Preprocessing & Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectPercentile
from sklearn import base
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.experimental import enable_iterative_imputer  # still experimental 
from sklearn.impute import IterativeImputer
from sklearn.feature_selection import RFE


# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Modeling
from sklearn.dummy import DummyClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.linear_model import BayesianRidge

# Evaluation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Utility
import os
import time
import random
import sys, warnings
if not sys.warnoptions: warnings.simplefilter("ignore")
from IPython.display import Image
# import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean
from tensorflow import keras

# from bayes_opt import BayesianOptimization

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [2]:
num_features_train = pd.read_csv(os.path.abspath("../input")+"/choi_num_features_train.csv" , encoding = 'utf-8')
num_features_test = pd.read_csv(os.path.abspath("../input")+"/choi_num_features_test.csv" , encoding = 'utf-8')

In [3]:
onehot_features_train = pd.read_csv(os.path.abspath("../input")+'/choi_onehot_features_train.csv' , encoding = 'utf-8')
onehot_features_test = pd.read_csv(os.path.abspath("../input")+'/choi_onehot_features_test.csv' , encoding = 'utf-8')

In [4]:
w2v_features_train = pd.read_csv(os.path.abspath("../input")+'/choi_w2v_features_train.csv' , encoding = 'utf-8')
w2v_features_test = pd.read_csv(os.path.abspath("../input")+'/choi_w2v_features_test.csv' , encoding = 'utf-8')

In [5]:
kmean_train_num = pd.read_csv(os.path.abspath("../input")+'/choi_features_k_train_num.csv' , encoding = 'utf-8')
kmean_test_num = pd.read_csv(os.path.abspath("../input")+'/choi_features_k_test_num.csv' , encoding = 'utf-8')
kmean_train = pd.read_csv(os.path.abspath("../input")+'/choi_features_k_train.csv' , encoding = 'utf-8')
kmean_test = pd.read_csv(os.path.abspath("../input")+'/choi_features_k_test.csv' , encoding = 'utf-8')

## *numeric scaling*

In [6]:
train_id = num_features_train['custid'].unique()
tst_id = num_features_test['custid'].unique()

In [7]:
num_features_train = num_features_train.drop(columns = 'custid')
num_features_test = num_features_test.drop(columns = 'custid')

In [8]:
train_features = [c for c in num_features_train.columns.tolist()]
test_features = [c for c in num_features_test.columns.tolist()]

In [9]:
standardscaler = StandardScaler()

In [10]:
X_train_ss = standardscaler.fit_transform(num_features_train)
X_train_ss = pd.DataFrame(X_train_ss, columns=train_features)

In [11]:
X_test_ss = standardscaler.fit(num_features_train).transform(num_features_test)
X_test_ss = pd.DataFrame(X_test_ss, columns=test_features)

In [12]:
X_train_ss = pd.concat([pd.DataFrame({'custid':train_id}),X_train_ss],axis = 1)
X_test_ss = pd.concat([pd.DataFrame({'custid':tst_id}),X_test_ss],axis = 1)

***

# 동일 컬럼명 제거

In [13]:
kmean_train_num.columns = kmean_train_num.columns.map(lambda x : "km_n_" + str(x))
kmean_test_num.columns = kmean_test_num.columns.map(lambda x : "km_n_" + str(x))
kmean_train.columns = kmean_train.columns.map(lambda x : "km_" + str(x))
kmean_test.columns = kmean_test.columns.map(lambda x : "km_" + str(x))

***

In [14]:
all_features_train =  pd.concat([X_train_ss, onehot_features_train, w2v_features_train, kmean_train_num, kmean_train], axis = 1)
all_features_test =  pd.concat([X_test_ss, onehot_features_test, w2v_features_test, kmean_test_num, kmean_test], axis = 1)

In [15]:
target = pd.read_csv(os.path.abspath("../input")+'/y_train.csv' , encoding = 'cp949').group

In [16]:
# lgbm 에서 json 오류 방지를 위해 특수문자 , 제거

import re
all_features_train.rename(columns = lambda x:re.sub(',', '/', x), inplace = True)
all_features_test.rename(columns = lambda x:re.sub(',', '/', x), inplace = True)

In [17]:
all_features_train = all_features_train.fillna(0.00001)
all_features_test = all_features_test.fillna(0.00001)

***

In [19]:
all_features_train.columns = all_features_train.columns.astype(str)
all_features_test.columns = all_features_test.columns.astype(str)

In [20]:
all_features_train = all_features_train.iloc[:, 1:]
all_features_test = all_features_test.iloc[:, 1:]

***

# Feature Selection

In [21]:
clf = LGBMClassifier(random_state = 1000)

In [22]:
from sklearn.feature_selection import SelectFromModel

In [23]:
all_features_train_fs = all_features_train
all_features_test_fs = all_features_test

In [24]:
smf = SelectFromModel(clf, threshold='3.0*mean')
smf.fit(all_features_train_fs, target)

X_new = smf.transform(all_features_train_fs)
X_te_new = smf.transform(all_features_test_fs)

feature_selection_idx = smf.get_support()
feature_selection_name = all_features_train_fs.columns[feature_selection_idx]

In [25]:
all_features_train_fs.columns[feature_selection_idx]

Index(['19_x', '19_y', 'dis_rate', 'sales_time', '남성', '비남성', '남성part', '비화장품',
       '화장품_x', 'real_amt',
       ...
       'customer_info_v287', 'customer_info_v288', 'customer_info_v290',
       'customer_info_v291', 'customer_info_v293', 'customer_info_v294',
       'customer_info_v296', 'customer_info_v297', 'customer_info_v298',
       'customer_info_v299'],
      dtype='object', length=547)

Index(['19_x', '19_y', 'dis_rate', 'sales_time', '남성', '비남성', '남성part', '비화장품',
       '화장품_x', 'real_amt',
       ...
       'customer_info_v287', 'customer_info_v288', 'customer_info_v290',
       'customer_info_v291', 'customer_info_v293', 'customer_info_v294',
       'customer_info_v296', 'customer_info_v297', 'customer_info_v298',
       'customer_info_v299'],
      dtype='object', length=547)

In [26]:
X_new = pd.DataFrame(X_new)
X_te_new = pd.DataFrame(X_te_new)

X_new.columns = feature_selection_name
X_te_new.columns = feature_selection_name

In [27]:
X_new.to_csv("choi_select_547_train.csv",index = False,encoding = 'utf-8')
X_te_new.to_csv("choi_select_547_test.csv",index = False,encoding = 'utf-8')

In [28]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(clf, X_new, target, scoring='neg_log_loss', cv=5)
print('교차 검증별 정확도:', np.round(scores, 4))
print('평균 검증 정확도:', np.round(np.mean(scores), 4))
print('log_loss:', np.mean(scores))

교차 검증별 정확도: [-1.5083 -1.5382 -1.5056 -1.4992 -1.5094]
평균 검증 정확도: -1.5121
log_loss: -1.5121296472150083


In [29]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1000)
str_scores = cross_val_score(clf, X_new, target, scoring='neg_log_loss', cv=cv)
print('교차 검증별 정확도:', np.round(str_scores, 4))
print('평균 검증 정확도:', np.round(np.mean(str_scores), 4))
print('log_loss:', np.mean(str_scores))

교차 검증별 정확도: [-1.5005 -1.5201 -1.5167 -1.4976 -1.528 ]
평균 검증 정확도: -1.5126
log_loss: -1.512588286895974


***