In [1]:
global_parameters = {
    # 날짜별 에러 정보를 유저 아이디로 취합할 때 사용할 quantile 값
    'quantile': [0.10, 0.25, 0.35, 0.75, 0.80, 0.90],
    
    # LGBM 앙상블 학습시 사용할 fold 개수
    'nfold': 10,
    
    # STMOE 알고리즘에 대해 멀티 프로세싱을 사용할지 여부, (workers.py 파일이 존재해야함)
    'multiprocessing_for_smote': True,
    
    # 데이터 세트에 x개 이상 정보가 에러타입_코드 칼럼만 사용
    'min_errcode_sample': 50,
    
    # pearson 상관계수를 이용하여 x 값 이상인 칼럼을 제거함 (1을 초과할 경우 작동하지 않음)
    'pearson_cutoff': 2,
    
    # lgbm에서 seed를 변경하여 앙상블 모델의 다양성 추가 (기본값1)
    'lgbm_seed_ensemble': 1,
}

# 1. 데이터 로드 및 전처리

In [5]:
from pycaret.classification import *
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
import collections

warnings.filterwarnings(action='ignore')

PATH = 'data/'
def make_days(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    year  = int(x[:4])
    month = int(x[4:6])
    day   = int(x[6:8])
    return (dt.date(year, month, day) - dt.date(2020, 10, 31)).days

def string2num(x):
    # (,)( )과 같은 불필요한 데이터 정제
    x = re.sub(r"[^\-0-9\.]+", '', str(x))
    if x =='':
        return -1
    else:
        return int(float(x))

## 데이터 불러오기

In [6]:
train_err = pd.read_csv(PATH+'train_err_data.csv')
train_err['days'] = train_err['time'].apply(make_days)
test_err = pd.read_csv(PATH+'test_err_data.csv')
test_err['days'] = test_err['time'].apply(make_days)

In [7]:
# 문자열(ex "1,000")로 되어있는 퀄리티 데이터 float 타입으로 변환
train_quality = pd.read_csv(PATH+'train_quality_data.csv')
for i in range(0,13,1):
    train_quality['quality_' + str(i)] = train_quality['quality_' + str(i)].apply(lambda x: string2num(x)).astype(float)
train_quality['days'] = train_quality['time'].apply(make_days)

test_quality = pd.read_csv(PATH+'test_quality_data.csv')
for i in range(0,13,1):
    test_quality['quality_' + str(i)] = test_quality['quality_' + str(i)].apply(lambda x: string2num(x)).astype(float)
test_quality['days'] = test_quality['time'].apply(make_days)

In [None]:
train_prob = pd.read_csv(PATH+'train_problem_data.csv')

In [43]:
train_quality['problem'] = 0
for i, rows in train_prob.groupby('user_id'):
    if len(train_quality.loc[train_quality['user_id'] == i]) == 0:
        continue
        
    for j, row in rows.iterrows():
        c = train_quality.loc[(train_quality['user_id'] == i) & (train_quality['time'] < row['time'])]

        if len(c) == 0: # 불만 접수 전 발생할 퀄리티 데이터가 없음
            continue

        if (len(train_quality.loc[train_quality['user_id'] == i, 'fwver'].unique()) == 1):
            train_quality.loc[train_quality['user_id'] == i, 'problem'] = 1
        else:
            
            fwver = c.iloc[len(c)-1]['fwver']
            train_quality.loc[(train_quality['user_id'] == i) & (train_quality['fwver'] == fwver), 'problem'] = 1

In [69]:
re = []
for i, r in train_quality.groupby(['user_id', 'fwver', 'problem']):
    re.append(list(i) + list(r.loc[:, ['quality_' + str(i) for i in range(0,13)]].mean().values) + list(r.loc[:, ['quality_' + str(i) for i in range(0,13)]].std().values))
re = pd.DataFrame(re, columns= ['user_id', 'fwver', 'problem'] + ['mean_quality_' + str(i) for i in range(0,13)] + ['std_quality_' + str(i) for i in range(0,13)])

## 퀄리티 데이터로부터 feature 생성

- 퀄리티 정보를 [유저 아이디, 날짜]로 그룹하고 median을 이용해 대표값만 남김
    - shape: (user_id, days, quality_columns)
- 해당 데이터를 다시 유저 아이디별로 통계를 만듦.
    - quality_count: 퀄리티 로그가 발생한 날짜 수
    - quality_sum: 퀄리티 로그의 합 (상황에 따라 sum/count를 이용하여 평균값을 구할 수 있음)
    - quality_50: 퀄리티 로그의 median 값

In [71]:
train_full = re.loc[:, re.columns.difference(['user_id'])]

In [72]:
train_full['fwver'] = train_full['fwver'].astype('category')

In [74]:
#train_full 데이터를 train, valid로 나눔 (valid는 점수 검증용으로)
def make_train_val(data, r=0.8):
    min_user_id = data.index.min()
    max_user_id = data.index.max()
    m = int((1 - r) * min_user_id + r * max_user_id)
    return data.loc[data.index < m], data.loc[data.index >= m]

train, valid = make_train_val(train_full)

In [75]:
def reset_bad_features():
    global bad_features
    bad_features = []

def add_bad_features(col):
    global bad_features
    bad_features.append(col)
    bad_features = list(set(bad_features))
    
reset_bad_features()

# 2. 모델 생성

- LGBM 모델을 사용하여 주어진 데이터를 학습함.
- 앙상블 학습을 통해 예측 성능 향상
- StratifiedKFold를 사용하여 valid set을 만들때 label 비율 유지
- training data에 대해서는 SMOTENC라는 oversampling 기법을 사용하여 데이터의 불균형을 해소
    - SMOTENC는 SMOTE에서 범주형 데이터를 사용할 수 있도록 수정된 알고리즘
- 학습에 충분한 시간이 있다면 global_parameters['lgbm_seed_ensemble']의 값을 늘려 seed의 다양성을 주어 성능을 향상시킬 수 있음

In [76]:
def reset_fold_cache():
    global fold_cache
    fold_cache = {}
reset_fold_cache()

In [77]:
# LGBM + Soft voting ensemble model (sklearn interface)
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
from multiprocessing import Pool 
import workers

class LGBMEnsembleModel(object):
    def __init__(self, params, smote=True, folds=10, early_stopping_rounds=200, random_state=42):
        self.early_stopping_rounds = early_stopping_rounds
        self.params = params
        self.smote = smote
        self.folds = folds
        self.random_state = random_state
        self.models = []
        self.cv_scores = []
            
    def get_params(self, deep=True):
        return {'params': self.params, 'folds': self.folds, 'random_state': self.random_state, 'early_stopping_rounds': self.early_stopping_rounds}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def fit(self, X, y):
        self._default_params()
        self.models = []
        self.cv_scores = []
        
        datas = []
        original_seed = self.random_state
        for i in range(0, global_parameters['lgbm_seed_ensemble']):
            datas += self.load_fold_cache(X, y)
            self.random_state += 10000
        self.random_state = original_seed
        
        for i in datas:
            train_x = i['train_x']
            train_y = i['train_y']
            valid_x = i['valid_x']
            valid_y = i['valid_y']
            d_train = lgb.Dataset(train_x, train_y, silent=True, free_raw_data=False, params={'verbose': -1})
            d_val  = lgb.Dataset(valid_x, valid_y, silent=True, free_raw_data=False, params={'verbose': -1})
            clf = lgb.train(params=self.params, train_set=d_train, valid_sets=[d_train, d_val], verbose_eval=False, early_stopping_rounds=self.early_stopping_rounds)
            val_pred = clf.predict(valid_x)
            auc_score = roc_auc_score(valid_y, val_pred)
            self.cv_scores.append(auc_score)
            self.models.append(clf)
        return self
    
    def predict(self, X):
        y_pred = []
        for model in self.models:
            y_pred.append(model.predict(X))
            
        pred_ensemble = np.mean(y_pred, axis = 0)
        return pred_ensemble

    def score(self, X, y):
        val_pred = self.predict(X)
        auc_score = roc_auc_score(y, val_pred)
        return auc_score
    
    def _default_params(self):
        int_params = ['max_depth', 'num_leaves', 'num_iterations', 'min_data_in_leaf', 'max_bin', 'min_data_in_bin', 'n_estimators']
        for i in int_params:
            if i in self.params:
                self.params[i] = int(self.params[i])
            
        self.params['seed'] = 1015
        self.params['metric'] = 'auc'
        self.params['objective'] = 'binary'
        self.params['num_iterations'] = 1000000
        self.params['verbose'] = -1
        self.params['num_threads'] = 20
        
    def load_fold_cache(self, X, y):
        global fold_cache
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
            
        if isinstance(y, np.ndarray):
            y = pd.DataFrame(y)
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)
        hashcode = str(self.random_state) + str(X.shape) + str(self.folds) + str(X.columns) + str(X.iloc[0]) + pd.util.hash_pandas_object(X).astype(str).sum()
        if hashcode not in fold_cache:
            categorical_features = [X.columns.get_loc(i) for i in X.select_dtypes(exclude=['int', 'int64', 'float']).columns]
            output = []
            folds = StratifiedKFold(n_splits=self.folds, shuffle=True, random_state=self.random_state)
            if global_parameters['multiprocessing_for_smote']:
                task = []
                for train_idx, val_idx in folds.split(X, y):
                    train_x = X.loc[train_idx]
                    train_y = y.loc[train_idx]
                    valid_x = X.loc[val_idx]
                    valid_y = y.loc[val_idx]
                    if len(categorical_features) > 0:
                        smote = SMOTENC(random_state=self.random_state, categorical_features = categorical_features, n_jobs = 20)
                    else:
                        smote = SMOTE(random_state=self.random_state, n_jobs = 20)
                    task.append(({'train_x': train_x,'train_y': train_y, 'valid_x': valid_x, 'valid_y': valid_y}, smote))
                p=Pool(processes = 10) 
                output = p.map(workers.worker,task)
            else:
                for train_idx, val_idx in folds.split(X, y):
                    train_x = X.loc[train_idx]
                    train_y = y.loc[train_idx]
                    valid_x = X.loc[val_idx]
                    valid_y = y.loc[val_idx]
                    smote = SMOTENC(random_state=self.random_state, categorical_features = categorical_features, n_jobs = 20)
                    train_x, train_y = smote.fit_sample(train_x, train_y)
                    output.append({'train_x': train_x,'train_y': train_y, 'valid_x': valid_x, 'valid_y': valid_y})
            fold_cache[hashcode] = output
        return fold_cache[hashcode]

In [78]:
from sklearn.ensemble import RandomForestClassifier
from eli5.sklearn import PermutationImportance

def lgbm_train(data, params = {}, valid=valid):
    if 'boosting_list' in params:
        temp = params['boosting_list']
        for i in temp.keys():
            params[i] = temp[i]
        del(params['boosting_list'])

    x = data[data.columns.difference(['problem'] + bad_features)]
    y = data['problem']
    
    test_x = valid.loc[:, valid.columns.difference(['problem'] + bad_features)]
    test_y = valid['problem']
    clf = LGBMEnsembleModel(params, folds=global_parameters['nfold'])

    clf.fit(x,y)
    return {'model': clf, 'auc': clf.cv_scores, 'test auc': clf.score(test_x, test_y)}

# 3. 스코어 확인용 함수 생성

- 앙상블 러닝에서 이미 kfold를 사용하였기 때문에, CV를 한번 더 적용한 중첩 k-fold를 이용하여 모델 성능을 측정함

In [79]:
def global_score(params = {}, use_tqdm = True):
    scores = []
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    if use_tqdm:
        for train_idx, val_idx in tqdm(folds.split(train_full, train_full['problem']), leave=False):
            scores.append(lgbm_train(train_full.loc[train_idx].reset_index(), params=params, valid=train_full.loc[val_idx].reset_index())['test auc'])
    else:
        for train_idx, val_idx in folds.split(train_full, train_full['problem']):
            scores.append(lgbm_train(train_full.loc[train_idx].reset_index(), params=params, valid=train_full.loc[val_idx].reset_index())['test auc'])
    return np.mean(scores)

In [80]:
# 기본 성능 테스트
import eli5
reset_bad_features()
r = lgbm_train(train)
print("deafult CV - AUC", r['auc'])
print("deafult Test - AUC", r['test auc'])
print("global AUC", global_score())

0it [00:00, ?it/s]

deafult CV - AUC [0.7018695729222046, 0.6594443365752918, 0.6576659664978024, 0.6542195470672338, 0.7262854570808255, 0.668490635976912, 0.6594090339434804, 0.6601570820021299, 0.6590122470713526, 0.7092385516506923]
deafult Test - AUC 0.6778045858225701


                       

global AUC 0.6810945660991767




In [81]:
from eli5.sklearn import PermutationImportance
clf = LGBMEnsembleModel(params = {}, folds=global_parameters['nfold'])
train_full_x = train_full[train_full.columns.difference(['problem', 'fwver'] + bad_features)]
train_full_y = train_full['problem']
perm = PermutationImportance(clf, random_state = 42, n_iter = 10, cv=10).fit(train_full_x, train_full_y)
perm_df = pd.DataFrame()
perm_df['column'] = train_full_x.columns
perm_df['score'] = perm.feature_importances_
perm_df['score_std'] = perm.feature_importances_std_


In [82]:
eli5.show_weights(perm, top = 50, feature_names = train_full_x.columns.tolist())

Weight,Feature
0.0352  ± 0.0435,std_quality_0
0.0245  ± 0.0358,mean_quality_0
0.0139  ± 0.0341,std_quality_11
0.0110  ± 0.0259,mean_quality_1
0.0095  ± 0.0212,mean_quality_11
0.0094  ± 0.0176,std_quality_8
0.0044  ± 0.0161,mean_quality_10
0.0040  ± 0.0087,mean_quality_8
0.0038  ± 0.0179,std_quality_2
0.0036  ± 0.0120,std_quality_10
