In [1]:
global_parameters = {
    # 날짜별 에러 정보를 유저 아이디로 취합할 때 사용할 quantile 값
    'quantile': [0.10, 0.25, 0.35, 0.75, 0.80, 0.90],
    
    # LGBM 앙상블 학습시 사용할 fold 개수
    'nfold': 10,
    
    # STMOE 알고리즘에 대해 멀티 프로세싱을 사용할지 여부, (workers.py 파일이 존재해야함)
    'multiprocessing_for_smote': True,
    
    # 데이터 세트에 x개 이상 정보가 에러타입_코드 칼럼만 사용
    'min_errcode_sample': 50,
    
    # pearson 상관계수를 이용하여 x 값 이상인 칼럼을 제거함 (1을 초과할 경우 작동하지 않음)
    'pearson_cutoff': 2,
    
    # lgbm에서 seed를 변경하여 앙상블 모델의 다양성 추가 (기본값1)
    'lgbm_seed_ensemble': 1,
}

# 1. 데이터 로드 및 전처리

In [2]:
from pycaret.classification import *
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
import collections

warnings.filterwarnings(action='ignore')

PATH = '../data/'
def make_days(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    year  = int(x[:4])
    month = int(x[4:6])
    day   = int(x[6:8])
    return (dt.date(year, month, day) - dt.date(2020, 10, 31)).days

def string2num(x):
    # (,)( )과 같은 불필요한 데이터 정제
    x = re.sub(r"[^\-0-9\.]+", '', str(x))
    if x =='':
        return -1
    else:
        return int(float(x))

## 데이터 불러오기

In [3]:
train_err = pd.read_csv(PATH+'train_err_data.csv')
train_err['days'] = train_err['time'].apply(make_days)
test_err = pd.read_csv(PATH+'test_err_data.csv')
test_err['days'] = test_err['time'].apply(make_days)

In [4]:
# 문자열(ex "1,000")로 되어있는 퀄리티 데이터 float 타입으로 변환
train_quality = pd.read_csv(PATH+'train_quality_data.csv')
for i in range(0,13,1):
    train_quality['quality_' + str(i)] = train_quality['quality_' + str(i)].apply(lambda x: string2num(x)).astype(float)
train_quality['days'] = train_quality['time'].apply(make_days)

test_quality = pd.read_csv(PATH+'test_quality_data.csv')
for i in range(0,13,1):
    test_quality['quality_' + str(i)] = test_quality['quality_' + str(i)].apply(lambda x: string2num(x)).astype(float)
test_quality['days'] = test_quality['time'].apply(make_days)

## 퀄리티 데이터로부터 feature 생성

- 퀄리티 정보를 [유저 아이디, 날짜]로 그룹하고 median을 이용해 대표값만 남김
    - shape: (user_id, days, quality_columns)
- 해당 데이터를 다시 유저 아이디별로 통계를 만듦.
    - quality_count: 퀄리티 로그가 발생한 날짜 수
    - quality_sum: 퀄리티 로그의 합 (상황에 따라 sum/count를 이용하여 평균값을 구할 수 있음)
    - quality_50: 퀄리티 로그의 median 값

In [5]:
# [user_id, group]
quality_user_id = {}
for i in range(10000,45000):
    quality_user_id[i] = []
    for j in range(0,13):
        quality_user_id[i].append([])
        
for i, group in train_quality[train_quality.columns.difference(['time'])].groupby(['user_id']):
    for j in range(0,13):
        quality_user_id[i][j] = group.loc[group['quality_'+str(j)] != -1]['quality_'+str(j)].values
        
for i, group in test_quality[test_quality.columns.difference(['time'])].groupby(['user_id']):
    for j in range(0,13):
        quality_user_id[i][j] = group.loc[group['quality_'+str(j)] != -1]['quality_'+str(j)].values
        
quality_columns_preset =  ['quality_count'] + ['quality_sum'] + ['quality_mean']  + ['quality_std'] + ['quality_max']
quality_user_id_statistics = {}
for i in quality_user_id:
    quality_user_id_statistics[i] = []
    for j in range(0,13):
        a = np.array(quality_user_id[i][j])
        if (len(a) == 0):
            quality_user_id_statistics[i] += [0] + [-999] * (len(quality_columns_preset) - 1)
        else: 
            quality_user_id_statistics[i] += [len(a), a.sum(axis=0), a.mean(axis=0), a.std(axis=0), a.max(axis=0)]
quality_columns = []
quality_columns_index = {}
for i in range(0,13):
    for j in quality_columns_preset:
        # 어차피 이미 만들어진 퀄리티 데이터에 접근하지 않으니 index가 필요없음
        # quality_columns_index[column_name] = len(quality_columns)
        quality_columns.append(j + '_' + str(i))

In [6]:
quality_list = {
    '05.15.2138': [['1', 'std'], ['5', 'mean'], ['5', 'max'], 
                   ['6', 'mean'], ['6', 'std'], ['7', 'mean'],
                   ['7', 'max'], ['7', 'std'], ['10', 'min'],
                   ['10', 'mean'], ['10', 'std']],
    
    '04.22.1750': [['1', 'max'], ['1', 'std'], ['5', 'mean'], 
                   ['5', 'max'], ['5', 'std'], ['6', 'mean'],
                   ['7', 'min'], ['7', 'mean'], ['8', 'std'],
                   ['10', 'mean'], ['10', 'max'], ['10', 'std']],
                  
    
    '04.33.1261': [['7', 'max'], ['7', 'std'], ['10', 'min'], 
                   ['10', 'mean'], ['10', 'max'], ['12', 'std']],
    
    '04.16.3553': [['1', 'max'], ['5', 'max'], ['6', 'mean'], 
                   ['7', 'mean'], ['7', 'std'], ['8', 'max'],
                  ['10', 'max'], ['10', 'std'], ['12', 'max']],
                   
    '03.11.1167': [['1', 'max'], ['5', 'mean'], ['5', 'std'], 
                   ['7', 'min'], ['8', 'min'], ['8', 'max'],
                  ['8', 'std'], ['10', 'mean'], ['11', 'std']],
                  
    '04.33.1185': [['5', 'mean'], ['5', 'max'], ['5', 'std'], 
                   ['7', 'std'], ['8', 'std'], ['10', 'min'],
                  ['10', 'mean'], ['10', 'max'], ['10', 'std'],
                  ['11', 'max'], ['12', 'mean'], ['12', 'std']],
    
    
    '04.22.1778': [['6', 'max'], ['7', 'max'], ['7', 'std'], 
                   ['10', 'max'], ['11', 'max'], ['12', 'mean'],
                  ['12', 'max'], ['12', 'std']]
}
'''
for fwver in list(quality_list.keys()):
    quality_list[fwver] = []
    for i in range(0,13):
        for j in ['min', 'mean', 'max', 'std']:
            quality_list[fwver].append([str(i),j])

quality_columns = []
quality_columns_index = {}
'''
old = len(quality_columns)
for fwver in quality_list.keys():
    for i in quality_list[fwver]:
        column_name = fwver + '_quality_' + i[0] + '_' + i[1]
        quality_columns_index[column_name] = len(quality_columns)
        quality_columns.append(column_name)
        
# quality_user_id_statistics = {}
for i in range(10000,45000):
    quality_user_id_statistics[i] += [-999] * (len(quality_columns) - old)

for table in [train_quality, test_quality]:
    for fwver in quality_list.keys():
        data = table.loc[train_quality['fwver'] == fwver]
        for user_id, group in data.groupby('user_id'):
            for i in quality_list[fwver]:
                column_name = fwver + '_quality_' + i[0] + '_' + i[1]
                quality_name = 'quality_' + i[0]
                statistics = i[1]
                value = 0
                if statistics == 'min':
                    value = group[quality_name].min()
                if statistics == 'mean':
                    value = group[quality_name].mean()
                if statistics == 'max':
                    value = group[quality_name].max()
                if statistics == 'std':
                    value = group[quality_name].std()
                quality_user_id_statistics[user_id][quality_columns_index[column_name]] = value


## 에러 데이터로부터 feature 생성
- 에러 타입과 에러 코드를 조합하여 칼럼을 생성하였음. A_B = 에러(타입 A 이면서 코드는 B)
- **날짜별로** 에러 발생 횟수를 카운트 -> (user_id, days, errtype_code)
- **유저별로 에러 발생 횟수에 대한 통계를 적용 -> (user_id, days, statistics_errtype_code)**
    - sum_A_B: 데이터 수집 기간동안 A_B가 발생한 횟수 
    - std_A_B: 데이터 수집 기간동안 A_B가 발생한 횟수의 표준 편차 (값이 높을수록 에러 발생 빈도가 급격하게 변화함)
    - max_A_B: 데이터 수집 시간중 A_B가 가장 많이 발생했던 날의 에러 수.
    - quantileX_A_B: 데이터 수집 시간중 A_B가 X일 이상 최소 value만큼 등장함
        - quantile10_A_B의 값이 2인 경우 A_B 에러가 2회 이상 발생한 경우가 30일 이상임: 모든 날에 걸처 에러가 지속적으로 나타남을 의미함
        - quantile90_A_B의 값이 17인 경우 A_B 에러가 17회 이상 발생한 경우가 3일 이상임: 에러 발생이 순간적으로 많아진 날들에 대해 얼마나 많아졌었는지를 의미함
    - used_days: 에러가 한건이라도 있는 날은 디바이스를 사용한 날이라고 판단하여, 총 몇일동안 디바이스를 사용하였는지 추측
    - model_nm(categorical columns): 해당 유저가 자주 사용한 디바이스
        
- 일부 에러는 오후 11시 ~ 오전 1시처럼 두 날에 걸쳐서 발생하는 경우도 있음. 단순히 날짜별 통계로는 이러한 케이스를 구분할 수 없기때문에 이틀씩 데이터를 연결시킴
    - 1일 데이터 + 2일 데이터
    - 2일 데이터 + 3일 데이터
    - 3일 데이터 + 4일 데이터
    ...

In [7]:
# 33일 넘기는 테스트 데이터 제거
test_err = test_err.loc[test_err['days'] < 33]

In [8]:
# train과 test에 공통적으로 사용되는 에러 코드만 남기기
train_value_count = train_err['errcode'].value_counts()
test_value_count = test_err['errcode'].value_counts()
same_error_code = set(train_value_count.keys()).intersection(set(test_value_count.keys()))
total_value_count = {}
for i in same_error_code.copy():
    value = 0
    if i in train_value_count.keys():
        value += train_value_count[i]
    if i in test_value_count.keys():
        value += test_value_count[i]
    if (value < global_parameters['min_errcode_sample'] * 2):
        same_error_code.remove(i)

In [9]:
# error_type, error_code 매핑
def key_typecode(t,c = None):
    if c == None:
        return "errcode_" + str(t).strip()
    
    return str(t).strip() + '_' + str(c).strip()

columns = []
column_index = {}
count = 0
for i in tqdm(range(0,42)):
    for j in same_error_code:
        key = key_typecode(i, j)
        if key not in columns:
            columns.append(key)
            column_index[key] = count
            count += 1
        key = key_typecode(i)
        if key not in columns:
            columns.append(key)
            column_index[key] = count
            count += 1
            
print("검색된 칼럼 수", len(columns))

100%|█████████████████████████████████████████████████████████████████████████████████| 42/42 [00:00<00:00, 429.84it/s]

검색된 칼럼 수 3318





In [10]:
def remain_fw4(x):
    fw = str(x)
    if len(fw) > 5:
        return fw[0:5]
    return fw

train_err["fwver_4"]  = train_err["fwver"].apply(remain_fw4)
test_err["fwver_4"]  = test_err["fwver"].apply(remain_fw4)

In [11]:
def _ver(x): #04.30.1235
    x = x.split('.')
    if (len(x) != 3):
        return int(x[0])
    return int(x[2])

new_ver_list = {
    'model_0': 1778,
    'model_1': 3571,
    'model_2': 1261,
    'model_3': 2138,
    'model_4': 1167,
    'model_5': 1778,
    'model_6': 10,
    'model_7': 3571,
    'model_8': 2571,
}

def is_ner_ver(x):
    cutoff = new_ver_list[x['model_nm']]
    ver = _ver(x['fwver'])
    return ver >= cutoff

In [12]:
def make_err_table(table, start_user_id, size):    
    id_error = table[['user_id','errtype','errcode', 'days']].values
    error = np.zeros((size, 33, 42 + len(columns)))    
    
    weekend_error = np.zeros((size, 10, 42 + len(columns)))      
    mapping_index = {}
    for i in range(0,33):
        if (i % 7 <= 1):
            mapping_index[i] = int(i / 7) * 2 + i % 7
        else:
            mapping_index[i] = -1
            
    days = np.zeros((size, 33))
    for person_idx, err, code, day in tqdm(id_error):
        user_index = person_idx - start_user_id
        # person_idx - 10000 위치에 person_idx, errtype에 해당하는 error값을 +1
        error[user_index, day, err - 1] += 1
        if mapping_index[day] != -1:
            weekend_error[user_index, mapping_index[day], err - 1] += 1
        days[user_index, day] = 1
        key = key_typecode(err - 1, code)
        if key in column_index:
            error[user_index, day, 42 + column_index[key]] += 1
            if mapping_index[day] != -1:
                weekend_error[user_index, mapping_index[day], 42 + column_index[key]] += 1
           
        # 에러 코드별 합계를 추가하고싶으면 아래 주석 해제
        '''
        key = key_typecode(code)
        if key in column_index:
            error[person_idx - start_user_id, day_key, 42 + column_index[key]] += 1
        '''
       
    error_mix = np.zeros((size, 32, 42 + len(columns)))    
    for i in range(0,size):
        for j in range(32): #0~1, 1~2 .... 65~66
            error_mix[i][j] = np.sum([error[i][j], error[i][j+1]], axis=0)

    error_sum = np.sum(error_mix, axis=1)
    error_std = np.std(error_mix, axis=1)
    error_max = np.max(error_mix, axis=1)
    quantile_arr = np.quantile(error_mix, global_parameters['quantile'], axis=1)
    sum_days = np.sum(days, axis=1).reshape(-1,1)
    
    # weekday_error_sum = np.sum(weekday_error, axis=1)
    # weekday_error_std = np.std(weekday_error, axis=1)
    # weekday_error_max = np.max(weekday_error, axis=1)
    
    weekend_error_sum = np.sum(weekend_error, axis=1)
    weekend_error_std = np.std(weekend_error, axis=1)
    weekend_error_max = np.max(weekend_error, axis=1)
    
    total = np.concatenate([error_sum, error_std, error_max, sum_days] + [i for i in quantile_arr] + 
                           #[weekday_error_sum, weekday_error_std, weekday_error_max] +
                           [weekend_error_sum, weekend_error_std, weekend_error_max]
                           , axis=1)
    quantile_columns = []
    temp_columns = [str(i) for i in range(0,42)] + columns
    for q in global_parameters['quantile']:
        quantile_columns.extend([str(int(q * 100)) + '_' + str(i) for i in temp_columns])
        
    
    result = pd.DataFrame(data=total, columns=
                          ['sum_' + str(i) for i in temp_columns] + 
                          ['std_' + str(i) for i in temp_columns] + 
                          ['max_' + str(i) for i in temp_columns] +
                          ['used_days'] +
                          quantile_columns +
                         
                         # ['weekday_sum_' + str(i) for i in temp_columns] + 
                          #['weekday_std_' + str(i) for i in temp_columns] + 
                          #['weekday_max_' + str(i) for i in temp_columns] +
                         
                          ['weekend_sum_' + str(i) for i in temp_columns] + 
                          ['weekend_std_' + str(i) for i in temp_columns] + 
                          ['weekend_max_' + str(i) for i in temp_columns])
    
    # 에러와 관련이 없는 부가 정보 입력
    
    # quality 데이터 연결
    result['user_id'] = range(start_user_id, start_user_id + size)
    for i in tqdm(range(0,len(quality_columns))):
        result.loc[:, quality_columns[i]] = result['user_id'].apply(lambda x: quality_user_id_statistics[x][i])
    
    user_model={}
    changed_to_model_2 = np.zeros(size)
    fwver_changed_count = np.zeros(size)
    old_ver_days = np.zeros(size)
    new_ver_days = np.zeros(size)
    for user_id, group in table.groupby(['user_id']):
        user_index = user_id - start_user_id
        user_model[user_id] = group.reset_index().loc[0, 'model_nm']
        
        fwver_changed_count[user_index] = len(group.loc[:, ['model_nm', 'fwver']].drop_duplicates()) - 1
            
        if len(group['model_nm'].unique()) == 1:
            # 구버전을 몇일동안 사용했는가?
            # 신버전을 몇일동안 사용했는가?
            days_fwnew = pd.concat([group['days'], group.apply(is_ner_ver, axis=1)], axis=1)
            old_ver_days[user_index] = days_fwnew.loc[days_fwnew[0] == False, 'days'].max()
            new_ver_days[user_index] = 33 - days_fwnew.loc[days_fwnew[0] == True, 'days'].min()
            continue
            
        if group.reset_index().loc[len(group) - 1, 'model_nm'] == 'model_2':
            changed_to_model_2[user_index] = 1
            
    where_are_NaNs = np.isnan(old_ver_days)
    old_ver_days[where_are_NaNs] = 0
    
    where_are_NaNs = np.isnan(new_ver_days)
    new_ver_days[where_are_NaNs] = 0
    
    result["changed_to_model_2"] = changed_to_model_2
    result["fwver_changed_count"] = fwver_changed_count
    
    result["old_ver_days"] = old_ver_days
    result["new_ver_days"] = new_ver_days
    
    user_model[43262] = 'model_1' # 예외 처리
    result["model_nm"] = result["user_id"].apply(lambda x : user_model[x]).astype('category')
    

    fwver_0311_days = np.zeros(size)
    fwver_0515_days = np.zeros(size)
    for n, group in table.groupby(['user_id','fwver_4']):
        user_index = n[0] - start_user_id
        if n[1] == "03.11":
            fwver_0311_days[user_index] = len(group['days'].unique())
        elif n[1] == "05.15":
            fwver_0515_days[user_index] = len(group['days'].unique())
    
    #result['fwver_0311_days'] = fwver_0311_days
    #result['fwver_0515_days'] = fwver_0515_days
    
    '''
    err_term = [30] * size
    for user_id, group in table.groupby('user_id'):
        days = group['days'].unique()
        days = list(days)
        days.append(1)
        days.append(30)
        days.append(31)
        days.append(32)
        days = list(set(days))
        days.sort()
        best = 0
        for i in range(0, len(days) - 1):
            diff = days[i+1] - days[i]
            if (best < diff):
                best = diff
        err_term[user_id - start_user_id] = best
    result['err_term'] = err_term
    '''
    del result['user_id'] # user_id는 학습에 사용되지 않아야하기 때문에 삭제
    return result

train_full = make_err_table(train_err, start_user_id=10000, size=15000)
test = make_err_table(test_err, start_user_id=30000, size=14999)

100%|██████████████████████████████████████████████████████████████████| 16554663/16554663 [00:51<00:00, 319382.12it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 132/132 [05:28<00:00,  2.49s/it]
100%|██████████████████████████████████████████████████████████████████| 16532633/16532633 [00:52<00:00, 313400.04it/s]
100%|██████████████████████████████████████| 132/132 [05:39<00:00,  2.57s/it]


## 각 column의 샘플 개수에 따른 feature_selection

- 에러 타입과 에러 코드의 조합 중 사용되지 않는 조합이 있음.
- 또한 너무 적은 데이터가 있는 column은 삭제하여 오버피팅을 방지함

In [13]:
# 사용되지 않는 타입_코드 칼럼을 제거
#train_test = pd.concat([train_full, test], axis=0)
select_columns = []
for c in tqdm(train_full.columns):
    if str(train_full[c].dtype) == 'category':
        select_columns.append(c)
        continue
        
    if train_full[c].apply(lambda x: 1 if x > 0 else 0).sum() >= global_parameters['min_errcode_sample'] and test[c].apply(lambda x: 1 if x > 0 else 0).sum() >= global_parameters['min_errcode_sample']: # 트레이닝, 테스트 세트에 50개 이상의 행이 있어야함
        select_columns.append(c)

print(len(train_full.columns), '개의 칼럼을', len(select_columns), '개로 압축')
train_full = train_full[select_columns]
test = test[select_columns]
columns = select_columns

100%|█████████████████████████████████| 40458/40458 [02:32<00:00, 264.55it/s]


40458 개의 칼럼을 1431 개로 압축


In [14]:
train_prob = pd.read_csv(PATH+'train_problem_data.csv')
problem = np.zeros(15000)
problem[train_prob.user_id.unique()-10000] = 1 
train_full['problem'] = problem

In [15]:
# train_full 전부 섞어주기
train_full = train_full.iloc[np.random.RandomState(seed=42).permutation(train_full.index)].reset_index(drop=True)
train_full = train_full.iloc[np.random.RandomState(seed=42).permutation(train_full.index)].reset_index(drop=True)

## pearson 연관관계 분석을 통한 feature_selection

In [16]:
if global_parameters['pearson_cutoff'] <= 1:
    org_train_full = train_full
    pearson_table = train_full.corr(method='pearson')
    pearson = []
    for i in train_full.columns:
        for j in train_full.columns:
            if (i >= j):
                continue
            if str(train_full[i].dtype) == 'category' or str(train_full[j].dtype) == 'category':
                continue
            pearson.append((i,j,pearson_table.loc[i,j]))
    pearson = sorted(pearson, key=lambda x: abs(x[2]), reverse=True)

    remove_columns = set()
    fix_columns = set()
    for i in pearson:
        # 이미 제거되었으면 넘기기
        if (i[0] in remove_columns or i[1] in remove_columns):
            continue
        if (abs(i[2]) >= global_parameters['pearson_cutoff']):
            remove_columns.add(i[1])

    print(len(train_full.columns), '개의 칼럼을', len(train_full.columns) - len(remove_columns), '개로 압축')
    train_full = train_full[train_full.columns.difference(list(remove_columns))]
    test = test[test.columns.difference(list(remove_columns))]


In [17]:
#train_full 데이터를 train, valid로 나눔 (valid는 점수 검증용으로)
def make_train_val(data, r=0.8):
    min_user_id = data.index.min()
    max_user_id = data.index.max()
    m = int((1 - r) * min_user_id + r * max_user_id)
    return data.loc[data.index < m], data.loc[data.index >= m]

train, valid = make_train_val(train_full)

In [18]:
def reset_bad_features():
    global bad_features
    bad_features = []

def add_bad_features(col):
    global bad_features
    bad_features.append(col)
    bad_features = list(set(bad_features))
    
reset_bad_features()

# 2. 모델 생성

- LGBM 모델을 사용하여 주어진 데이터를 학습함.
- 앙상블 학습을 통해 예측 성능 향상
- StratifiedKFold를 사용하여 valid set을 만들때 label 비율 유지
- training data에 대해서는 SMOTENC라는 oversampling 기법을 사용하여 데이터의 불균형을 해소
    - SMOTENC는 SMOTE에서 범주형 데이터를 사용할 수 있도록 수정된 알고리즘
- 학습에 충분한 시간이 있다면 global_parameters['lgbm_seed_ensemble']의 값을 늘려 seed의 다양성을 주어 성능을 향상시킬 수 있음

In [19]:
def reset_fold_cache():
    global fold_cache
    fold_cache = {}
reset_fold_cache()

In [20]:
# LGBM + Soft voting ensemble model (sklearn interface)
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
from multiprocessing import Pool 
import workers

class LGBMEnsembleModel(object):
    def __init__(self, params, smote=True, folds=10, early_stopping_rounds=200, random_state=42):
        self.early_stopping_rounds = early_stopping_rounds
        self.params = params
        self.smote = smote
        self.folds = folds
        self.random_state = random_state
        self.models = []
        self.cv_scores = []
            
    def get_params(self, deep=True):
        return {'params': self.params, 'folds': self.folds, 'random_state': self.random_state, 'early_stopping_rounds': self.early_stopping_rounds}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def fit(self, X, y):
        self._default_params()
        self.models = []
        self.cv_scores = []
        
        datas = []
        original_seed = self.random_state
        for i in range(0, global_parameters['lgbm_seed_ensemble']):
            datas += self.load_fold_cache(X, y)
            self.random_state += 10000
        self.random_state = original_seed
        
        for i in datas:
            train_x = i['train_x']
            train_y = i['train_y']
            valid_x = i['valid_x']
            valid_y = i['valid_y']
            d_train = lgb.Dataset(train_x, train_y, silent=True, free_raw_data=False, params={'verbose': -1})
            d_val  = lgb.Dataset(valid_x, valid_y, silent=True, free_raw_data=False, params={'verbose': -1})
            clf = lgb.train(params=self.params, train_set=d_train, valid_sets=[d_train, d_val], verbose_eval=False, early_stopping_rounds=self.early_stopping_rounds)
            val_pred = clf.predict(valid_x)
            auc_score = roc_auc_score(valid_y, val_pred)
            self.cv_scores.append(auc_score)
            self.models.append(clf)
        return self
    
    def predict(self, X):
        y_pred = []
        for model in self.models:
            y_pred.append(model.predict(X))
            
        pred_ensemble = np.mean(y_pred, axis = 0)
        return pred_ensemble

    def score(self, X, y):
        val_pred = self.predict(X)
        auc_score = roc_auc_score(y, val_pred)
        return auc_score
    
    def _default_params(self):
        int_params = ['max_depth', 'num_leaves', 'num_iterations', 'min_data_in_leaf', 'max_bin', 'min_data_in_bin', 'n_estimators']
        for i in int_params:
            if i in self.params:
                self.params[i] = int(self.params[i])
            
        self.params['seed'] = 1015
        self.params['metric'] = 'auc'
        self.params['objective'] = 'binary'
        self.params['num_iterations'] = 1000000
        self.params['verbose'] = -1
        self.params['num_threads'] = 20
        
    def load_fold_cache(self, X, y):
        global fold_cache
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
            
        if isinstance(y, np.ndarray):
            y = pd.DataFrame(y)
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)
        hashcode = str(self.random_state) + str(X.shape) + str(self.folds) + str(X.columns) + str(X.iloc[0]) + pd.util.hash_pandas_object(X).astype(str).sum()
        if hashcode not in fold_cache:
            categorical_features = [X.columns.get_loc(i) for i in X.select_dtypes(exclude=['int', 'int64', 'float']).columns]
            output = []
            folds = StratifiedKFold(n_splits=self.folds, shuffle=True, random_state=self.random_state)
            if global_parameters['multiprocessing_for_smote']:
                task = []
                for train_idx, val_idx in folds.split(X, y):
                    train_x = X.loc[train_idx]
                    train_y = y.loc[train_idx]
                    valid_x = X.loc[val_idx]
                    valid_y = y.loc[val_idx]
                    if len(categorical_features) > 0:
                        smote = SMOTENC(random_state=self.random_state, categorical_features = categorical_features, n_jobs = 20)
                    else:
                        smote = SMOTE(random_state=self.random_state, n_jobs = 20)
                    task.append(({'train_x': train_x,'train_y': train_y, 'valid_x': valid_x, 'valid_y': valid_y}, smote))
                p=Pool(processes = 10) 
                output = p.map(workers.worker,task)
            else:
                for train_idx, val_idx in folds.split(X, y):
                    train_x = X.loc[train_idx]
                    train_y = y.loc[train_idx]
                    valid_x = X.loc[val_idx]
                    valid_y = y.loc[val_idx]
                    smote = SMOTENC(random_state=self.random_state, categorical_features = categorical_features, n_jobs = 20)
                    train_x, train_y = smote.fit_sample(train_x, train_y)
                    output.append({'train_x': train_x,'train_y': train_y, 'valid_x': valid_x, 'valid_y': valid_y})
            fold_cache[hashcode] = output
        return fold_cache[hashcode]

In [21]:
from sklearn.ensemble import RandomForestClassifier
from eli5.sklearn import PermutationImportance

def lgbm_train(data, params = {}, valid=valid):
    if 'boosting_list' in params:
        temp = params['boosting_list']
        for i in temp.keys():
            params[i] = temp[i]
        del(params['boosting_list'])

    x = data[data.columns.difference(['problem'] + bad_features)]
    y = data['problem']
    
    test_x = valid.loc[:, valid.columns.difference(['problem'] + bad_features)]
    test_y = valid['problem']
    clf = LGBMEnsembleModel(params, folds=global_parameters['nfold'])

    clf.fit(x,y)
    return {'model': clf, 'auc': clf.cv_scores, 'test auc': clf.score(test_x, test_y)}

# 3. 스코어 확인용 함수 생성

- 앙상블 러닝에서 이미 kfold를 사용하였기 때문에, CV를 한번 더 적용한 중첩 k-fold를 이용하여 모델 성능을 측정함

In [22]:
def global_score(params = {}, use_tqdm = True):
    scores = []
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    if use_tqdm:
        for train_idx, val_idx in tqdm(folds.split(train_full, train_full['problem']), leave=False):
            scores.append(lgbm_train(train_full.loc[train_idx].reset_index(), params=params, valid=train_full.loc[val_idx].reset_index())['test auc'])
    else:
        for train_idx, val_idx in folds.split(train_full, train_full['problem']):
            scores.append(lgbm_train(train_full.loc[train_idx].reset_index(), params=params, valid=train_full.loc[val_idx].reset_index())['test auc'])
    return np.mean(scores)

In [23]:
# 기본 성능 테스트
import eli5
reset_bad_features()
r = lgbm_train(train)
print("deafult CV - AUC", r['auc'])
print("deafult Test - AUC", r['test auc'])
print("global AUC", global_score())

0it [00:00, ?it/s]

deafult CV - AUC [0.8514482210520059, 0.8327028557661319, 0.8509084821917465, 0.8298868269299966, 0.8446, 0.8323656249999999, 0.8194062499999999, 0.8329625, 0.8131421875, 0.8441290726817043]
deafult Test - AUC 0.8514631252554861


                        

global AUC 0.8430785999999999




# - 하이퍼 파라미터를 튜닝하지 않을 땐 바로 csv 생성 단계로 건너뛰기

# 4. (옵션) 파라미터 튜닝 함수

## PermutationImportance를 통한 bad_features 탐색

- 'model_nm'은 범주형 데이터라 임시로 제외됨
- PermutationImportance 를 통해 변수가 정확도에 얼마나 영향을 주는지 측정할 수 있음
- (-) 값인 경우 해당 columns의 data를 섞었을 때 더 정확도가 높게 나온 것임
- 이 정확도를 기반으로 logspace를 통해 적절한 score cutoff 지점을 찾음

- 결과 파일을 직접 생성하고 싶을 땐 아래 코드 실행

In [70]:
from eli5.sklearn import PermutationImportance
'''
clf = LGBMEnsembleModel(params = {}, folds=global_parameters['nfold'])
train_full_x = train_full[train_full.columns.difference(['problem', 'model_nm'] + bad_features)]
train_full_y = train_full['problem']
perm = PermutationImportance(clf, random_state = 42, n_iter = 10, cv=10).fit(train_full_x, train_full_y)
perm_df = pd.DataFrame()
perm_df['column'] = train_full_x.columns
perm_df['score'] = perm.feature_importances_
perm_df['score_std'] = perm.feature_importances_std_
perm_df.to_csv('perm_df.csv', index=False)
'''

In [95]:
perm_df = pd.read_csv('perm_df.csv')

In [198]:
eli5.show_weights(perm, top = 50, feature_names = train_full_x.columns.tolist())

Weight,Feature
0.0742  ± 0.0157,max_17
0.0141  ± 0.0101,sum_32_2
0.0097  ± 0.0073,max_22_terminate by peer user
0.0088  ± 0.0050,max_29
0.0069  ± 0.0067,10_32_2
0.0055  ± 0.0052,used_days
0.0031  ± 0.0035,max_23
0.0029  ± 0.0037,max_22_standby
0.0029  ± 0.0040,max_11
0.0024  ± 0.0038,max_15


In [None]:
cutoff_space = []
cutoff_result = []
# 만약 1000개의 col이 있으면 log를 통해 cut off 지점을 정함 (ex [1,2,3,5,8,10,25,66,150,450,899])
for i in np.logspace(0, np.log10(len(perm_df)), num=50, endpoint=False, base=10.0)[35:]:
    # i를 날림
    # cutoff 지점에 해당하는 score 값을 입력
    
    cutoff = perm_df.sort_values(by='score')['score'].values[int(i)]
    if last_cutoff == cutoff:
        continue
    else:
        last_cutoff = cutoff
    cutoff_space.append(cutoff)

last_cutoff = 0
for cutoff in tqdm(cutoff_space):
    remove_columns = perm_df.loc[perm_df['score'] < cutoff]['column'].values
    reset_bad_features()
    for col in remove_columns:
        add_bad_features(col)
    score = global_score()
    cutoff_result.append(score)
    print(cutoff, score)

In [193]:
for i in range(0, len(cutoff_result)):
    print(i, cutoff_result[i])

## 하이퍼 파라미터 검색
- hyperout 라이브러리를 사용


In [26]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
save_params = []
def _find_hyper_params(params):
    score = global_score(params, use_tqdm=False)
    save_params.append([score, params])
    return -score
def find_hyper_params(trials, max_evals=10):
    boosting_list = [
        {'boosting': 'gbdt'},
        {'boosting': 'goss', 'top_rate': hp.uniform('top_rate', 0, 0.5), 'other_rate': hp.uniform('other_rate', 0, 0.5)}
    ]

    search_space = {
            #'boosting_list' : hp.choice('boosting_list', boosting_list),
            'num_iterations':  hp.quniform("num_iterations", 100, 1000, 1),
            'max_depth':  hp.quniform("max_depth", 6, 20, 1),
            'num_leaves':  hp.quniform("num_leaves", 20, 150, 1),
            'max_bin': hp.quniform('max_bin', 64, 512, 1),

            'min_data_in_leaf': hp.quniform('min_data_in_leaf', 10, 1000, 1),
            #'min_data_in_bin': hp.quniform('min_data_in_bin', 1, 256, 1),
            #'min_gain_to_split' : hp.quniform('min_gain_to_split', 0.1, 5, 0.01),

            #'lambda_l1' : hp.uniform('lambda_l1', 0, 5),
            #'lambda_l2' : hp.uniform('lambda_l2', 0, 5),
            'n_estimators': hp.quniform('n_estimators', 500, 2000, 1),
            'feature_fraction' : hp.quniform('feature_fraction', 0.7, 1, 0.01),
            'bagging_fraction' : hp.quniform('bagging_fraction', 0.7, 1, 0.01),
            'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)), 
    }

    best=fmin(fn=_find_hyper_params, # function to optimize
              space=search_space, 
              algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
              max_evals=max_evals, # maximum number of iterations
              trials=trials, # logging
              rstate=np.random.RandomState(42) # fixing random state for the reproducibility
             )
                               

    #best['boosting'] = boosting_list[best['boosting_list']]['boosting']
    #del(best['boosting_list'])
    return best

In [27]:
trials = Trials()

In [28]:

reset_bad_features()
n_iters = 200 # 이 횟수동안 파라미터를 변경하면서 최적의 값을 찾음
best_params = find_hyper_params(trials, n_iters)
r = lgbm_train(train, params=best_params)
print("hyperopt CV - AUC", r['auc'])
print("hyperopt Test - AUC", r['test auc'])
print("hyperopt global score - AUC", global_score(best_params))

 12%|████▍                                  | 23/200 [7:13:49<55:38:34, 1131.72s/trial, best loss: -0.8442474000000001]


KeyboardInterrupt: 

In [35]:
sorted(save_params, key=lambda x: x[0], reverse=True)[0]

[0.8442474000000001,
 {'bagging_fraction': 0.77,
  'feature_fraction': 0.87,
  'learning_rate': 0.020031706952922917,
  'max_bin': 280,
  'max_depth': 15,
  'min_data_in_leaf': 113,
  'n_estimators': 1917,
  'num_iterations': 1000000,
  'num_leaves': 55,
  'seed': 1015,
  'metric': 'auc',
  'objective': 'binary',
  'verbose': -1,
  'num_threads': 20}]

# 5. 제출용 CSV 생성
- 사용 가능한 모든 training data를 사용함

In [173]:
'''
remove_columns = perm_df.loc[perm_df['score'] < cutoff_space[11]]['column'].values
reset_bad_features()
for col in remove_columns:
    add_bad_features(col)
'''

In [40]:
sorted(save_params, key=lambda x: x[0], reverse=True)

[[0.8442474000000001,
  {'bagging_fraction': 0.77,
   'feature_fraction': 0.87,
   'learning_rate': 0.020031706952922917,
   'max_bin': 280,
   'max_depth': 15,
   'min_data_in_leaf': 113,
   'n_estimators': 1917,
   'num_iterations': 1000000,
   'num_leaves': 55,
   'seed': 1015,
   'metric': 'auc',
   'objective': 'binary',
   'verbose': -1,
   'num_threads': 20}],
 [0.8441341999999998,
  {'bagging_fraction': 0.76,
   'feature_fraction': 0.8200000000000001,
   'learning_rate': 0.03934802056011543,
   'max_bin': 380,
   'max_depth': 19,
   'min_data_in_leaf': 31,
   'n_estimators': 1760,
   'num_iterations': 1000000,
   'num_leaves': 72,
   'seed': 1015,
   'metric': 'auc',
   'objective': 'binary',
   'verbose': -1,
   'num_threads': 20}],
 [0.844057,
  {'bagging_fraction': 0.79,
   'feature_fraction': 0.92,
   'learning_rate': 0.0286135796693977,
   'max_bin': 250,
   'max_depth': 15,
   'min_data_in_leaf': 323,
   'n_estimators': 1899,
   'num_iterations': 1000000,
   'num_leaves':

In [39]:
sample_submssion = pd.read_csv(PATH+'sample_submission.csv')
submit_model = lgbm_train(train_full, params=sorted(save_params, key=lambda x: x[0], reverse=True)[0][1])['model']

sample_submssion['problem'] = submit_model.predict(test[test.columns.difference(['problem'] + bad_features)])
sample_submssion.to_csv("LGBM.csv", index = False)