In [1]:
global_parameters = {
    # 날짜별 에러 정보를 유저 아이디로 취합할 때 사용할 quantile 값
    'quantile': [0.10, 0.25, 0.35, 0.75, 0.80, 0.90],
    
    # LGBM 앙상블 학습시 사용할 fold 개수
    'nfold': 10,
    
    # STMOE 알고리즘에 대해 멀티 프로세싱을 사용할지 여부, (workers.py 파일이 존재해야함)
    'multiprocessing_for_smote': True,
    
    # 데이터 세트에 x개 이상 정보가 에러타입_코드 칼럼만 사용
    'min_errcode_sample': 75,
    
    # pearson 상관계수를 이용하여 x 값 이상인 칼럼을 제거함 (1을 초과할 경우 작동하지 않음)
    'pearson_cutoff': 2,
    
    # lgbm에서 seed를 변경하여 앙상블 모델의 다양성 추가 (기본값1)
    'lgbm_seed_ensemble': 1,
}

# 1. 데이터 로드 및 전처리

In [2]:
from pycaret.classification import *
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
import collections

warnings.filterwarnings(action='ignore')

PATH = '../data/'
def make_days(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    year  = int(x[:4])
    month = int(x[4:6])
    day   = int(x[6:8])
    return (dt.date(year, month, day) - dt.date(2020, 10, 31)).days

def make_hours(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    return int(x[8:10])

def string2num(x):
    # (,)( )과 같은 불필요한 데이터 정제
    x = re.sub(r"[^\-0-9\.]+", '', str(x))
    if x =='':
        return -1
    else:
        return int(float(x))

## 데이터 불러오기

In [3]:
train_err = pd.read_csv(PATH+'train_err_data.csv')
train_err['days'] = train_err['time'].apply(make_days)
train_err['hours'] = train_err['time'].apply(make_hours)
train_err['errcode'] = train_err['errcode'].astype(str)

test_err = pd.read_csv(PATH+'test_err_data.csv')
test_err['days'] = test_err['time'].apply(make_days)
test_err['errcode'] = test_err['errcode'].astype(str)
test_err['hours'] = test_err['time'].apply(make_hours)

In [4]:
# 문자열(ex "1,000")로 되어있는 퀄리티 데이터 float 타입으로 변환
train_quality = pd.read_csv(PATH+'train_quality_data.csv')
for i in range(0,13,1):
    train_quality['quality_' + str(i)] = train_quality['quality_' + str(i)].apply(lambda x: string2num(x)).astype(float)
train_quality['days'] = train_quality['time'].apply(make_days)

test_quality = pd.read_csv(PATH+'test_quality_data.csv')
for i in range(0,13,1):
    test_quality['quality_' + str(i)] = test_quality['quality_' + str(i)].apply(lambda x: string2num(x)).astype(float)
test_quality['days'] = test_quality['time'].apply(make_days)

## 퀄리티 데이터로부터 feature 생성

- 퀄리티 정보를 [유저 아이디, 날짜]로 그룹하고 median을 이용해 대표값만 남김
    - shape: (user_id, days, quality_columns)
- 해당 데이터를 다시 유저 아이디별로 통계를 만듦.
    - quality_count: 퀄리티 로그가 발생한 날짜 수
    - quality_sum: 퀄리티 로그의 합 (상황에 따라 sum/count를 이용하여 평균값을 구할 수 있음)
    - quality_50: 퀄리티 로그의 median 값

In [5]:
# [user_id, group]
quality_user_id = {}
for i in range(10000,45000):
    quality_user_id[i] = []
    for j in range(0,13):
        quality_user_id[i].append([])
        
for i, group in train_quality[train_quality.columns.difference(['time'])].groupby(['user_id']):
    for j in range(0,13):
        quality_user_id[i][j] = group.loc[group['quality_'+str(j)] != -1]['quality_'+str(j)].values
        
for i, group in test_quality[test_quality.columns.difference(['time'])].groupby(['user_id']):
    for j in range(0,13):
        quality_user_id[i][j] = group.loc[group['quality_'+str(j)] != -1]['quality_'+str(j)].values
        
quality_columns_preset =  ['quality_count'] + ['quality_sum'] + ['quality_mean']  + ['quality_var'] + ['quality_max'] + ['quality_std/mean']
quality_user_id_statistics = {}
for i in quality_user_id:
    quality_user_id_statistics[i] = []
    for j in range(0,13):
        a = np.array(quality_user_id[i][j])
        if (len(a) == 0):
            quality_user_id_statistics[i] += [0] + [-999] * (len(quality_columns_preset) - 1)
        else: 
            quality_user_id_statistics[i] += [len(a), a.sum(axis=0), a.mean(axis=0), a.var(axis=0), a.max(axis=0), a.std(axis=0) / a.mean(axis=0)]
quality_columns = []
quality_columns_index = {}
for i in range(0,13):
    for j in quality_columns_preset:
        # 어차피 이미 만들어진 퀄리티 데이터에 접근하지 않으니 index가 필요없음
        # quality_columns_index[column_name] = len(quality_columns)
        quality_columns.append(j + '_' + str(i))

In [6]:
quality_list = {
    '05.15.2138': [['1', 'std'], ['5', 'mean'], ['5', 'max'], 
                   ['6', 'mean'], ['6', 'std'], ['7', 'mean'],
                   ['7', 'max'], ['7', 'std'], ['10', 'min'],
                   ['10', 'mean'], ['10', 'std']],
    
    '04.22.1750': [['1', 'max'], ['1', 'std'], ['5', 'mean'], 
                   ['5', 'max'], ['5', 'std'], ['6', 'mean'],
                   ['7', 'min'], ['7', 'mean'], ['8', 'std'],
                   ['10', 'mean'], ['10', 'max'], ['10', 'std']],
                  
    
    '04.33.1261': [['7', 'max'], ['7', 'std'], ['10', 'min'], 
                   ['10', 'mean'], ['10', 'max'], ['12', 'std']],
    
    '04.16.3553': [['1', 'max'], ['5', 'max'], ['6', 'mean'], 
                   ['7', 'mean'], ['7', 'std'], ['8', 'max'],
                  ['10', 'max'], ['10', 'std'], ['12', 'max']],
                   
    '03.11.1167': [['1', 'max'], ['5', 'mean'], ['5', 'std'], 
                   ['7', 'min'], ['8', 'min'], ['8', 'max'],
                  ['8', 'std'], ['10', 'mean'], ['11', 'std']],
                  
    '04.33.1185': [['5', 'mean'], ['5', 'max'], ['5', 'std'], 
                   ['7', 'std'], ['8', 'std'], ['10', 'min'],
                  ['10', 'mean'], ['10', 'max'], ['10', 'std'],
                  ['11', 'max'], ['12', 'mean'], ['12', 'std']],
    
    
    '04.22.1778': [['6', 'max'], ['7', 'max'], ['7', 'std'], 
                   ['10', 'max'], ['11', 'max'], ['12', 'mean'],
                  ['12', 'max'], ['12', 'std']]
}

quality_list = {
    '05.15.2138': [['0', 'min'], ['1', 'max'], ['7', 'mean'], 
                   ['7', 'max'], ['7', 'cv'], ['8', 'max'],
                   ['8', 'cv']],
    
    '04.22.1750': [['0', 'mean'], ['1', 'var'], ['1', 'cv'], 
                   ['6', 'max'], ['7', 'mean'], ['7', 'cv'],
                   ['10', 'min'], ['11', 'mean']],
                  
    
    '04.33.1261': [['0', 'std'], ['1', 'std'], ['2', 'std'], 
                   ['6', 'max'], ['8', 'max'], ['8', 'cv']],
    
    '04.16.3553': [['1', 'max'], ['5', 'mean'], ['7', 'mean'], 
                   ['10', 'max'], ['10', 'var'], ['12', 'max']],
                   
    '03.11.1167': [['1', 'min'], ['5', 'max'], ['6', 'std'], 
                   ['7', 'mean'], ['7', 'cv'], ['8', 'max'],
                  ['8', 'cv'], ['9', 'mean'], ['10', 'min'],
                  ['10', 'max'], ['11', 'mean'], ['11', 'std'],
                  ['11', 'var']],
                  
    '04.33.1185': [['5', 'mean'], ['5', 'std'], ['5', 'var'], 
                   ['6', 'std'], ['6', 'var'], ['7', 'cv'],
                  ['8', 'max'], ['8', 'cv'], ['10', 'mean'],
                  ['10', 'std'], ['10', 'var'], ['12', 'std']],
    
    
    '04.22.1778': [['5', 'max'], ['7', 'cv'], ['10', 'std']]
}

'''
for fwver in list(quality_list.keys()):
    quality_list[fwver] = []
    for i in range(0,13):
        for j in ['min', 'mean', 'max', 'std']:
            quality_list[fwver].append([str(i),j])

quality_columns = []
quality_columns_index = {}
'''
old = len(quality_columns)
for fwver in quality_list.keys():
    for i in quality_list[fwver]:
        column_name = fwver + '_quality_' + i[0] + '_' + i[1]
        quality_columns_index[column_name] = len(quality_columns)
        quality_columns.append(column_name)
        
# quality_user_id_statistics = {}
for i in range(10000,45000):
    quality_user_id_statistics[i] += [-999] * (len(quality_columns) - old)

for table in [train_quality, test_quality]:
    for fwver in quality_list.keys():
        data = table.loc[train_quality['fwver'] == fwver]
        for user_id, group in data.groupby('user_id'):
            for i in quality_list[fwver]:
                column_name = fwver + '_quality_' + i[0] + '_' + i[1]
                quality_name = 'quality_' + i[0]
                statistics = i[1]
                value = 0
                if statistics == 'min':
                    value = group[quality_name].min()
                if statistics == 'mean':
                    value = group[quality_name].mean()
                if statistics == 'max':
                    value = group[quality_name].max()
                if statistics == 'std':
                    value = group[quality_name].std()
                if statistics == 'var':
                    value = group[quality_name].var()
                if statistics == 'cv':
                    value = group[quality_name].std()
                    if group[quality_name].mean() != 0:
                        value /= group[quality_name].mean()
                quality_user_id_statistics[user_id][quality_columns_index[column_name]] = value
                
                quality_user_id_statistics[user_id][quality_columns_index[column_name]] = value


## 에러 데이터로부터 feature 생성
- 에러 타입과 에러 코드를 조합하여 칼럼을 생성하였음. A_B = 에러(타입 A 이면서 코드는 B)
- **날짜별로** 에러 발생 횟수를 카운트 -> (user_id, days, errtype_code)
- **유저별로 에러 발생 횟수에 대한 통계를 적용 -> (user_id, days, statistics_errtype_code)**
    - sum_A_B: 데이터 수집 기간동안 A_B가 발생한 횟수 
    - std_A_B: 데이터 수집 기간동안 A_B가 발생한 횟수의 표준 편차 (값이 높을수록 에러 발생 빈도가 급격하게 변화함)
    - max_A_B: 데이터 수집 시간중 A_B가 가장 많이 발생했던 날의 에러 수.
    - quantileX_A_B: 데이터 수집 시간중 A_B가 X일 이상 최소 value만큼 등장함
        - quantile10_A_B의 값이 2인 경우 A_B 에러가 2회 이상 발생한 경우가 30일 이상임: 모든 날에 걸처 에러가 지속적으로 나타남을 의미함
        - quantile90_A_B의 값이 17인 경우 A_B 에러가 17회 이상 발생한 경우가 3일 이상임: 에러 발생이 순간적으로 많아진 날들에 대해 얼마나 많아졌었는지를 의미함
    - used_days: 에러가 한건이라도 있는 날은 디바이스를 사용한 날이라고 판단하여, 총 몇일동안 디바이스를 사용하였는지 추측
    - model_nm(categorical columns): 해당 유저가 자주 사용한 디바이스
        
- 일부 에러는 오후 11시 ~ 오전 1시처럼 두 날에 걸쳐서 발생하는 경우도 있음. 단순히 날짜별 통계로는 이러한 케이스를 구분할 수 없기때문에 이틀씩 데이터를 연결시킴
    - 1일 데이터 + 2일 데이터
    - 2일 데이터 + 3일 데이터
    - 3일 데이터 + 4일 데이터
    ...

In [7]:
# 33일 넘기는 테스트 데이터 제거
test_err = test_err.loc[test_err['days'] < 33]

In [8]:
# white_list = {(25, 'scanning timeout'), (8, 'PHONE_ERR'), (36, '8.0'), (5, 'Q-64002'), (19, '1'), (30, '1'), (25, 'connectionterminated by local host'), (21, '1'), (1, 'P-41007'), (25, '2'), (5, 'Q-64001'), (4, '0'), (37, '0'), (25, '1'), (25, 'UNKNOWN'), (18, '1'), (17, '21'), (1, '0'), (20, '1'), (1, 'P-44010'), (5, 'U-82026'), (3, '0'), (23, 'terminate by peer user')}
white_list_code = {'Q-64001', 'S-65002', '21', 'Q-64002', 'PHONE_ERR', '3', 'scanning timeout', 'NFANDROID2', '2', 'U-82026', 'P-41007', 'V-21008', 'P-44010', '8.0', 'terminate by peer user'}

In [9]:
test_type_code = set()
for i, group in test_err.groupby(['errtype', 'errcode']):
    if len(group['user_id'].unique()) >= global_parameters['min_errcode_sample']:
        test_type_code.add((i[0], i[1]))

same_type_code = []
for i, group in train_err.groupby(['errtype', 'errcode']):
    if len(group['user_id'].unique()) >= global_parameters['min_errcode_sample'] and (i[0], i[1]) in test_type_code:
        same_type_code.append((i[0],i[1]))

In [10]:
# error_type, error_code 매핑
def key_typecode(t,c = None):
    if c == None:
        return "errcode_" + str(t).strip()
    
    return str(t).strip() + '_' + str(c).strip()

columns = []
column_index = {}
count = 0
for i in same_type_code:
    key = key_typecode(i[0], i[1])
    if key not in columns:
        columns.append(key)
        column_index[key] = count
        count += 1
'''        
for i in white_list_code:
    key = key_typecode(i)
    if key not in columns:
        columns.append(key)
        column_index[key] = count
        count += 1
        '''
            
print("검색된 칼럼 수", len(columns))

검색된 칼럼 수 99


In [11]:
def _ver(x): #04.30.1235
    x = x.split('.')
    if (len(x) != 3):
        return int(x[0])
    return int(x[2])

new_ver_list = {
    'model_0': 1778,
    'model_1': 3571,
    'model_2': 1261,
    'model_3': 2138,
    'model_4': 1167,
    'model_5': 1778,
    'model_6': 10,
    'model_7': 3571,
    'model_8': 2571,
}

def is_ner_ver(x):
    cutoff = new_ver_list[x['model_nm']]
    ver = _ver(x['fwver'])
    return ver >= cutoff

train_err['is_ner_ver'] = train_err.apply(is_ner_ver, axis=1)
test_err['is_ner_ver'] = test_err.apply(is_ner_ver, axis=1)

In [12]:
def timesplit(x):
    if x >= 6 and x < 18:
        return '낮 (오전 6시~)'
    else:
        return '밤 (오후 6시~)'
    
train_err['timesplit2'] = train_err['hours'].apply(timesplit)
test_err['timesplit2'] = test_err['hours'].apply(timesplit)
def timesplit(x):
    if x >= 9 and x < 15:
        return '점심'
    elif x >= 15 and x < 21:
        return '저녁'
    elif x >= 21 and x < 3:
        return '새벽'
    else:
        return '아침'

train_err['timesplit4'] = train_err['hours'].apply(timesplit)
test_err['timesplit4'] = test_err['hours'].apply(timesplit)

In [13]:
train_prob = pd.read_csv(PATH+'train_problem_data.csv')
problem = np.zeros(15000)
problem[train_prob.user_id.unique()-10000] = 1 

In [14]:
fw_flow = []
fwver_used_id = {}
table = {}
for i, group in train_err.groupby('user_id'):
    key = list(group['fwver'].drop_duplicates().values)
    for j in range(len(key), 5):
        key.append('not updated')
    key = tuple(key) 
    if key not in table:
        table[key] = [0, 0, 0]
    if problem[i-10000]:
        table[key][0] += 1
    else:
        table[key][1] += 1
    
    fwver_used_id[i] = key
    
for i, group in test_err.groupby('user_id'):
    key = list(group['fwver'].drop_duplicates().values)
    for j in range(len(key), 5):
        key.append('not updated')
    key = tuple(key) 
    
    if key not in table:
        table[key] = [0, 0, 0]
        
    if key in table:
        table[key][2] += 1
    
    fwver_used_id[i] = key
    
for i in table:
    p = 0
    if table[i][0]+table[i][1] != 0:
        p = table[i][0] / (table[i][0]+table[i][1])
        
    row = [] + list(i) + [table[i][0], table[i][1], table[i][0]+ table[i][1], table[i][2], p]
    fw_flow.append(row)
fw_flow = pd.DataFrame(fw_flow, columns=[str(i) for i in range(0,5)] + ['p', 'np', 'trainc', 'testc', 'problem'])
fw_flow = fw_flow.sort_values('trainc', ascending=False).reset_index(drop=True)

In [15]:
fw_flow['to'] = fw_flow.index
fw_flow.loc[12, 'to'] = 10
fw_flow.loc[fw_flow['to'] > 10, 'to'] = 11
fw_flow
fw_key = {}
for i, row in fw_flow.iterrows():
    key = tuple(row[['0','1','2','3','4']])
    fw_key[key] = row['to']   
for i in fwver_used_id:
    fwver_used_id[i] = fw_key[fwver_used_id[i]]

In [17]:
def make_err_table(table, start_user_id, size):    
    id_error = table[['user_id','errtype','errcode', 'days', 'timesplit2', 'timesplit4']].values
    error = np.zeros((size, 33, 42 + len(columns)))    
    weekend_error = np.zeros((size, 10, 42 + len(columns))) 
    custom_columns = []# ['30(밤)', '18(저녁)', '25(점심)']
    custom_error = np.zeros((size, 33, len(custom_columns)))   
    
    day_error = np.zeros((size, 33, 42 ))
    night_error = np.zeros((size, 33,  42 ))
    
    mapping_index = {}
    for i in range(0,33):
        if (i % 7 <= 1):
            mapping_index[i] = int(i / 7) * 2 + i % 7
        else:
            mapping_index[i] = -1
            
    days = np.zeros((size, 33))
    for person_idx, err, code, day, timesplit2, timesplit4 in tqdm(id_error):
        user_index = person_idx - start_user_id
        '''
        if err == 30 and timesplit2 == '밤 (오후 6시~)':
            custom_error[user_index, day, 0] += 1
        
        if err == 18 and timesplit4 == '저녁':
            custom_error[user_index, day, 1] += 1
            
        if err == 25 and timesplit4 == '점심':
            custom_error[user_index, day, 2] += 1
            '''
        if timesplit2 == '낮 (오전 6시~)':
            day_error [user_index, day, err-1]+=1
        elif timesplit2 == '밤 (오후 6시~)':
            night_error[user_index, day, err-1] += 1
            
        # person_idx - 10000 위치에 person_idx, errtype에 해당하는 error값을 +1
        error[user_index, day, err - 1] += 1
        if mapping_index[day] != -1:
            weekend_error[user_index, mapping_index[day], err - 1] += 1
        days[user_index, day] = 1
        key = key_typecode(err, code)
        if key in column_index:
            error[user_index, day, 42 + column_index[key]] += 1
            if mapping_index[day] != -1:
                weekend_error[user_index, mapping_index[day], 42 + column_index[key]] += 1
                
           
            
        '''
        # 에러 코드별 합계를 추가하고싶으면 아래 주석 해제
        key = key_typecode(code)
        if key in column_index:
            error[person_idx - start_user_id, day, 42 + column_index[key]] += 1
        '''
       
    error_mix = np.zeros((size, 32, 42 + len(columns)))    
    for i in range(0,size):
        for j in range(32): #0~1, 1~2 .... 65~66
            error_mix[i][j] = np.sum([error[i][j], error[i][j+1]], axis=0)

    error_sum = np.sum(error_mix, axis=1)
    error_std = np.std(error_mix, axis=1)# / np.mean(error_mix, axis=1)
    error_max = np.max(error_mix, axis=1)
    error_var = np.var(error_mix, axis=1)
    error_cv = np.std(error_mix, axis=1) / np.mean(error_mix, axis=1)
    
    error_cv[np.isnan(error_cv)] = 0
    
    quantile_arr = np.quantile(error_mix, global_parameters['quantile'], axis=1)
    sum_days = np.sum(days, axis=1).reshape(-1,1)
    
    # weekday_error_sum = np.sum(weekday_error, axis=1)
    # weekday_error_std = np.std(weekday_error, axis=1)
    # weekday_error_max = np.max(weekday_error, axis=1)
    
    weekend_error_sum = np.sum(weekend_error, axis=1)
    weekend_error_std = np.std(weekend_error, axis=1)
    weekend_error_max = np.max(weekend_error, axis=1)
    weekend_error_var = np.var(weekend_error, axis=1)
    weekend_error_cv = np.std(weekend_error, axis=1) / np.mean(weekend_error, axis=1)
    weekend_error_cv[np.isnan(weekend_error_cv)] = 0
    
    custom_error_sum = np.sum(custom_error, axis=1)
    custom_error_std = np.std(custom_error, axis=1)
    custom_error_max = np.max(custom_error, axis=1)
    
    day_error_sum = np.sum(day_error, axis=1)
    day_error_cv = np.std(day_error, axis=1) / np.mean(day_error, axis=1)
    day_error_max = np.max(day_error, axis=1)
    day_error_var = np.var(day_error, axis=1)
    day_error_cv[np.isnan(day_error_cv)] = 0
    
    night_error_sum = np.sum(night_error, axis=1)
    night_error_cv = np.std(night_error, axis=1) / np.mean(night_error, axis=1)
    night_error_max = np.max(night_error, axis=1)
    night_error_var = np.var(night_error, axis=1)
    night_error_cv[np.isnan(night_error_cv)] = 0
    
    
    
    total = np.concatenate([error_sum,  error_max, error_var, error_cv, sum_days] + [i for i in quantile_arr] + 
                           #[weekday_error_sum, weekday_error_std, weekday_error_max] +
                           [weekend_error_sum,  weekend_error_max, weekend_error_var, weekend_error_cv] +
                           [day_error_sum,  day_error_max, day_error_var, day_error_cv] +
                           [night_error_sum,  night_error_max, night_error_var, night_error_cv] +
                           [custom_error_sum, custom_error_std, custom_error_max]
                           , axis=1)
    quantile_columns = []
    temp_columns = [str(i) for i in range(1,43)] + columns
    for q in global_parameters['quantile']:
        quantile_columns.extend([str(int(q * 100)) + '_' + str(i) for i in temp_columns])
        
    
    result = pd.DataFrame(data=total, columns=
                          ['sum_' + str(i) for i in temp_columns] + 
                         
                          ['max_' + str(i) for i in temp_columns] +
                          ['var_' + str(i) for i in temp_columns] + 
                          ['cv_' + str(i) for i in temp_columns] + 
                          ['used_days'] +
                          quantile_columns +
                         
                         # ['weekday_sum_' + str(i) for i in temp_columns] + 
                          #['weekday_std_' + str(i) for i in temp_columns] + 
                          #['weekday_max_' + str(i) for i in temp_columns] +
                         
                          ['weekend_sum_' + str(i) for i in temp_columns] + 
                     
                          ['weekend_max_' + str(i) for i in temp_columns] + 
                          ['weekend_var_' + str(i) for i in temp_columns] + 
                          ['weekend_cv_' + str(i) for i in temp_columns] + 
                          
                          ['day_sum_' + str(i) for i in range(1,43)] + 
                          ['day_max_' + str(i) for i in range(1,43)] + 
                          ['day_var_' + str(i) for i in range(1,43)] + 
                          ['day_cv_' + str(i) for i in range(1,43)] + 
                          
                          ['night_sum_' + str(i) for i in range(1,43)] + 
                          ['night_max_' + str(i) for i in range(1,43)] + 
                          ['night_var_' + str(i) for i in range(1,43)] + 
                          ['night_cv_' + str(i) for i in range(1,43)] + 
                          
                          ['custom_sum_' + str(i) for i in custom_columns] + 
                          ['custom_std_' + str(i) for i in custom_columns] + 
                          ['custom_max_' + str(i) for i in custom_columns])
    
    # 에러와 관련이 없는 부가 정보 입력
    
    # quality 데이터 연결
    result['user_id'] = range(start_user_id, start_user_id + size)
    for i in tqdm(range(0,len(quality_columns))):
        result[quality_columns[i]] = result['user_id'].apply(lambda x: quality_user_id_statistics[x][i])
    
    user_model={}
    changed_to_model_2 = np.zeros(size)
    fwver_changed_count = np.zeros(size)
    old_ver_days = np.zeros(size)
    new_ver_days = np.zeros(size)
    for user_id, group in table.groupby(['user_id']):
        user_index = user_id - start_user_id
        user_model[user_id] = group.iloc[0]['model_nm']
        
        fwver_changed_count[user_index] = len(group[['model_nm', 'fwver']].drop_duplicates()) - 1
            
        if len(group['model_nm'].unique()) == 1:
            # 구버전을 몇일동안 사용했는가?
            # 신버전을 몇일동안 사용했는가?
            old_ver_days[user_index] = group.loc[group['is_ner_ver'] == False, 'days'].max()
            new_ver_days[user_index] = 33 - group.loc[group['is_ner_ver'] == True, 'days'].min()
            continue
            
        if group.iloc[len(group) - 1]['model_nm'] == 'model_2':
            changed_to_model_2[user_index] = 1
            
    where_are_NaNs = np.isnan(old_ver_days)
    old_ver_days[where_are_NaNs] = 0
    
    where_are_NaNs = np.isnan(new_ver_days)
    new_ver_days[where_are_NaNs] = 0
    
    result["changed_to_model_2"] = changed_to_model_2
    result["fwver_changed_count"] = fwver_changed_count
    
    result["old_ver_days"] = old_ver_days
    result["new_ver_days"] = new_ver_days
    
    user_model[43262] = 'model_1' # 예외 처리
    result["model_nm"] = result["user_id"].apply(lambda x : user_model[x]).astype('category')
    
    fwver_used_id[43262] = 11 # 예외 처리
    
    # result["fw_mean"] = result["user_id"].apply(lambda x : np.array([int(i.replace('.', '')) for i in fwver_used_id[x]]).mean())
    # result["fw_std"] = result["user_id"].apply(lambda x : np.array([int(i.replace('.', '')) for i in fwver_used_id[x]]).std())
    #for i in range(0,5):
    #    result["fwc" + str(i)] = result["user_id"].apply(lambda x : fwver_used_id[x][i][0]).astype('category')
    #    result["fwt" + str(i)] = result["user_id"].apply(lambda x : fwver_used_id[x][i][1])
    # result["fw_flow"] = result["user_id"].apply(lambda x : fwver_used_id[x]).astype('category')
    '''
    fwver_0311_days = np.zeros(size)
    fwver_0515_days = np.zeros(size)
    for n, group in table.groupby(['user_id','fwver_4']):
        user_index = n[0] - start_user_id
        if n[1] == "03.11":
            fwver_0311_days[user_index] = len(group['days'].unique())
        elif n[1] == "05.15":
            fwver_0515_days[user_index] = len(group['days'].unique())
    
    result['fwver_0311_days'] = fwver_0311_days
    result['fwver_0515_days'] = fwver_0515_days
    '''
    
    '''
    err_term = [30] * size
    for user_id, group in table.groupby('user_id'):
        days = group['days'].unique()
        days = list(days)
        days.append(1)
        days.append(30)
        days.append(31)
        days.append(32)
        days = list(set(days))
        days.sort()
        best = 0
        for i in range(0, len(days) - 1):
            diff = days[i+1] - days[i]
            if (best < diff):
                best = diff
        err_term[user_id - start_user_id] = best
    result['err_term'] = err_term
    '''
    del result['user_id'] # user_id는 학습에 사용되지 않아야하기 때문에 삭제
    return result

train_full = make_err_table(train_err, start_user_id=10000, size=15000)
test = make_err_table(test_err, start_user_id=30000, size=14999)

100%|██████████████████████████████████████████████████████████████████| 16554663/16554663 [01:01<00:00, 268830.44it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 133/133 [00:00<00:00, 157.83it/s]
100%|██████████████████████████████████████████████████████████████████| 16532633/16532633 [01:01<00:00, 269162.62it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 133/133 [00:01<00:00, 125.48it/s]


## 각 column의 샘플 개수에 따른 feature_selection

- 에러 타입과 에러 코드의 조합 중 사용되지 않는 조합이 있음.
- 또한 너무 적은 데이터가 있는 column은 삭제하여 오버피팅을 방지함

In [18]:
for i in range(13):
    if "quality_std/mean_"+str(i) in train_full.columns:
        train_full["quality_std/mean_"+str(i)] = train_full["quality_std/mean_"+str(i)].fillna(0)
for i in range(13):
    if "quality_std/mean_"+str(i) in test.columns:
        test["quality_std/mean_"+str(i)] = test["quality_std/mean_"+str(i)].fillna(0)
    

In [19]:
# 사용되지 않는 타입_코드 칼럼을 제거
#train_test = pd.concat([train_full, test], axis=0)
select_columns = []
for c in tqdm(train_full.columns):
    if str(train_full[c].dtype) == 'category':
        select_columns.append(c)
        continue
    if train_full[c].apply(lambda x: 1 if x > 0 else 0).sum() >= global_parameters['min_errcode_sample'] and test[c].apply(lambda x: 1 if x > 0 else 0).sum() >= global_parameters['min_errcode_sample']: # 트레이닝, 테스트 세트에 50개 이상의 행이 있어야함
        select_columns.append(c)
    else:
        if 'custom_' in c:
            select_columns.append(c)
        '''
        # 코드 합계는 모두 통과 (화이트리스트)
        if 'sum_' in c or 'max_' in c or 'std_' in c:
            if 'errcode_' in c:
                select_columns.append(c)
                
        '''
        '''
        if 'sum_' in c or 'max_' in c or 'std_' in c:
            original_columns = c[4:].split('_')
            if len(original_columns) == 2 and original_columns[0] != 'errcode' and (int(original_columns[0]), original_columns[1]) in white_list:
                if 'sum_' in c:
                    print(c)
                select_columns.append(c)
        '''

print(len(train_full.columns), '개의 칼럼을', len(select_columns), '개로 압축')
#train_full = train_full[select_columns]
#test = test[select_columns]
#columns = select_columns

100%|█████████████████████████████████████████████████████████████████████████████| 2449/2449 [00:14<00:00, 164.64it/s]

2449 개의 칼럼을 1943 개로 압축





In [20]:
train_full = train_full[select_columns]
test = test[select_columns]
# columns = select_columns

In [21]:
train_full['problem'] = problem

In [22]:
# train_full 전부 섞어주기
train_full = train_full.iloc[np.random.RandomState(seed=42).permutation(train_full.index)].reset_index(drop=True)
train_full = train_full.iloc[np.random.RandomState(seed=42).permutation(train_full.index)].reset_index(drop=True)

In [23]:
train_full

Unnamed: 0,sum_1,sum_2,sum_3,sum_4,sum_5,sum_6,sum_7,sum_10,sum_11,sum_12,sum_13,sum_14,sum_15,sum_16,sum_17,sum_18,sum_19,sum_20,sum_21,sum_22,sum_23,sum_24,sum_25,sum_26,sum_27,sum_28,sum_30,sum_31,sum_32,sum_33,sum_34,sum_35,sum_36,sum_37,sum_38,sum_39,sum_40,sum_41,sum_42,sum_1_0,sum_1_P-41007,sum_1_P-44010,sum_2_0,sum_2_1,sum_3_1,sum_3_2,sum_4_0,sum_4_1,sum_5_700001,sum_5_B-A8002,sum_5_C-11017,sum_5_H-51042,sum_5_H-51046,sum_5_Q-64002,sum_5_S-61001,sum_5_S-64002,sum_5_S-65002,sum_5_V-21008,sum_6_1,sum_6_14,sum_7_1,sum_7_14,sum_10_1,sum_11_1,sum_12_1,sum_13_1,sum_14_1,sum_14_13,sum_14_14,sum_15_1,sum_16_1,sum_17_1,sum_17_13,sum_17_14,sum_17_21,sum_18_1,sum_19_1,sum_20_1,sum_21_1,sum_22_1,sum_23_UNKNOWN,sum_23_active,sum_23_connection fail for LMP response timout,sum_23_connection fail to establish,sum_23_connection timeout,sum_23_connectionterminated by local host,sum_23_standby,sum_23_terminate by peer user,sum_24_1,sum_25_1,sum_25_2,sum_25_UNKNOWN,sum_25_scanning timeout,sum_26_1,sum_27_1,sum_28_1,sum_30_4,sum_31_0,sum_31_1,sum_32_76,sum_32_77,sum_32_78,sum_32_79,sum_32_80,sum_32_81,sum_32_82,sum_32_83,sum_32_84,sum_32_85,sum_32_86,sum_32_87,sum_32_88,sum_32_89,sum_32_90,sum_32_91,sum_32_92,sum_32_93,sum_32_94,sum_32_95,sum_33_1,sum_33_2,sum_33_3,sum_34_1,sum_34_2,sum_34_3,sum_34_4,sum_34_5,sum_34_6,sum_35_1,sum_36_8.0,sum_37_0,sum_39_0,sum_39_1,sum_40_0,sum_40_1,sum_41_NFANDROID2,sum_42_2,sum_42_3,max_1,max_2,max_3,max_4,max_5,max_6,max_7,max_10,max_11,max_12,max_13,max_14,max_15,max_16,max_17,max_18,max_19,max_20,max_21,max_22,max_23,max_24,max_25,max_26,max_27,max_28,max_30,max_31,max_32,max_33,max_34,max_35,max_36,max_37,max_38,max_39,max_40,max_41,max_42,max_1_0,max_1_P-41007,max_1_P-44010,max_2_0,max_2_1,max_3_1,max_3_2,max_4_0,max_4_1,max_5_700001,max_5_B-A8002,max_5_C-11017,max_5_H-51042,max_5_H-51046,max_5_Q-64002,max_5_S-61001,max_5_S-64002,max_5_S-65002,max_5_V-21008,max_6_1,max_6_14,max_7_1,max_7_14,max_10_1,max_11_1,max_12_1,max_13_1,max_14_1,max_14_13,max_14_14,max_15_1,max_16_1,max_17_1,max_17_13,max_17_14,max_17_21,max_18_1,max_19_1,max_20_1,max_21_1,max_22_1,max_23_UNKNOWN,max_23_active,max_23_connection fail for LMP response timout,max_23_connection fail to establish,max_23_connection timeout,max_23_connectionterminated by local host,max_23_standby,max_23_terminate by peer user,max_24_1,max_25_1,max_25_2,max_25_UNKNOWN,max_25_scanning timeout,max_26_1,max_27_1,max_28_1,max_30_4,max_31_0,max_31_1,max_32_76,max_32_77,max_32_78,max_32_79,max_32_80,max_32_81,max_32_82,max_32_83,max_32_84,max_32_85,max_32_86,max_32_87,max_32_88,...,night_sum_25,night_sum_26,night_sum_27,night_sum_28,night_sum_31,night_sum_32,night_sum_33,night_sum_34,night_sum_35,night_sum_36,night_sum_37,night_sum_38,night_sum_39,night_sum_40,night_sum_41,night_sum_42,night_max_1,night_max_2,night_max_3,night_max_4,night_max_5,night_max_6,night_max_7,night_max_10,night_max_11,night_max_12,night_max_13,night_max_14,night_max_15,night_max_16,night_max_17,night_max_18,night_max_19,night_max_20,night_max_21,night_max_22,night_max_23,night_max_24,night_max_25,night_max_26,night_max_27,night_max_28,night_max_31,night_max_32,night_max_33,night_max_34,night_max_35,night_max_36,night_max_37,night_max_38,night_max_39,night_max_40,night_max_41,night_max_42,night_var_1,night_var_2,night_var_3,night_var_4,night_var_5,night_var_6,night_var_7,night_var_10,night_var_11,night_var_12,night_var_13,night_var_14,night_var_15,night_var_16,night_var_17,night_var_18,night_var_19,night_var_20,night_var_21,night_var_22,night_var_23,night_var_24,night_var_25,night_var_26,night_var_27,night_var_28,night_var_31,night_var_32,night_var_33,night_var_34,night_var_35,night_var_36,night_var_37,night_var_38,night_var_39,night_var_40,night_var_41,night_var_42,night_cv_1,night_cv_2,night_cv_3,night_cv_4,night_cv_5,night_cv_6,night_cv_7,night_cv_10,night_cv_11,night_cv_12,night_cv_13,night_cv_14,night_cv_15,night_cv_16,night_cv_17,night_cv_18,night_cv_19,night_cv_20,night_cv_21,night_cv_22,night_cv_23,night_cv_24,night_cv_25,night_cv_26,night_cv_27,night_cv_28,night_cv_31,night_cv_32,night_cv_33,night_cv_34,night_cv_35,night_cv_36,night_cv_37,night_cv_38,night_cv_39,night_cv_40,night_cv_41,night_cv_42,quality_count_0,quality_sum_0,quality_mean_0,quality_var_0,quality_max_0,quality_std/mean_0,quality_count_1,quality_sum_1,quality_mean_1,quality_var_1,quality_max_1,quality_std/mean_1,quality_count_2,quality_sum_2,quality_mean_2,quality_var_2,quality_max_2,quality_std/mean_2,quality_count_3,quality_count_4,quality_count_5,quality_sum_5,quality_mean_5,quality_var_5,quality_max_5,quality_std/mean_5,quality_count_6,quality_sum_6,quality_mean_6,quality_var_6,quality_max_6,quality_std/mean_6,quality_count_7,quality_sum_7,quality_mean_7,quality_var_7,quality_max_7,quality_std/mean_7,quality_count_8,quality_sum_8,quality_mean_8,quality_var_8,quality_max_8,quality_std/mean_8,quality_count_9,quality_sum_9,quality_mean_9,quality_var_9,quality_max_9,quality_std/mean_9,quality_count_10,quality_sum_10,quality_mean_10,quality_var_10,quality_max_10,quality_std/mean_10,quality_count_11,quality_sum_11,quality_mean_11,quality_var_11,quality_max_11,quality_std/mean_11,quality_count_12,quality_sum_12,quality_mean_12,quality_var_12,quality_max_12,quality_std/mean_12,05.15.2138_quality_1_max,05.15.2138_quality_7_mean,05.15.2138_quality_7_max,05.15.2138_quality_7_cv,05.15.2138_quality_8_max,05.15.2138_quality_8_cv,04.22.1750_quality_1_var,04.22.1750_quality_6_max,04.22.1750_quality_7_mean,04.22.1750_quality_7_cv,04.22.1750_quality_10_min,04.33.1261_quality_0_std,04.33.1261_quality_1_std,04.33.1261_quality_2_std,04.33.1261_quality_6_max,04.33.1261_quality_8_max,04.33.1261_quality_8_cv,04.16.3553_quality_1_max,04.16.3553_quality_5_mean,04.16.3553_quality_7_mean,04.16.3553_quality_10_max,04.16.3553_quality_10_var,03.11.1167_quality_5_max,03.11.1167_quality_6_std,03.11.1167_quality_7_mean,03.11.1167_quality_7_cv,03.11.1167_quality_8_max,03.11.1167_quality_8_cv,03.11.1167_quality_10_min,03.11.1167_quality_10_max,03.11.1167_quality_11_std,03.11.1167_quality_11_var,04.33.1185_quality_5_mean,04.33.1185_quality_5_std,04.33.1185_quality_5_var,04.33.1185_quality_6_std,04.33.1185_quality_6_var,04.33.1185_quality_7_cv,04.33.1185_quality_8_max,04.33.1185_quality_8_cv,04.33.1185_quality_10_mean,04.33.1185_quality_10_std,04.33.1185_quality_10_var,04.22.1778_quality_5_max,04.22.1778_quality_7_cv,04.22.1778_quality_10_std,changed_to_model_2,fwver_changed_count,old_ver_days,new_ver_days,model_nm,problem
0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,8.0,62.0,62.0,0.0,8.0,234.0,164.0,2.0,0.0,0.0,0.0,0.0,106.0,20.0,0.0,0.0,144.0,0.0,0.0,0.0,292.0,0.0,52.0,0.0,2.0,2.0,2.0,2.0,0.0,16.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,62.0,62.0,0.0,0.0,0.0,8.0,234.0,164.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,106.0,0.0,4.0,0.0,0.0,8.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,144.0,0.0,0.0,0.0,154.0,138.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0,18.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,0.0,0.0,8.0,8.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,4.0,4.0,0.0,2.0,12.0,7.0,1.0,0.0,0.0,0.0,0.0,7.0,2.0,0.0,0.0,8.0,0.0,0.0,0.0,14.0,0.0,4.0,0.0,1.0,1.0,1.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,4.0,0.0,0.0,0.0,2.0,12.0,7.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,2.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,7.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,31.0,0.0,0.0,21.0,0.0,10.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,2.0,0.0,1.0,6.0,2.0,1.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,3.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.000000,0.0,0.0,0.000000,0.029385,0.000000,0.000000,0.029385,0.282828,0.282828,0.000000,0.029385,3.542700,0.352617,0.029385,0.000000,0.0,0.000000,0.0,0.877870,0.244261,0.0,0.0,0.723600,0.000000,0.000000,0.837466,0.0,0.211203,0.000000,0.000000,0.029385,0.029385,0.0,0.0,0.000000,0.000000,0.082645,0.000000,0.0,0.0,0.000000,5.656854,0.000000,0.000000,5.656854,1.595448,1.595448,0.000000,5.656854,0.862678,1.632993,5.656854,0.000000,0.0,0.000000,0.0,0.909390,2.038688,0.0,0.0,0.905527,0.000000,0.000000,1.438064,0.0,1.516575,0.000000,0.000000,5.656854,5.656854,0.0,0.0,0.000000,0.000000,3.162278,24,0.0,0.0,0.0,0.0,0.0,24,0.0,0.000000,0.000000,0.0,0.000000,24,0.0,0.0,0.0,0.0,0.0,24,24,24,7.0,0.291667,9.565972e-01,4.0,3.353341,24,0.0,0.000000,0.000000,0.0,0.000000,24,0.0,0.00,0.0000,0.0,0.000000,24,0.0,0.00,0.0000,0.0,0.000000,24,0.0,0.0,0.0,0.0,0.0,24,84.0,3.500000,2.500000e-01,4.0,0.142857,24,0.0,0.000000,0.000000,0.0,0.000000,24,0.0,0.000,0.000000,0.0,0.000000,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.000000,0.0,0.0,0.0,3.0,-999.000000,-999.000000,-999.000000,-999.0,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.000000,-999.0,-999.000000,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.00000,-999.0,-999.0,-999.00,-999.000000,-9.990000e+02,4.0,0.0,0.0,0.0,1.0,19.0,14.0,model_0,0.0
1,0.0,0.0,0.0,18.0,8.0,4.0,2.0,0.0,30.0,30.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,2.0,0.0,4.0,0.0,2.0,0.0,0.0,30.0,30.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,1.0,0.0,1.0,1.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,1.0,0.0,1.0,1.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.440771,0.106520,0.117539,0.029385,0.000000,0.247934,0.247934,0.000000,1.057851,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,2.434322,2.692582,5.656854,5.656854,0.000000,1.095445,1.095445,0.000000,5.656854,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0,-999.0,-999.0,-999.0,-999.0,-999.0,0,-999.0,-999.000000,-999.000000,-999.0,-999.000000,0,-999.0,-999.0,-999.0,-999.0,-999.0,0,0,0,-999.0,-999.000000,-9.990000e+02,-999.0,-999.000000,0,-999.0,-999.000000,-999.000000,-999.0,-999.000000,0,-999.0,-999.00,-999.0000,-999.0,-999.000000,0,-999.0,-999.00,-999.0000,-999.0,-999.000000,0,-999.0,-999.0,-999.0,-999.0,-999.0,0,-999.0,-999.000000,-9.990000e+02,-999.0,-999.000000,0,-999.0,-999.000000,-999.000000,-999.0,-999.000000,0,-999.0,-999.000,-999.000000,-999.0,-999.000000,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.0,-999.0,-999.000000,-999.000000,-999.000000,-999.0,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.000000,-999.0,-999.000000,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.00000,-999.0,-999.0,-999.00,-999.000000,-9.990000e+02,-999.0,-999.0,-999.0,0.0,0.0,0.0,32.0,model_3,0.0
2,0.0,0.0,0.0,0.0,14.0,30.0,30.0,18.0,50.0,68.0,24.0,40.0,296.0,232.0,2.0,2.0,0.0,2.0,0.0,206.0,154.0,0.0,0.0,96.0,10.0,10.0,0.0,392.0,0.0,34.0,4.0,6.0,4.0,4.0,2.0,0.0,84.0,166.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,10.0,0.0,0.0,30.0,0.0,30.0,18.0,50.0,68.0,24.0,0.0,14.0,26.0,296.0,232.0,0.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,206.0,0.0,24.0,0.0,0.0,70.0,2.0,56.0,2.0,0.0,0.0,0.0,0.0,0.0,96.0,10.0,10.0,0.0,208.0,184.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,22.0,8.0,4.0,0.0,0.0,0.0,0.0,0.0,6.0,4.0,4.0,0.0,0.0,42.0,42.0,166.0,0.0,8.0,0.0,0.0,0.0,0.0,6.0,7.0,7.0,8.0,11.0,14.0,10.0,9.0,23.0,21.0,1.0,1.0,0.0,1.0,0.0,13.0,10.0,0.0,0.0,7.0,5.0,5.0,0.0,45.0,0.0,5.0,2.0,2.0,1.0,1.0,1.0,0.0,24.0,17.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,0.0,0.0,7.0,0.0,7.0,8.0,11.0,14.0,10.0,0.0,3.0,6.0,23.0,21.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,13.0,0.0,4.0,0.0,0.0,7.0,1.0,7.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,5.0,5.0,0.0,23.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,39.0,3.0,3.0,118.0,0.0,16.0,0.0,0.0,1.0,1.0,0.0,0.0,18.0,38.0,4.0,0.0,0.0,0.0,0.0,3.0,2.0,2.0,2.0,4.0,4.0,7.0,3.0,11.0,13.0,0.0,0.0,0.0,0.0,0.0,7.0,6.0,0.0,0.0,4.0,3.0,3.0,27.0,0.0,3.0,0.0,0.0,1.0,1.0,0.0,0.0,4.0,7.0,1.0,0.000000,0.0,0.0,0.000000,0.370983,0.348944,0.348944,0.143251,0.674013,0.949495,1.439853,0.486685,7.693297,6.514233,0.000000,0.000000,0.0,0.000000,0.0,4.916437,3.153352,0.0,0.0,1.300275,0.264463,0.264463,25.274564,0.0,0.492195,0.000000,0.000000,0.029385,0.029385,0.0,0.0,1.096419,3.764922,0.106520,0.000000,0.0,0.0,0.000000,4.019950,2.784798,2.784798,4.163332,1.593673,1.461630,5.656854,2.877716,0.906252,1.257103,0.000000,0.000000,0.0,0.000000,0.0,0.881579,1.010351,0.0,0.0,0.964866,5.656854,5.656854,1.405963,0.0,1.446980,0.000000,0.000000,5.656854,5.656854,0.0,0.0,1.919684,1.685033,2.692582,0,-999.0,-999.0,-999.0,-999.0,-999.0,0,-999.0,-999.000000,-999.000000,-999.0,-999.000000,0,-999.0,-999.0,-999.0,-999.0,-999.0,0,0,0,-999.0,-999.000000,-9.990000e+02,-999.0,-999.000000,0,-999.0,-999.000000,-999.000000,-999.0,-999.000000,0,-999.0,-999.00,-999.0000,-999.0,-999.000000,0,-999.0,-999.00,-999.0000,-999.0,-999.000000,0,-999.0,-999.0,-999.0,-999.0,-999.0,0,-999.0,-999.000000,-9.990000e+02,-999.0,-999.000000,0,-999.0,-999.000000,-999.000000,-999.0,-999.000000,0,-999.0,-999.000,-999.000000,-999.0,-999.000000,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.0,-999.0,-999.000000,-999.000000,-999.000000,-999.0,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.000000,-999.0,-999.000000,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.00000,-999.0,-999.0,-999.00,-999.000000,-9.990000e+02,-999.0,-999.0,-999.0,0.0,1.0,19.0,14.0,model_0,1.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,30.0,30.0,0.0,0.0,190.0,98.0,2.0,0.0,0.0,0.0,0.0,16.0,2.0,0.0,0.0,14.0,0.0,0.0,0.0,200.0,0.0,30.0,0.0,0.0,2.0,2.0,0.0,0.0,84.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,30.0,0.0,0.0,0.0,0.0,190.0,98.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,98.0,102.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,40.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,10.0,7.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,14.0,0.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,10.0,7.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,7.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,69.0,0.0,15.0,0.0,0.0,1.0,1.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,7.0,3.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,7.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.029385,0.000000,0.000000,0.000000,0.247934,0.247934,0.000000,0.000000,3.875115,0.833792,0.029385,0.000000,0.0,0.000000,0.0,0.183655,0.029385,0.0,0.0,0.106520,0.000000,0.000000,2.991736,0.0,0.247934,0.000000,0.000000,0.029385,0.029385,0.0,0.0,0.541781,0.000000,0.000000,0.000000,0.0,0.0,0.000000,5.656854,0.000000,0.000000,0.000000,1.095445,1.095445,0.000000,0.000000,0.822298,1.039070,5.656854,0.000000,0.0,0.000000,0.0,1.767767,5.656854,0.0,0.0,2.692582,0.000000,0.000000,0.827230,0.0,1.095445,0.000000,0.000000,5.656854,5.656854,0.0,0.0,1.868455,0.000000,0.000000,75,0.0,0.0,0.0,0.0,0.0,75,0.0,0.000000,0.000000,0.0,0.000000,75,0.0,0.0,0.0,0.0,0.0,120,120,75,9419.0,125.586667,2.481485e+05,3062.0,3.966544,75,0.0,0.000000,0.000000,0.0,0.000000,120,0.0,0.00,0.0000,0.0,0.000000,120,0.0,0.00,0.0000,0.0,0.000000,120,0.0,0.0,0.0,0.0,0.0,120,113028.0,941.900000,1.672485e+06,3352.0,1.373018,75,1.0,0.013333,0.013156,1.0,8.602325,120,12.0,0.100,0.090000,1.0,3.000000,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.0,-999.0,0.478301,0.478301,0.478301,0.0,0.0,0.0,-999.0,-999.000000,-999.0,-999.0,-999.000000,-999.0,-999.000000,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.000000,-999.000000,0.305556,2.702585,7.303968,0.503953,0.253968,0.00000,0.0,0.0,9.00,8.605646,7.405714e+01,-999.0,-999.0,-999.0,0.0,1.0,10.0,23.0,model_2,1.0
4,0.0,0.0,0.0,158.0,2.0,0.0,0.0,0.0,58.0,58.0,0.0,0.0,112.0,84.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,68.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,158.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58.0,58.0,0.0,0.0,0.0,0.0,112.0,84.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,68.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,1.0,0.0,0.0,0.0,7.0,7.0,0.0,0.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,7.0,0.0,0.0,0.0,0.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,1.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,2.348944,0.029385,0.000000,0.000000,0.000000,0.370983,0.370983,0.000000,0.000000,1.300275,0.481175,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.655647,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.366937,5.656854,0.000000,0.000000,0.000000,1.256234,1.256234,0.000000,0.000000,1.393695,1.760850,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,1.272418,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0,-999.0,-999.0,-999.0,-999.0,-999.0,0,-999.0,-999.000000,-999.000000,-999.0,-999.000000,0,-999.0,-999.0,-999.0,-999.0,-999.0,0,0,0,-999.0,-999.000000,-9.990000e+02,-999.0,-999.000000,0,-999.0,-999.000000,-999.000000,-999.0,-999.000000,0,-999.0,-999.00,-999.0000,-999.0,-999.000000,0,-999.0,-999.00,-999.0000,-999.0,-999.000000,0,-999.0,-999.0,-999.0,-999.0,-999.0,0,-999.0,-999.000000,-9.990000e+02,-999.0,-999.000000,0,-999.0,-999.000000,-999.000000,-999.0,-999.000000,0,-999.0,-999.000,-999.000000,-999.0,-999.000000,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.0,-999.0,-999.000000,-999.000000,-999.000000,-999.0,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.000000,-999.0,-999.000000,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.00000,-999.0,-999.0,-999.00,-999.000000,-9.990000e+02,-999.0,-999.0,-999.0,0.0,0.0,0.0,32.0,model_3,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,0.0,0.0,0.0,0.0,122.0,24.0,22.0,0.0,32.0,34.0,4.0,56.0,218.0,150.0,0.0,2.0,0.0,2.0,0.0,72.0,36.0,4.0,0.0,96.0,2.0,2.0,2.0,286.0,0.0,32.0,0.0,4.0,2.0,2.0,0.0,0.0,248.0,82.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,24.0,0.0,22.0,0.0,32.0,34.0,4.0,0.0,4.0,52.0,218.0,150.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,72.0,0.0,10.0,0.0,0.0,6.0,0.0,20.0,0.0,4.0,0.0,0.0,0.0,0.0,96.0,2.0,2.0,2.0,146.0,140.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,26.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2.0,2.0,0.0,0.0,126.0,122.0,82.0,0.0,16.0,0.0,0.0,0.0,0.0,40.0,9.0,8.0,0.0,3.0,4.0,2.0,13.0,14.0,9.0,0.0,1.0,0.0,1.0,0.0,5.0,4.0,2.0,0.0,6.0,1.0,1.0,1.0,19.0,0.0,3.0,0.0,1.0,1.0,1.0,0.0,0.0,27.0,7.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9.0,0.0,8.0,0.0,3.0,4.0,2.0,0.0,1.0,12.0,14.0,9.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,0.0,3.0,0.0,0.0,1.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,6.0,1.0,1.0,1.0,10.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,22.0,1.0,1.0,60.0,0.0,16.0,0.0,1.0,1.0,1.0,0.0,0.0,77.0,18.0,8.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,0.0,2.0,3.0,2.0,3.0,6.0,3.0,0.0,1.0,0.0,1.0,0.0,4.0,3.0,0.0,0.0,3.0,1.0,1.0,7.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,14.0,3.0,1.0,0.000000,0.0,0.0,0.000000,0.167126,0.189164,0.189164,0.000000,0.310376,0.431589,0.117539,0.440771,3.928375,0.712580,0.000000,0.029385,0.0,0.029385,0.0,0.991736,0.413223,0.0,0.0,0.888889,0.029385,0.029385,3.845730,0.0,0.310376,0.000000,0.029385,0.029385,0.029385,0.0,0.0,10.404040,0.732782,0.183655,0.000000,0.0,0.0,0.000000,3.372684,2.870540,2.870540,0.000000,1.149049,1.275264,5.656854,2.434322,0.838544,1.071414,0.000000,5.656854,0.0,5.656854,0.0,1.095445,1.767767,0.0,0.0,1.414214,5.656854,5.656854,1.078579,0.0,1.149049,0.000000,5.656854,5.656854,5.656854,0.0,0.0,1.382370,1.569383,1.767767,61,0.0,0.0,0.0,0.0,0.0,61,0.0,0.000000,0.000000,0.0,0.000000,61,0.0,0.0,0.0,0.0,0.0,96,96,61,39876.0,653.704918,2.880731e+06,8323.0,2.596388,61,2.0,0.032787,0.064499,2.0,7.745967,96,24.0,0.25,0.4375,2.0,2.645751,96,0.0,0.00,0.0000,0.0,0.000000,96,0.0,0.0,0.0,0.0,0.0,96,478512.0,4984.500000,2.395240e+07,14349.0,0.981868,61,1.0,0.016393,0.016125,1.0,7.745967,96,12.0,0.125,0.109375,1.0,2.645751,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.0,-999.0,0.308709,0.308709,0.308709,0.0,0.0,0.0,-999.0,-999.000000,-999.0,-999.0,-999.000000,-999.0,-999.000000,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.000000,-999.000000,94.270833,373.743497,139684.201684,0.613096,0.375887,1.75038,0.0,0.0,1138.75,1252.459172,1.568654e+06,-999.0,-999.0,-999.0,0.0,1.0,11.0,22.0,model_2,1.0
14996,0.0,0.0,0.0,0.0,76.0,0.0,2.0,2.0,28.0,28.0,0.0,2.0,456.0,378.0,0.0,2.0,0.0,2.0,0.0,122.0,66.0,0.0,0.0,156.0,0.0,0.0,0.0,746.0,0.0,28.0,470.0,0.0,2.0,2.0,0.0,0.0,368.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,76.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,28.0,28.0,0.0,0.0,0.0,2.0,456.0,378.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,122.0,0.0,38.0,0.0,0.0,0.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,156.0,0.0,0.0,0.0,388.0,358.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,22.0,2.0,40.0,168.0,252.0,10.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,174.0,194.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,9.0,0.0,1.0,1.0,2.0,2.0,0.0,1.0,22.0,20.0,0.0,1.0,0.0,1.0,0.0,14.0,13.0,0.0,0.0,8.0,0.0,0.0,0.0,41.0,0.0,2.0,191.0,0.0,1.0,1.0,0.0,0.0,23.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,2.0,0.0,0.0,0.0,1.0,22.0,20.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,14.0,0.0,12.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,21.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,72.0,0.0,0.0,208.0,0.0,13.0,137.0,0.0,1.0,1.0,0.0,0.0,136.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,1.0,9.0,6.0,0.0,0.0,0.0,0.0,0.0,4.0,2.0,0.0,0.0,6.0,0.0,0.0,12.0,0.0,2.0,66.0,0.0,1.0,1.0,0.0,0.0,12.0,0.0,1.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.299357,0.299357,0.000000,0.029385,6.514233,2.767677,0.000000,0.000000,0.0,0.000000,0.0,1.289256,0.365473,0.0,0.0,2.209366,0.000000,0.000000,10.150597,0.0,0.299357,202.189164,0.000000,0.029385,0.029385,0.0,0.0,7.500459,0.000000,0.198347,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,1.388882,1.388882,0.000000,5.656854,0.543393,0.499090,0.000000,0.000000,0.0,0.000000,0.0,0.892143,1.424996,0.0,0.0,0.681264,0.000000,0.000000,0.505471,0.0,1.388882,3.425093,0.000000,5.656854,5.656854,0.0,0.0,0.664537,0.000000,1.632993,0,-999.0,-999.0,-999.0,-999.0,-999.0,0,-999.0,-999.000000,-999.000000,-999.0,-999.000000,0,-999.0,-999.0,-999.0,-999.0,-999.0,0,0,0,-999.0,-999.000000,-9.990000e+02,-999.0,-999.000000,0,-999.0,-999.000000,-999.000000,-999.0,-999.000000,0,-999.0,-999.00,-999.0000,-999.0,-999.000000,0,-999.0,-999.00,-999.0000,-999.0,-999.000000,0,-999.0,-999.0,-999.0,-999.0,-999.0,0,-999.0,-999.000000,-9.990000e+02,-999.0,-999.000000,0,-999.0,-999.000000,-999.000000,-999.0,-999.000000,0,-999.0,-999.000,-999.000000,-999.0,-999.000000,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.0,-999.0,-999.000000,-999.000000,-999.000000,-999.0,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.000000,-999.0,-999.000000,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.00000,-999.0,-999.0,-999.00,-999.000000,-9.990000e+02,-999.0,-999.0,-999.0,0.0,1.0,3.0,30.0,model_0,1.0
14997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,30.0,0.0,0.0,262.0,184.0,0.0,0.0,0.0,0.0,0.0,56.0,8.0,0.0,0.0,64.0,0.0,0.0,0.0,306.0,0.0,30.0,0.0,0.0,2.0,2.0,0.0,0.0,360.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,30.0,0.0,0.0,0.0,0.0,262.0,184.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,56.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,0.0,0.0,0.0,166.0,140.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,178.0,182.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,16.0,13.0,0.0,0.0,0.0,0.0,0.0,5.0,2.0,0.0,0.0,4.0,0.0,0.0,0.0,21.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,25.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,16.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,11.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.0,0.0,15.0,0.0,0.0,1.0,1.0,0.0,0.0,6.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,1.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.247934,0.247934,0.000000,0.000000,2.486685,0.082645,0.000000,0.000000,0.0,0.000000,0.0,0.674013,0.106520,0.0,0.0,0.000000,0.000000,0.000000,0.249770,0.0,0.247934,0.000000,0.000000,0.029385,0.029385,0.0,0.0,0.209366,0.000000,0.082645,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,1.095445,1.095445,0.000000,0.000000,1.000739,3.162278,0.000000,0.000000,0.0,0.000000,0.0,0.967587,2.692582,0.0,0.0,0.000000,0.000000,0.000000,3.298485,0.0,1.095445,0.000000,0.000000,5.656854,5.656854,0.0,0.0,2.516611,0.000000,3.162278,21,0.0,0.0,0.0,0.0,0.0,21,0.0,0.000000,0.000000,0.0,0.000000,21,0.0,0.0,0.0,0.0,0.0,24,24,21,8.0,0.380952,1.473923e+00,5.0,3.186887,21,0.0,0.000000,0.000000,0.0,0.000000,24,0.0,0.00,0.0000,0.0,0.000000,24,0.0,0.00,0.0000,0.0,0.000000,24,0.0,0.0,0.0,0.0,0.0,24,96.0,4.000000,1.000000e+00,5.0,0.250000,21,0.0,0.000000,0.000000,0.0,0.000000,24,0.0,0.000,0.000000,0.0,0.000000,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.204545,0.0,0.0,0.0,3.0,-999.000000,-999.000000,-999.000000,-999.0,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.000000,-999.0,-999.000000,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.00000,-999.0,-999.0,-999.00,-999.000000,-9.990000e+02,5.0,0.0,0.0,0.0,1.0,24.0,9.0,model_0,0.0
14998,34.0,0.0,0.0,90.0,18.0,0.0,0.0,4.0,64.0,68.0,2.0,6.0,154.0,116.0,72.0,0.0,0.0,0.0,0.0,92.0,48.0,0.0,0.0,152.0,82.0,80.0,0.0,172.0,0.0,48.0,0.0,0.0,2.0,2.0,0.0,0.0,52.0,128.0,0.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0,16.0,0.0,16.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,64.0,68.0,2.0,0.0,2.0,4.0,154.0,116.0,70.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,92.0,0.0,46.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,152.0,82.0,80.0,0.0,104.0,68.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,26.0,26.0,128.0,0.0,0.0,4.0,0.0,0.0,12.0,3.0,0.0,0.0,1.0,4.0,5.0,1.0,1.0,16.0,10.0,14.0,0.0,0.0,0.0,0.0,9.0,7.0,0.0,0.0,10.0,15.0,16.0,0.0,15.0,0.0,5.0,0.0,0.0,1.0,1.0,0.0,0.0,12.0,10.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,5.0,1.0,0.0,1.0,1.0,16.0,10.0,14.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,7.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,15.0,16.0,0.0,8.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,34.0,19.0,20.0,14.0,0.0,5.0,0.0,0.0,1.0,1.0,0.0,0.0,5.0,29.0,0.0,2.0,0.0,0.0,6.0,1.0,0.0,0.0,1.0,2.0,3.0,1.0,1.0,5.0,2.0,3.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,4.0,8.0,8.0,6.0,0.0,3.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,4.0,0.0,0.271809,0.0,0.0,1.504132,0.056933,0.000000,0.000000,0.029385,0.209366,0.348944,0.029385,0.056933,1.355372,0.319559,0.525253,0.000000,0.0,0.000000,0.0,1.329660,1.158861,0.0,0.0,1.241506,2.850321,3.026630,1.395776,0.0,0.370983,0.000000,0.000000,0.029385,0.029385,0.0,0.0,0.128558,1.197429,0.000000,1.720465,0.0,0.0,3.372684,3.937004,0.000000,0.000000,5.656854,2.516611,2.784798,5.656854,3.937004,1.067187,2.072751,2.174229,0.000000,0.0,0.000000,0.0,1.902630,2.089685,0.0,0.0,1.081457,2.932292,2.870540,2.784798,0.0,4.019950,0.000000,0.000000,5.656854,5.656854,0.0,0.0,2.366432,1.245205,0.000000,0,-999.0,-999.0,-999.0,-999.0,-999.0,187,1.0,0.005348,0.005319,1.0,13.638182,187,0.0,0.0,0.0,0.0,0.0,240,240,187,3.0,0.016043,2.648060e-02,2.0,10.143416,187,0.0,0.000000,0.000000,0.0,0.000000,240,0.0,0.00,0.0000,0.0,0.000000,240,12.0,0.05,0.0475,1.0,4.358899,240,0.0,0.0,0.0,0.0,0.0,240,36.0,0.150000,2.275000e-01,2.0,3.179797,187,37.0,0.197861,0.394006,5.0,3.172425,240,444.0,1.850,0.927500,5.0,0.520577,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.0,-999.0,-999.000000,-999.000000,-999.000000,-999.0,-999.0,-999.0,-999.0,-999.000000,-999.0,-999.0,-999.000000,2.0,0.415675,0.0,0.0,1.0,4.368008,0.0,2.0,0.745792,0.556206,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.00000,-999.0,-999.0,-999.00,-999.000000,-9.990000e+02,-999.0,-999.0,-999.0,1.0,2.0,0.0,0.0,model_4,1.0


## pearson 연관관계 분석을 통한 feature_selection

In [24]:
if global_parameters['pearson_cutoff'] <= 1:
    org_train_full = train_full
    pearson_table = train_full.corr(method='pearson')
    pearson = []
    for i in train_full.columns:
        for j in train_full.columns:
            if (i >= j):
                continue
            if str(train_full[i].dtype) == 'category' or str(train_full[j].dtype) == 'category':
                continue
            pearson.append((i,j,pearson_table.loc[i,j]))
    pearson = sorted(pearson, key=lambda x: abs(x[2]), reverse=True)

    remove_columns = set()
    fix_columns = set()
    for i in pearson:
        # 이미 제거되었으면 넘기기
        if (i[0] in remove_columns or i[1] in remove_columns):
            continue
        if (abs(i[2]) >= global_parameters['pearson_cutoff']):
            remove_columns.add(i[1])

    print(len(train_full.columns), '개의 칼럼을', len(train_full.columns) - len(remove_columns), '개로 압축')
    train_full = train_full[train_full.columns.difference(list(remove_columns))]
    test = test[test.columns.difference(list(remove_columns))]


In [25]:
#train_full 데이터를 train, valid로 나눔 (valid는 점수 검증용으로)
def make_train_val(data, r=0.8):
    min_user_id = data.index.min()
    max_user_id = data.index.max()
    m = int((1 - r) * min_user_id + r * max_user_id)
    return data.loc[data.index < m], data.loc[data.index >= m]

train, valid = make_train_val(train_full)

In [26]:
def reset_bad_features():
    global bad_features
    bad_features = []

def add_bad_features(col):
    global bad_features
    bad_features.append(col)
    bad_features = list(set(bad_features))
    
reset_bad_features()

# 2. 모델 생성

- LGBM 모델을 사용하여 주어진 데이터를 학습함.
- 앙상블 학습을 통해 예측 성능 향상
- StratifiedKFold를 사용하여 valid set을 만들때 label 비율 유지
- training data에 대해서는 SMOTENC라는 oversampling 기법을 사용하여 데이터의 불균형을 해소
    - SMOTENC는 SMOTE에서 범주형 데이터를 사용할 수 있도록 수정된 알고리즘
- 학습에 충분한 시간이 있다면 global_parameters['lgbm_seed_ensemble']의 값을 늘려 seed의 다양성을 주어 성능을 향상시킬 수 있음

In [27]:
def reset_fold_cache():
    global fold_cache
    fold_cache = {}
reset_fold_cache()

In [28]:
# LGBM + Soft voting ensemble model (sklearn interface)
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
from multiprocessing import Pool 
import workers

class LGBMEnsembleModel(object):
    def __init__(self, params, smote=True, folds=10, early_stopping_rounds=200, random_state=42):
        self.early_stopping_rounds = early_stopping_rounds
        self.params = params
        self.smote = smote
        self.folds = folds
        self.random_state = random_state
        self.models = []
        self.cv_scores = []
            
    def get_params(self, deep=True):
        return {'params': self.params, 'folds': self.folds, 'random_state': self.random_state, 'early_stopping_rounds': self.early_stopping_rounds}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def fit(self, X, y):
        self._default_params()
        self.models = []
        self.cv_scores = []
        
        datas = []
        original_seed = self.random_state
        for i in range(0, global_parameters['lgbm_seed_ensemble']):
            datas += self.load_fold_cache(X, y)
            self.random_state += 10000
        self.random_state = original_seed
        
        for i in datas:
            train_x = i['train_x']
            train_y = i['train_y']
            valid_x = i['valid_x']
            valid_y = i['valid_y']
            d_train = lgb.Dataset(train_x, train_y, silent=True, free_raw_data=False, params={'verbose': -1})
            d_val  = lgb.Dataset(valid_x, valid_y, silent=True, free_raw_data=False, params={'verbose': -1})
            clf = lgb.train(params=self.params, train_set=d_train, valid_sets=[d_train, d_val], verbose_eval=False, early_stopping_rounds=self.early_stopping_rounds)
            val_pred = clf.predict(valid_x)
            auc_score = roc_auc_score(valid_y, val_pred)
            self.cv_scores.append(auc_score)
            self.models.append(clf)
        return self
    
    def predict(self, X):
        y_pred = []
        for model in self.models:
            y_pred.append(model.predict(X))
            
        pred_ensemble = np.mean(y_pred, axis = 0)
        return pred_ensemble

    def score(self, X, y):
        val_pred = self.predict(X)
        auc_score = roc_auc_score(y, val_pred)
        return auc_score
    
    def _default_params(self):
        int_params = ['max_depth', 'num_leaves', 'num_iterations', 'min_data_in_leaf', 'max_bin', 'min_data_in_bin', 'n_estimators']
        for i in int_params:
            if i in self.params:
                self.params[i] = int(self.params[i])
            
        self.params['seed'] = 1015
        self.params['metric'] = 'auc'
        self.params['objective'] = 'binary'
        self.params['num_iterations'] = 1000000
        self.params['verbose'] = -1
        self.params['num_threads'] = 20
        
    def load_fold_cache(self, X, y):
        global fold_cache
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
            
        if isinstance(y, np.ndarray):
            y = pd.DataFrame(y)
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)
        hashcode = str(self.random_state) + str(X.shape) + str(self.folds) + str(X.columns) + str(X.iloc[0]) + pd.util.hash_pandas_object(X).astype(str).sum()
        if hashcode not in fold_cache:
            categorical_features = [X.columns.get_loc(i) for i in X.select_dtypes(exclude=['int', 'int64', 'float']).columns]
            output = []
            folds = StratifiedKFold(n_splits=self.folds, shuffle=True, random_state=self.random_state)
            if global_parameters['multiprocessing_for_smote']:
                task = []
                for train_idx, val_idx in folds.split(X, y):
                    train_x = X.loc[train_idx]
                    train_y = y.loc[train_idx]
                    valid_x = X.loc[val_idx]
                    valid_y = y.loc[val_idx]
                    if len(categorical_features) > 0:
                        smote = SMOTENC(random_state=self.random_state, categorical_features = categorical_features, n_jobs = 20)
                    else:
                        smote = SMOTE(random_state=self.random_state, n_jobs = 20)
                    task.append(({'train_x': train_x,'train_y': train_y, 'valid_x': valid_x, 'valid_y': valid_y}, smote))
                p=Pool(processes = 10) 
                output = p.map(workers.worker,task)
            else:
                for train_idx, val_idx in folds.split(X, y):
                    train_x = X.loc[train_idx]
                    train_y = y.loc[train_idx]
                    valid_x = X.loc[val_idx]
                    valid_y = y.loc[val_idx]
                    smote = SMOTENC(random_state=self.random_state, categorical_features = categorical_features, n_jobs = 20)
                    train_x, train_y = smote.fit_sample(train_x, train_y)
                    output.append({'train_x': train_x,'train_y': train_y, 'valid_x': valid_x, 'valid_y': valid_y})
            fold_cache[hashcode] = output
        return fold_cache[hashcode]

In [29]:
from sklearn.ensemble import RandomForestClassifier
from eli5.sklearn import PermutationImportance

def lgbm_train(data, params = {}, valid=valid):
    if 'boosting_list' in params:
        temp = params['boosting_list']
        for i in temp.keys():
            params[i] = temp[i]
        del(params['boosting_list'])

    x = data[data.columns.difference(['problem'] + bad_features)]
    y = data['problem']
    
    test_x = valid.loc[:, valid.columns.difference(['problem'] + bad_features)]
    test_y = valid['problem']
    clf = LGBMEnsembleModel(params, folds=global_parameters['nfold'])
    clf.fit(x,y)
    return {'model': clf, 'auc': clf.cv_scores, 'test auc': clf.score(test_x, test_y)}

# 3. 스코어 확인용 함수 생성

- 앙상블 러닝에서 이미 kfold를 사용하였기 때문에, CV를 한번 더 적용한 중첩 k-fold를 이용하여 모델 성능을 측정함

In [30]:
def global_score(params = {}, use_tqdm = True):
    scores = []
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    if use_tqdm:
        for train_idx, val_idx in tqdm(folds.split(train_full, train_full['problem']), leave=False):
            scores.append(lgbm_train(train_full.loc[train_idx].reset_index(), params=params, valid=train_full.loc[val_idx].reset_index())['test auc'])
    else:
        for train_idx, val_idx in folds.split(train_full, train_full['problem']):
            scores.append(lgbm_train(train_full.loc[train_idx].reset_index(), params=params, valid=train_full.loc[val_idx].reset_index())['test auc'])
    return np.mean(scores)

In [31]:
# 기본 성능 테스트 퀄리티 - ( + var + cv -std)  +++ 낮 / 밤 (sum, max, var , cv) + 퀄리티 세트 수정
import eli5
reset_bad_features()
add_bad_features('old_ver_days')
r = lgbm_train(train)
print("deafult CV - AUC", r['auc'])
print("deafult Test - AUC", r['test auc'])
print("global AUC", global_score())

0it [00:00, ?it/s]

deafult CV - AUC [0.8545239503252513, 0.8337792045657214, 0.8540733857114696, 0.8324713156173831, 0.8484218750000001, 0.834846875, 0.8255718750000001, 0.8263375, 0.826296875, 0.8485307017543859]
deafult Test - AUC 0.8553510005084796


                  

KeyboardInterrupt: 

In [32]:
# 기본 성능 테스트 퀄리티 - ( + var + cv -std)  +++ 낮 / 밤 (sum, max, var , cv) + 퀄리티 세트 수정
import eli5
reset_bad_features()
add_bad_features('old_ver_days')
r = lgbm_train(train)
print("deafult CV - AUC", r['auc'])
print("deafult Test - AUC", r['test auc'])
print("global AUC", global_score())

0it [00:00, ?it/s]

deafult CV - AUC [0.8525621169027436, 0.8374243974480521, 0.8569864110964052, 0.8271709235635907, 0.84629375, 0.8279062500000001, 0.8246875, 0.8304624999999999, 0.831340625, 0.8492387218045112]
deafult Test - AUC 0.8533270521141786


                        

global AUC 0.8460532000000001




In [None]:
# 기본 성능 테스트 퀄리티 - ( + var + cv -std)  +++ 낮 / 밤 (sum, max, var , cv)
import eli5
reset_bad_features()
add_bad_features('old_ver_days')
r = lgbm_train(train)
print("deafult CV - AUC", r['auc'])
print("deafult Test - AUC", r['test auc'])
print("global AUC", global_score())

In [33]:
# 기본 성능 테스트 퀄리티 - + var + cv -std
import eli5
reset_bad_features()
add_bad_features('old_ver_days')
r = lgbm_train(train)
print("deafult CV - AUC", r['auc'])
print("deafult Test - AUC", r['test auc'])
print("global AUC", global_score())

0it [00:00, ?it/s]

deafult CV - AUC [0.8525339566143825, 0.8362823413089528, 0.8520536672517748, 0.8281940807073864, 0.84430625, 0.83075625, 0.81924375, 0.828509375, 0.8175593749999999, 0.8455388471177945]
deafult Test - AUC 0.8521939401190441


                        

global AUC 0.8439278




In [None]:
# 기본 성능 테스트 30 에러 + 분산 + cv - std - old_ver_days
import eli5
reset_bad_features()
add_bad_features('old_ver_days')
r = lgbm_train(train)
print("deafult CV - AUC", r['auc'])
print("deafult Test - AUC", r['test auc'])
print("global AUC", global_score())

In [148]:
# 기본 성능 분산 + cv - std - 30에러
import eli5
reset_bad_features()
for col in train_full.columns:
    if col.startswith('custom_'):
        add_bad_features(col)
print(bad_features)
r = lgbm_train(train)
print("deafult CV - AUC", r['auc'])
print("deafult Test - AUC", r['test auc'])
print("global AUC", global_score())

['custom_max_30(밤)', 'custom_std_30(밤)', 'custom_sum_30(밤)']


0it [00:00, ?it/s]

deafult CV - AUC [0.8530345839630287, 0.8318987230873689, 0.8499885794386091, 0.8302841998879846, 0.84728125, 0.8346750000000001, 0.8183250000000001, 0.829303125, 0.81930625, 0.8517606516290727]
deafult Test - AUC 0.8513180590035793


                        

global AUC 0.8437254




In [145]:
# 기본 성능 테스트 30 에러 + 분산 + cv - std
import eli5
reset_bad_features()
r = lgbm_train(train)
print("deafult CV - AUC", r['auc'])
print("deafult Test - AUC", r['test auc'])
print("global AUC", global_score())

0it [00:00, ?it/s]

deafult CV - AUC [0.8530345839630287, 0.8318987230873689, 0.8499885794386091, 0.8302841998879846, 0.84728125, 0.8346750000000001, 0.8183250000000001, 0.829303125, 0.81930625, 0.8517606516290727]
deafult Test - AUC 0.8513180590035793


                        

global AUC 0.8437402




In [130]:
# 기본 성능 테스트 30 에러 + 분산 + cv
import eli5
reset_bad_features()
r = lgbm_train(train)
print("deafult CV - AUC", r['auc'])
print("deafult Test - AUC", r['test auc'])
print("global AUC", global_score())

0it [00:00, ?it/s]

deafult CV - AUC [0.8552655045854336, 0.8366891010297278, 0.8489403909273807, 0.8242328668112229, 0.843565625, 0.833846875, 0.8181906250000001, 0.8244843749999999, 0.81959375, 0.8451942355889723]
deafult Test - AUC 0.8517796787605061


                        

global AUC 0.8436326




In [115]:
# 기본 성능 테스트 30 에러 + 분산
import eli5
reset_bad_features()
r = lgbm_train(train)
print("deafult CV - AUC", 4133141313413314r['auc'])
print("deafult Test - AUC", r['test auc'])
print("global AUC", global_score())

0it [00:00, ?it/s]

deafult CV - AUC [0.8492642342435365, 0.8336790790959923, 0.8491296906435877, 0.8350996091977759, 0.8412968749999998, 0.8340281250000001, 0.816225, 0.83214375, 0.81268125, 0.8427224310776942]
deafult Test - AUC 0.8522682180280959


                        

global AUC 0.8434794




In [173]:
# 기본 성능 테스트 30 타입 에러 + 펌웨어 정보를 카테고리컬 데이터로 추가(5개) 사용일
import eli5
reset_bad_features()
r = lgbm_train(train)
print("deafult CV - AUC", r['auc'])
print("deafult Test - AUC", r['test auc'])
print("global AUC", global_score())

0it [00:00, ?it/s]

deafult CV - AUC [0.849398777843485, 0.8319519147431624, 0.849793021880544, 0.8296208686510282, 0.85005, 0.8309593750000002, 0.815925, 0.830121875, 0.812746875, 0.8478790726817043]
deafult Test - AUC 0.851156043430144


                        

global AUC 0.8430538000000001




In [142]:
# 기본 성능 테스트 30 타입 에러 + 펌웨어 정보를 카테고리컬 데이터로 추가(5개) 사용시작일자
import eli5
reset_bad_features()
r = lgbm_train(train)
print("deafult CV - AUC", r['auc'])
print("deafult Test - AUC", r['test auc'])
print("global AUC", global_score())

0it [00:00, ?it/s]

deafult CV - AUC [0.8507755030522623, 0.8337353996727148, 0.8496772518061695, 0.8324024793569441, 0.8432562499999999, 0.8299499999999999, 0.8184250000000001, 0.832303125, 0.81343125, 0.842924498746867]
deafult Test - AUC 0.852626147818024


                        

global AUC 0.8427949




In [75]:
# 기본 성능 테스트 30 타입 에러 + 펌웨어 정보를 카테고리컬 데이터로 추가(5개) 정보 없는경우 마지막 펌웨어 복사
import eli5
reset_bad_features()
r = lgbm_train(train)
print("deafult CV - AUC", r['auc'])
print("deafult Test - AUC", r['test auc'])
print("global AUC", global_score())

0it [00:00, ?it/s]

deafult CV - AUC [0.8539638734789534, 0.834154675077206, 0.8502967781501194, 0.8338730721935927, 0.846528125, 0.8365625000000001, 0.8209656249999999, 0.8270906250000001, 0.8171625000000002, 0.8447556390977444]
deafult Test - AUC 0.8512168615838642


                        

global AUC 0.8432131999999999




In [59]:
# 기본 성능 테스트 30 타입 에러 + 펌웨어 정보를 숫자형 데이터로 추가(5개)
import eli5
reset_bad_features()
r = lgbm_train(train)
print("deafult CV - AUC", r['auc'])
print("deafult Test - AUC", r['test auc'])
print("global AUC", global_score())

0it [00:00, ?it/s]

deafult CV - AUC [0.8499275654804928, 0.8298743112462803, 0.8520708763168846, 0.8293924574232086, 0.8453750000000001, 0.8312843750000001, 0.821809375, 0.8292, 0.813825, 0.8455639097744361]
deafult Test - AUC 0.851338497891305


                        

global AUC 0.8431006




In [41]:
# 기본 성능 테스트 30 타입 에러 + 펌웨어 정보를 카테고리컬 데이터로 추가(5개)
import eli5
reset_bad_features()
r = lgbm_train(train)
print("deafult CV - AUC", r['auc'])
print("deafult Test - AUC", r['test auc'])
print("global AUC", global_score())

0it [00:00, ?it/s]

deafult CV - AUC [0.856085281868842, 0.8313793222131484, 0.8473868816861128, 0.8277810631447533, 0.8445687499999999, 0.828934375, 0.818828125, 0.8291999999999999, 0.81148125, 0.8388533834586467]
deafult Test - AUC 0.8509207469665699


                        

global AUC 0.8431557999999999




In [75]:
# 기본 성능 테스트 30타입 에러만 추가
import eli5
reset_bad_features()
r = lgbm_train(train)
print("deafult CV - AUC", r['auc'])
print("deafult Test - AUC", r['test auc'])
print("global AUC", global_score())

0it [00:00, ?it/s]

deafult CV - AUC [0.8514482210520059, 0.8327028557661319, 0.8509084821917465, 0.8298868269299966, 0.8446, 0.8323656249999999, 0.8194062499999999, 0.8329625, 0.815109375, 0.8438784461152882]
deafult Test - AUC 0.8519845661472198


                        

global AUC 0.8431074000000001




# - 하이퍼 파라미터를 튜닝하지 않을 땐 바로 csv 생성 단계로 건너뛰기

# 4. (옵션) 파라미터 튜닝 함수

## PermutationImportance를 통한 bad_features 탐색

- 'model_nm'은 범주형 데이터라 임시로 제외됨
- PermutationImportance 를 통해 변수가 정확도에 얼마나 영향을 주는지 측정할 수 있음
- (-) 값인 경우 해당 columns의 data를 섞었을 때 더 정확도가 높게 나온 것임
- 이 정확도를 기반으로 logspace를 통해 적절한 score cutoff 지점을 찾음

- 결과 파일을 직접 생성하고 싶을 땐 아래 코드 실행

In [70]:
from eli5.sklearn import PermutationImportance
'''
clf = LGBMEnsembleModel(params = {}, folds=global_parameters['nfold'])
train_full_x = train_full[train_full.columns.difference(['problem', 'model_nm'] + bad_features)]
train_full_y = train_full['problem']
perm = PermutationImportance(clf, random_state = 42, n_iter = 10, cv=10).fit(train_full_x, train_full_y)
perm_df = pd.DataFrame()
perm_df['column'] = train_full_x.columns
perm_df['score'] = perm.feature_importances_
perm_df['score_std'] = perm.feature_importances_std_
perm_df.to_csv('perm_df.csv', index=False)
'''

In [95]:
perm_df = pd.read_csv('perm_df.csv')

In [198]:
eli5.show_weights(perm, top = 50, feature_names = train_full_x.columns.tolist())

Weight,Feature
0.0742  ± 0.0157,max_17
0.0141  ± 0.0101,sum_32_2
0.0097  ± 0.0073,max_22_terminate by peer user
0.0088  ± 0.0050,max_29
0.0069  ± 0.0067,10_32_2
0.0055  ± 0.0052,used_days
0.0031  ± 0.0035,max_23
0.0029  ± 0.0037,max_22_standby
0.0029  ± 0.0040,max_11
0.0024  ± 0.0038,max_15


In [None]:
cutoff_space = []
cutoff_result = []
# 만약 1000개의 col이 있으면 log를 통해 cut off 지점을 정함 (ex [1,2,3,5,8,10,25,66,150,450,899])
for i in np.logspace(0, np.log10(len(perm_df)), num=50, endpoint=False, base=10.0)[35:]:
    # i를 날림
    # cutoff 지점에 해당하는 score 값을 입력
    
    cutoff = perm_df.sort_values(by='score')['score'].values[int(i)]
    if last_cutoff == cutoff:
        continue
    else:
        last_cutoff = cutoff
    cutoff_space.append(cutoff)

last_cutoff = 0
for cutoff in tqdm(cutoff_space):
    remove_columns = perm_df.loc[perm_df['score'] < cutoff]['column'].values
    reset_bad_features()
    for col in remove_columns:
        add_bad_features(col)
    score = global_score()
    cutoff_result.append(score)
    print(cutoff, score)

In [193]:
for i in range(0, len(cutoff_result)):
    print(i, cutoff_result[i])

## 하이퍼 파라미터 검색
- hyperout 라이브러리를 사용


In [26]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
save_params = []
def _find_hyper_params(params):
    score = global_score(params, use_tqdm=False)
    save_params.append([score, params])
    return -score
def find_hyper_params(trials, max_evals=10):
    boosting_list = [
        {'boosting': 'gbdt'},
        {'boosting': 'goss', 'top_rate': hp.uniform('top_rate', 0, 0.5), 'other_rate': hp.uniform('other_rate', 0, 0.5)}
    ]

    search_space = {
            #'boosting_list' : hp.choice('boosting_list', boosting_list),
            'num_iterations':  hp.quniform("num_iterations", 100, 1000, 1),
            'max_depth':  hp.quniform("max_depth", 6, 20, 1),
            'num_leaves':  hp.quniform("num_leaves", 20, 150, 1),
            'max_bin': hp.quniform('max_bin', 64, 512, 1),

            'min_data_in_leaf': hp.quniform('min_data_in_leaf', 10, 1000, 1),
            #'min_data_in_bin': hp.quniform('min_data_in_bin', 1, 256, 1),
            #'min_gain_to_split' : hp.quniform('min_gain_to_split', 0.1, 5, 0.01),

            #'lambda_l1' : hp.uniform('lambda_l1', 0, 5),
            #'lambda_l2' : hp.uniform('lambda_l2', 0, 5),
            'n_estimators': hp.quniform('n_estimators', 500, 2000, 1),
            'feature_fraction' : hp.quniform('feature_fraction', 0.7, 1, 0.01),
            'bagging_fraction' : hp.quniform('bagging_fraction', 0.7, 1, 0.01),
            'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)), 
    }

    best=fmin(fn=_find_hyper_params, # function to optimize
              space=search_space, 
              algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
              max_evals=max_evals, # maximum number of iterations
              trials=trials, # logging
              rstate=np.random.RandomState(42) # fixing random state for the reproducibility
             )
                               

    #best['boosting'] = boosting_list[best['boosting_list']]['boosting']
    #del(best['boosting_list'])
    return best

In [27]:
trials = Trials()

In [None]:
reset_bad_features()
n_iters = 200 # 이 횟수동안 파라미터를 변경하면서 최적의 값을 찾음
best_params = find_hyper_params(trials, n_iters)
r = lgbm_train(train, params=best_params)
print("hyperopt CV - AUC", r['auc'])
print("hyperopt Test - AUC", r['test auc'])
print("hyperopt global score - AUC", global_score(best_params))

In [35]:
sorted(save_params, key=lambda x: x[0], reverse=True)[0]

[0.8442474000000001,
 {'bagging_fraction': 0.77,
  'feature_fraction': 0.87,
  'learning_rate': 0.020031706952922917,
  'max_bin': 280,
  'max_depth': 15,
  'min_data_in_leaf': 113,
  'n_estimators': 1917,
  'num_iterations': 1000000,
  'num_leaves': 55,
  'seed': 1015,
  'metric': 'auc',
  'objective': 'binary',
  'verbose': -1,
  'num_threads': 20}]

# 5. 제출용 CSV 생성
- 사용 가능한 모든 training data를 사용함

In [173]:
'''
remove_columns = perm_df.loc[perm_df['score'] < cutoff_space[11]]['column'].values
reset_bad_features()
for col in remove_columns:
    add_bad_features(col)
'''

In [179]:
# sorted(save_params, key=lambda x: x[0], reverse=True)

In [33]:
print(bad_features)

['old_ver_days']


In [32]:
sample_submssion = pd.read_csv(PATH+'sample_submission.csv')
submit_model = lgbm_train(train_full)['model']

sample_submssion['problem'] = submit_model.predict(test[test.columns.difference(['problem'] + bad_features)])
sample_submssion.to_csv("LGBM (최종).csv", index = False)