지도학습 기초 - 1. 분류

(1) 환경 설정

In [None]:
# 1. 패키지 임포트

#from C510_01_supervised_learning_classifier import *

import numpy
import os
import pandas
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns
import warnings
from IPython.display import display

In [None]:
%matplotlib inline
plt.rcParams['font.size'] = 20
plt.rcParams['figure.figsize'] = (50, 10)
plt.rcParams['lines.linewidth'] = 5
plt.rcParams['axes.grid'] = True

In [None]:
%%html
<style>
    /* Lay tables out as inline-block and left-align text in rendered table cells. */
    table { display: inline-block}
    .rendered_html td, .rendered_html th { text-align: left;}
</style>    

In [None]:
# 2. Conditions and parameter settings
#
# Every pipeline step in this notebook receives this single dictionary, so
# all file locations and cutoffs are declared here.

dict_args = {}

# Input settings
# Built with os.path.join so the literal contains no backslash escapes
# ('\d' and '\c' are invalid escape sequences) and the path also resolves on
# non-Windows hosts.
# NOTE(review): 'creaditcard' looks like a misspelling of 'creditcard' -
# confirm against the actual file name on disk before renaming.
dict_args['base_file'] = os.path.join('..', 'data', 'creaditcard.csv')

dict_args['id_col_name'] = 'Time'
dict_args['target_col_name'] = 'Class'

# Output settings
dict_args['train_file'] = 'train_source.csv'
# The validation split is registered under the correctly spelled key that the
# result-check cell reads, and the file name now matches
# 'source_data_file_valid' below (mirroring the train_file /
# source_data_file_train pair). The historical misspelled key is kept as an
# alias because helpers outside this file may still look it up.
dict_args['valid_file'] = 'valid_source.csv'
dict_args['vaild_file'] = dict_args['valid_file']
dict_args['min_max_file'] = 'refs/min_max_file.csv'
# Minimum number of rows required in each class after splitting
dict_args['value_min_rows_count_in_class'] = 3
# Share of rows sampled into the training set (%)
dict_args['train_ratio'] = 80

# Derivation steps are chained: each step's source is the previous output.
dict_args['derived_1_file_source'] = dict_args['train_file']
dict_args['derived_1_file_output'] = 'work/derived_1_file_output.csv'
dict_args['derived_2_file_source'] = dict_args['derived_1_file_output']
dict_args['derived_2_file_output'] = 'work/derived_2_file_output.csv'
dict_args['derived_3_file_source'] = dict_args['derived_2_file_output']
dict_args['derived_output'] = 'work/derived_output.csv'

dict_args['information_value'] = 'work/reduce_information_value.csv'
dict_args['feature_importance'] = 'work/reduce_feature_importance.csv'

dict_args['cutoff_feature_importance'] = 0.01
dict_args['cutoff_information_value'] = 0.1

dict_args['source_data_file_train'] = 'train_source.csv'
dict_args['target_data_file_train'] = 'train_output.csv'
dict_args['source_data_file_valid'] = 'valid_source.csv'
# Previously this line re-assigned 'source_data_file_valid', silently
# discarding the value above and never defining the target key.
dict_args['target_data_file_valid'] = 'valid_output.csv'

dict_args['model_file'] = 'zulu_trained_model_xgb.h5'

dict_args['source_file'] = 'source.csv'

# Make sure the output directories used by the paths above exist.
os.makedirs('work', exist_ok=True)
os.makedirs('refs', exist_ok=True)


In [None]:
# 3. Load and inspect the data
# Reads the CSV configured under 'base_file' into a DataFrame and renders it.

df = pandas.read_csv(dict_args['base_file'])
display(df)

(2) 샘플링

In [None]:
# 1. Run stratified sampling
# All warnings are ignored from this point on (the filter stays installed).
# NOTE(review): split_file is not defined in the visible part of this file;
# presumably it comes from the commented-out
# C510_01_supervised_learning_classifier import - confirm.
warnings.filterwarnings('ignore')
split_file(dict_args)

In [None]:
# 2. Check the stratified sampling result
# Loads the train split, the validation split and the min/max reference
# file, renders each, then prints the sum of the target column per split.

df_train = pandas.read_csv(dict_args['train_file'])
# The configuration cell registers the validation split under the key
# 'vaild_file' (sic); looking up 'valid_file' here raised KeyError.
df_vaild = pandas.read_csv(dict_args['vaild_file'])
df_min_max = pandas.read_csv(dict_args['min_max_file'])
display(df_train)
display(df_vaild)
display(df_min_max)
print(
    'Train:', numpy.sum(df_train[dict_args['target_col_name']]),
    ', Valid:', numpy.sum(df_vaild[dict_args['target_col_name']]),
)

(3) 피처 엔지니어링

In [None]:
# 1. Generate mathematical derived variables
# The two steps run in this order; per the configuration, the second step's
# source file is the first step's output file.
# NOTE(review): variable_derive_math / variable_derive_group are not defined
# in the visible part of this file; presumably they come from the
# commented-out C510_01_supervised_learning_classifier import - confirm.

variable_derive_math(dict_args)
variable_derive_group(dict_args)

In [None]:
# 1-1. Check the generated mathematical derived variables
# Loads both derived output files and renders them.

df_derived_1 = pandas.read_csv(dict_args['derived_1_file_output'])
df_derived_2 = pandas.read_csv(dict_args['derived_2_file_output'])
display(df_derived_1)
display(df_derived_2)

# For each derived frame that contains the target column, print the frame
# label together with the target column name.
for _label, _frame in (('df_derived_1', df_derived_1), ('df_derived_2', df_derived_2)):
    if dict_args['target_col_name'] in _frame.columns:
        print(_label, dict_args['target_col_name'])

In [None]:
# 2. Generate normalized data
# NOTE(review): variable_derive_normalization is not defined in the visible
# part of this file; presumably it reads 'derived_3_file_source' and writes
# 'derived_output' - confirm against its implementation.

variable_derive_normalization(dict_args)

In [None]:
# 2-1. Check the normalized data
# Renders the file configured as 'derived_3_file_source' and then the file
# configured as 'derived_output' (presumably the normalization input and
# output respectively - confirm).

derived_3_file_source = pandas.read_csv(dict_args['derived_3_file_source'])
display(derived_3_file_source)
df_derived_3 = pandas.read_csv(dict_args['derived_output'])
display(df_derived_3)
# print(list(df_derived_3.columns))

In [None]:
# 3. Information Value (IV)
# All warnings are ignored while the step runs (the filter stays installed).
# NOTE(review): reduce_information_value is not defined in the visible part
# of this file; presumably it writes the file configured under
# 'information_value' - confirm.

warnings.filterwarnings('ignore')
reduce_information_value(dict_args)

In [None]:
# 3-1. Check the Information Value (IV) result
# Loads the CSV configured under 'information_value' and renders it.

df_iv = pandas.read_csv(dict_args['information_value'])
display(df_iv)

In [None]:
# 3-2. Apply the Information Value (IV) result (first-pass feature reduction)
# Keeps the id column, the target column and every variable whose IV exceeds
# the configured cutoff, then saves and renders the reduced frame.

df_iv = pandas.read_csv(dict_args['information_value'])
# Strictly greater than the cutoff: variables exactly at the cutoff are dropped.
df_iv = df_iv[df_iv['IV_VALUE'] > dict_args['cutoff_information_value']]
# The id and target columns always come first, followed by the selected variables.
list_reduceed_columns = [
    dict_args['id_col_name'],
    dict_args['target_col_name'],
    *df_iv['VAR_NAME'].tolist(),
]
df_derived_3 = pandas.read_csv(dict_args['derived_output'])
df_derived_3 = df_derived_3[list_reduceed_columns]
# index=False is the documented way to leave the row index out of the CSV
# (the previous index=None only worked because None is falsy).
df_derived_3.to_csv('work/reduced.csv', index=False)
display(df_derived_3)

In [None]:
# 4. Feature Importance

dict_args['derived_output'] = 'work/'