# 김해시 화재 예측모델 개발
## 1. 데이터 로드

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model, preprocessing, svm
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
import math
import matplotlib
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings; warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Load the training and test splits from the local ./data directory.
df_train = pd.read_csv('./data/PJT002_train.csv')
# Loading of the validation split is currently disabled.
#df_val = pd.read_csv('./data/PJT002_validation.csv')
df_test = pd.read_csv('./data/PJT002_test.csv')

In [3]:
# Move the target column `fr_yn` to the last position of the training frame:
# pop() removes it, and assigning it back appends it as the final column.
df_train['fr_yn'] = df_train.pop('fr_yn')
df_train.head()

Unnamed: 0,dt_of_fr,bldng_us,bldng_archtctr,bldng_cnt,bldng_ar,ttl_ar,lnd_ar,dt_of_athrztn,ttl_grnd_flr,ttl_dwn_flr,...,bldng_cnt_in_50m,trgt_crtr,fr_fghtng_fclt_spcl_css_5_yn,fr_fghtng_fclt_spcl_css_6_yn,us_yn,dngrs_thng_yn,slf_fr_brgd_yn,blk_dngrs_thng_mnfctr_yn,cltrl_hrtg_yn,fr_yn
0,2017-10-20 05:54,단독주택,블록구조,3,69.42,69.42,0.0,1977.0,1.0,0.0,...,0,,,,,,,,,Y
1,2018-09-30 08:26,,,3,46.29,46.29,0.0,,1.0,0.0,...,0,,,,,,,,,N
2,2016-10-30 14:57,공동주택,철근콘크리트구조,1,583.8,2516.76,1446.0,20001100.0,5.0,0.0,...,14,,,,,,,,,Y
3,2016-06-14 05:23,단독주택,일반목구조,2,48.92,48.92,0.0,1936.0,1.0,0.0,...,11,,,,,,,,,N
4,2018-04-22 05:38,,,2,0.0,0.0,0.0,,,,...,0,,,,,,,,,N


In [4]:
df_test.head()

Unnamed: 0,dt_of_fr,bldng_us,bldng_archtctr,bldng_cnt,bldng_ar,ttl_ar,lnd_ar,dt_of_athrztn,ttl_grnd_flr,ttl_dwn_flr,...,bldng_cnt_in_50m,trgt_crtr,fr_fghtng_fclt_spcl_css_5_yn,fr_fghtng_fclt_spcl_css_6_yn,us_yn,dngrs_thng_yn,slf_fr_brgd_yn,blk_dngrs_thng_mnfctr_yn,cltrl_hrtg_yn,fr_yn
0,2016-02-03 15:28,,,3,0.0,0.0,0.0,,,,...,0,,,,,,,,,
1,2016-03-17 18:25,공장,일반철골구조,2,915.75,903.75,2660.0,20011228.0,1.0,0.0,...,8,,,,,,,,,
2,2018-12-03 06:51,,,5,5523.87,9888.87,13607.0,,9.0,0.0,...,1,,N,N,N,N,N,N,N,
3,2015-03-29 23:37,,,8,667.2,914.4,0.0,,3.0,0.0,...,0,,,,,,,,,
4,2016-05-17 11:08,,,7,1050.06,1050.33,0.0,,3.0,0.0,...,4,,,,,,,,,


In [5]:
df_train.shape

(59199, 180)

In [6]:
df_train.info(True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59199 entries, 0 to 59198
Data columns (total 180 columns):
dt_of_fr                        object
bldng_us                        object
bldng_archtctr                  object
bldng_cnt                       int64
bldng_ar                        float64
ttl_ar                          float64
lnd_ar                          float64
dt_of_athrztn                   object
ttl_grnd_flr                    float64
ttl_dwn_flr                     float64
bldng_us_clssfctn               object
tmprtr                          float64
prcpttn                         float64
wnd_spd                         float64
wnd_drctn                       float64
hmdt                            float64
gas_engry_us_201401             float64
ele_engry_us_201401             float64
gas_engry_us_201402             float64
ele_engry_us_201402             float64
gas_engry_us_201403             float64
ele_engry_us_201403             float64
gas_engry_us_2014

In [7]:
# Encode the 'Y'/'N' flag columns as 1/0. Values that are not keys of the
# mapping (including missing values) become NaN.
binary_y = {'N':0, 'Y':1}

# Only the train and test frames are converted; the validation frame
# (df_val) is not loaded in this notebook.
for frame in (df_train, df_test):
    for flag_col in ('fr_yn', 'mlt_us_yn'):
        frame[flag_col] = frame[flag_col].map(binary_y)

## 2. 데이터 가공

### 2.1. 불필요한 피처 제거

In [8]:
def remove_feat(df):
    """Drop, in place, the columns found at a fixed list of positions.

    The positions are hard-coded, so the frame must have at least 179
    columns; with the 180-column frames loaded in this notebook, 20
    columns remain afterwards.

    Args:
        df: DataFrame to prune. It is modified in place.

    Returns:
        None.
    """
    dropped_positions = (
        [1, 3, 6, 11, 12, 13, 14]
        + list(range(16, 152))
        + [153, 157, 158, 161, 163, 164, 166, 168]
        + list(range(170, 179))
    )
    # Translate positions into labels, then drop those labels.
    df.drop(columns=df.columns[dropped_positions], inplace=True)

In [9]:
# Prune the same column positions from both frames (the validation frame is
# not loaded, so it is skipped), then show the remaining column counts.
for frame in (df_train, df_test):
    remove_feat(frame)

(df_train.shape[1], df_test.shape[1])

(20, 20)

In [10]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59199 entries, 0 to 59198
Data columns (total 20 columns):
dt_of_fr                59199 non-null object
bldng_archtctr          31534 non-null object
bldng_ar                59199 non-null float64
ttl_ar                  59199 non-null float64
dt_of_athrztn           31618 non-null object
ttl_grnd_flr            48989 non-null float64
ttl_dwn_flr             48194 non-null float64
bldng_us_clssfctn       29856 non-null object
hmdt                    59177 non-null float64
rgnl_ar_nm              57708 non-null object
lnd_us_sttn_nm          57423 non-null object
rd_sd_nm                57708 non-null object
emd_nm                  59195 non-null object
bldng_ar_prc            37304 non-null float64
fr_wthr_fclt_dstnc      59199 non-null int64
mlt_us_yn               59199 non-null int64
cctv_in_100m            59199 non-null int64
sft_emrgnc_bll_dstnc    59199 non-null int64
no_tbc_zn_dstnc         59199 non-null int64
fr_yn           

### 2.2. 데이터 전처리

#### 2.2.1. 전처리를 위한 함수 정의
##### 모든 피처에 사용되는 함수

In [11]:
# Split a string on single spaces (two-part fallback).
def split_dt1(data):
    """Split ``data`` on ' ' and return the list of parts.

    Args:
        data: Normally a string such as "2017-10-20 05:54".

    Returns:
        ``data.split(' ')`` for strings. For values that cannot be split this
        way (e.g. NaN or None) the placeholder ``['a', 'a']`` is returned so
        that callers unpacking two parts keep working.
    """
    try:
        return data.split(' ')
    except (AttributeError, TypeError):
        # Only the errors raised by non-string input are treated as "missing";
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return ['a', 'a']

# Split a string on single spaces (three-part fallback).
def split_dt2(data):
    """Split ``data`` on ' ' and return the list of parts.

    Args:
        data: Normally a string made of space-separated tokens.

    Returns:
        ``data.split(' ')`` for strings. For values that cannot be split this
        way (e.g. NaN or None) the placeholder ``['a', 'a', 'a']`` is returned
        so that callers unpacking three parts keep working.
    """
    try:
        return data.split(' ')
    except (AttributeError, TypeError):
        # Only the errors raised by non-string input are treated as "missing";
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return ['a', 'a', 'a']

# Split a string on '-' (three-part fallback).
def split_ds(data):
    """Split ``data`` on '-' and return the list of parts.

    Args:
        data: Normally a date string such as "2017-10-20".

    Returns:
        ``data.split('-')`` for strings. For values that cannot be split this
        way (e.g. NaN or None) the placeholder ``['b', 'b', 'b']`` is returned.
    """
    try:
        return data.split('-')
    except (AttributeError, TypeError):
        # Only the errors raised by non-string input are treated as "missing";
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return ['b', 'b', 'b']

# Split a string on ':' (two-part fallback).
def split_dy(data):
    """Split ``data`` on ':' and return the list of parts.

    Args:
        data: Normally a time string such as "05:54".

    Returns:
        ``data.split(':')`` for strings. For values that cannot be split this
        way (e.g. NaN or None) the placeholder ``['c', 'c']`` is returned.
    """
    try:
        return data.split(':')
    except (AttributeError, TypeError):
        # Only the errors raised by non-string input are treated as "missing";
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return ['c', 'c']

In [12]:
# Label-encode a non-numeric column.
def encode_value(df, col_name):
    """Replace ``df[col_name]`` with integer codes from a new LabelEncoder.

    Args:
        df: DataFrame to modify in place.
        col_name: Name of the column to encode.

    Returns:
        The same ``df`` object.

    NOTE(review): a fresh encoder is fitted on every call, so the codes
    produced for two different frames (e.g. train and test) only agree if
    both columns contain the same set of values - confirm this is acceptable.
    """
    df[col_name] = LabelEncoder().fit_transform(df[col_name])
    return df

##### 개별 피처에 사용되는 함수

In [13]:
# Preprocess bldng_archtctr.
def pre_bldng_archtctr(df):
    """Fill missing ``bldng_archtctr`` values and label-encode the column.

    Args:
        df: DataFrame containing a ``bldng_archtctr`` column; modified in place.

    Returns:
        The same ``df`` object.
    """
    # Assign the filled column back rather than calling fillna(inplace=True)
    # on df[...]: the chained in-place form operates on a temporary object
    # and does not reach df when pandas Copy-on-Write is active.
    df['bldng_archtctr'] = df['bldng_archtctr'].fillna('not-declared')
    encode_value(df, 'bldng_archtctr')

    return df

In [14]:
# Preprocess bldng_us_clssfctn.
def pre_bldng_us_clssfctn(df):
    """Fill missing ``bldng_us_clssfctn`` values and label-encode the column.

    Args:
        df: DataFrame containing a ``bldng_us_clssfctn`` column; modified in
            place.

    Returns:
        The same ``df`` object.
    """
    # Assign the filled column back rather than calling fillna(inplace=True)
    # on df[...]: the chained in-place form operates on a temporary object
    # and does not reach df when pandas Copy-on-Write is active.
    df['bldng_us_clssfctn'] = df['bldng_us_clssfctn'].fillna('not-declared')
    encode_value(df, 'bldng_us_clssfctn')

    return df

In [15]:
# Preprocess rd_sd_nm.
def pre_rd_sd_nm(df):
    """Fill missing ``rd_sd_nm`` values and label-encode the column.

    Missing entries are replaced by '세로한면(가)' (presumably the most common
    category - confirm against the data).

    Args:
        df: DataFrame containing an ``rd_sd_nm`` column; modified in place.

    Returns:
        The same ``df`` object.
    """
    # Assign the filled column back rather than calling fillna(inplace=True)
    # on df[...]: the chained in-place form operates on a temporary object
    # and does not reach df when pandas Copy-on-Write is active.
    df['rd_sd_nm'] = df['rd_sd_nm'].fillna('세로한면(가)')
    encode_value(df, 'rd_sd_nm')
    return df

In [16]:
# Preprocess fr_wthr_fclt_dstnc.
def pre_fr_wthr_fclt_dstnc(df):
    """Coarsen ``fr_wthr_fclt_dstnc`` to tenths of its original value.

    The column is multiplied by 0.1 and rounded to the nearest whole number;
    the result is stored as floats.

    Args:
        df: DataFrame containing ``fr_wthr_fclt_dstnc``; modified in place.

    Returns:
        The same ``df`` object.
    """
    scaled = df['fr_wthr_fclt_dstnc'].mul(0.1)
    df['fr_wthr_fclt_dstnc'] = scaled.round()
    return df

In [17]:
# Preprocess dt_of_fr.
def pre_dt_of_fr(df):
    """Derive ``month`` and ``hour`` string columns from ``dt_of_fr``.

    ``dt_of_fr`` is split into date and time on the space, the date into
    year/month/day on '-', and the time into hour/minute on ':'. Only
    ``month`` and ``hour`` are kept; ``dt_of_fr`` and the other intermediate
    columns are dropped.

    Args:
        df: DataFrame containing ``dt_of_fr`` (e.g. "2017-10-20 05:54");
            modified in place.

    Returns:
        The same ``df`` object.
    """
    # Each zip(*...) transposes the per-row part lists into per-part columns.
    df['date'], df['time'] = zip(*df['dt_of_fr'].apply(split_dt1))
    df['year'], df['month'], df['day'] = zip(*df['date'].apply(split_ds))
    df['hour'], df['minute'] = zip(*df['time'].apply(split_dy))

    scratch_columns = ['dt_of_fr', 'time', 'date', 'minute', 'year', 'day']
    df.drop(columns=scratch_columns, inplace=True)

    return df

In [18]:
# Preprocess hmdt.
# Per-month replacement for missing humidity values, keyed by the two-digit
# month string (presumably monthly averages; their source is not shown here).
_HMDT_FILL_BY_MONTH = {
    '01': 54.28, '02': 51.23, '03': 58.02, '04': 61.95,
    '05': 61.74, '06': 71.57, '07': 80.00, '08': 77.88,
    '09': 77.61, '10': 72.29, '11': 68.90, '12': 57.36,
}


def pre_hmdt(df):
    """Fill missing ``hmdt`` per month and make ``hmdt`` / ``hour`` numeric.

    Expects the ``month`` and ``hour`` string columns produced by
    ``pre_dt_of_fr``. Missing ``hmdt`` values are replaced by the constant
    registered for the row's month, ``hmdt`` and ``hour`` are converted with
    ``pd.to_numeric``, and ``month`` is dropped.

    Rows whose ``month`` is not one of '01'..'12' end up with NaN in both
    ``hmdt`` and ``hour`` (they are left out of the conversion, so their
    values are never parsed).

    Args:
        df: DataFrame with ``month``, ``hmdt`` and ``hour``; modified in place.

    Returns:
        The same ``df`` object.
    """
    known_month = df['month'].isin(list(_HMDT_FILL_BY_MONTH))
    month_default = df.loc[known_month, 'month'].map(_HMDT_FILL_BY_MONTH)

    hmdt = pd.to_numeric(df.loc[known_month, 'hmdt']).fillna(month_default)
    hour = pd.to_numeric(df.loc[known_month, 'hour'])

    # Assignment aligns on the index, so rows outside known_month become NaN.
    df['hmdt'] = hmdt
    df['hour'] = hour
    df.drop("month", axis=1, inplace=True)

    return df

In [19]:
# Preprocess emd_nm.
def pre_emd_nm(df):
    """Reduce ``emd_nm`` to a three-character key and label-encode it.

    Missing values are replaced by "경상남도 창원시 의창구". Each value is split
    on spaces, its first three tokens are concatenated, and characters 7..9
    of the result are kept. For the fill value that is the third token,
    '의창구'; whether every address has the same token lengths is not
    checked here.

    Args:
        df: DataFrame containing ``emd_nm``; modified in place.

    Returns:
        The same ``df`` object.

    Raises:
        ValueError: if any value splits into fewer than three tokens.
    """
    # Assign via a filled copy instead of fillna(inplace=True) on df[...],
    # which does not reach df when pandas Copy-on-Write is active.
    filled = df['emd_nm'].fillna("경상남도 창원시 의창구")

    # Keep the token columns in locals so that no scratch columns are written
    # to (and possibly clobbered in) the caller's frame.
    first, second, third = zip(*filled.apply(split_dt2))
    joined = (
        pd.Series(first, index=df.index)
        + pd.Series(second, index=df.index)
        + pd.Series(third, index=df.index)
    )
    df['emd_nm'] = joined.str[7:10]
    encode_value(df, 'emd_nm')

    return df

In [20]:
# Preprocess bldng_ar_prc.
def pre_bldng_ar_prc(df):
    """Impute ``bldng_ar_prc`` and rescale it to integer thousands.

    Missing values are replaced by the mean of the row's ``emd_nm`` group;
    rows whose group has no known value fall back to the mean of the whole
    column. The result is multiplied by 0.001, rounded, and cast to int.

    Every ``emd_nm`` value present in the frame is handled, rather than only
    the codes 0..235.

    Args:
        df: DataFrame with ``emd_nm`` (already encoded) and ``bldng_ar_prc``;
            modified in place.

    Returns:
        The same ``df`` object.
    """
    price = df['bldng_ar_prc']
    district_mean = price.groupby(df['emd_nm']).transform('mean')
    overall_mean = price.mean()

    filled = price.fillna(district_mean).fillna(overall_mean)
    df['bldng_ar_prc'] = (filled * 0.001).round().astype(int)

    return df

In [21]:
# Preprocess dt_of_athrztn, step 1: handle nulls and outliers, compute building age.
def pre_dt_of_athrztn(df):
    """Convert ``dt_of_athrztn`` into a building age relative to 2019.

    The first four characters of each value are taken as the year. Missing
    values, and years that compare greater than '2019' as strings, are
    replaced by year '1', which yields the age 2018; ``pre2_dt_of_athrztn``
    treats that age as "unknown".

    Only the ``dt_of_athrztn`` column is written. The previous row-wide
    boolean assignment overwrote every column of the out-of-range rows.

    Args:
        df: DataFrame containing ``dt_of_athrztn``; modified in place.

    Returns:
        The same ``df`` object.
    """
    year = df['dt_of_athrztn'].fillna('1').astype('str').str[0:4]
    # Comparison is lexicographic, which matches numeric order for 4-digit years.
    year = year.mask(year > '2019', '1')
    df['dt_of_athrztn'] = 2019 - year.astype('int')

    return df

In [22]:
# Preprocess dt_of_athrztn, step 2: impute unknown building ages.
def pre2_dt_of_athrztn(df):
    """Replace the "unknown" building age (2018) with a group average.

    An age of exactly 2018 is the value ``pre_dt_of_athrztn`` produces for
    missing or out-of-range years. Such rows receive the mean age of the
    known rows in the same ``emd_nm`` group; if a group has no known rows,
    the mean over all known rows is used. The result is rounded and cast
    to int.

    Every ``emd_nm`` value present in the frame is handled, rather than only
    the codes 0..235. A genuine age of 2018 cannot be told apart from
    "unknown".

    Args:
        df: DataFrame with ``emd_nm`` (already encoded) and a numeric
            ``dt_of_athrztn``; modified in place.

    Returns:
        The same ``df`` object.
    """
    age = df['dt_of_athrztn']
    known_age = age.where(age != 2018)  # NaN marks rows to impute

    district_mean = known_age.groupby(df['emd_nm']).transform('mean')
    overall_mean = known_age.mean()

    filled = known_age.fillna(district_mean).fillna(overall_mean)
    df['dt_of_athrztn'] = filled.round().astype('int')

    return df

In [23]:
# Preprocess rgnl_ar_nm.
# Variant 1: values that match no group (including nulls) become '필드값없음'.
_RGNL1_GROUPS = (
    ('주거지역', ("제1종일반주거지역", "제2종일반주거지역", "제3종일반주거지역",
                  "제1종전용주거지역", "제2종전용주거지역", "준주거지역")),
    ('관리지역', ("계획관리지역", "관리지역", "보전관리지역", "생산관리지역")),
    ('녹지지역', ("보전녹지지역", "생산녹지지역", "자연녹지지역")),
    ('상업지역', ("근린상업지역", "유통상업지역", "일반상업지역", "중심상업지역")),
    ('공업지역', ("일반공업지역", "준공업지역")),
    ('개발제한구역', ("개발제한구역",)),
    ('농림지역', ("농림지역",)),
    ('용도미지정', ("용도미지정",)),
    ('자연환경보전지역', ("자연환경보전지역",)),
)


def get_group_rgnl1(data):
    """Map a detailed ``rgnl_ar_nm`` value to its broad category.

    Args:
        data: A single cell value (a string, or NaN for missing).

    Returns:
        The category name, or '필드값없음' when ``data`` matches no known value.
    """
    for category, members in _RGNL1_GROUPS:
        if data in members:
            return category
    return '필드값없음'

# Variant 2: values that match no group (including nulls) become '관리지역'.
_RGNL2_GROUPS = (
    ('주거지역', ("제1종일반주거지역", "제2종일반주거지역", "제3종일반주거지역",
                  "제1종전용주거지역", "제2종전용주거지역", "준주거지역")),
    ('관리지역', ("계획관리지역", "관리지역", "보전관리지역", "생산관리지역")),
    ('녹지지역', ("보전녹지지역", "생산녹지지역", "자연녹지지역")),
    ('상업지역', ("근린상업지역", "유통상업지역", "일반상업지역", "중심상업지역")),
    ('공업지역', ("일반공업지역", "준공업지역")),
    ('개발제한구역', ("개발제한구역",)),
    ('농림지역', ("농림지역",)),
    ('용도미지정', ("용도미지정",)),
    ('자연환경보전지역', ("자연환경보전지역",)),
)


def get_group_rgnl2(data):
    """Map a detailed ``rgnl_ar_nm`` value to its broad category.

    Args:
        data: A single cell value (a string, or NaN for missing).

    Returns:
        The category name, or '관리지역' when ``data`` matches no known value.
    """
    for category, members in _RGNL2_GROUPS:
        if data in members:
            return category
    return '관리지역'

def pre_rgnl_ar_nm(df):
    """Replace ``rgnl_ar_nm`` with two label-encoded category columns.

    ``rgnl_ar_nm_cat`` uses ``get_group_rgnl1`` and ``rgnl_ar_nm_cat2`` uses
    ``get_group_rgnl2``; the two differ only in how unmatched or missing
    values are labelled. The original column is dropped.

    Args:
        df: DataFrame containing ``rgnl_ar_nm``; modified in place.

    Returns:
        The same ``df`` object.
    """
    zoning = df['rgnl_ar_nm']
    df['rgnl_ar_nm_cat'] = zoning.apply(get_group_rgnl1)
    df['rgnl_ar_nm_cat2'] = zoning.apply(get_group_rgnl2)

    for grouped_col in ('rgnl_ar_nm_cat', 'rgnl_ar_nm_cat2'):
        encode_value(df, grouped_col)

    df.drop(columns='rgnl_ar_nm', inplace=True)
    return df

In [24]:
# Preprocess lnd_us_sttn_nm.
# Variant 1: values that match no group (including nulls) become '필드값없음'.
_LND1_GROUPS = (
    ('공공용지', ("고속도로휴게소", "공원등", "도로등", "여객자동차터미널", "운동장등",
                  "위험시설", "유해.혐오시설", "주차장등", "하천등")),
    ('특수토지', ("골프장 대중제", "골프장 회원제", "공원묘지", "스키장", "유원지",
                  "콘도미니엄", "특수기타")),
    ('공업용', ("공업기타", "공업나지", "공업용", "발전소")),
    ('전', ("과수원", "전", "전기타")),
    ('기타', ("기타",)),
    ('주거용', ("다세대", "단독", "아파트", "연립", "주거기타", "주거나지")),
    ('답', ("답", "답기타")),
    ('임야', ("목장용지", "임야기타", "자연림", "조림", "토지임야")),
    ('상업업무용', ("상업기타", "상업나지", "상업용", "업무용")),
    ('주상복합용', ("주상기타", "주상나지", "주상용")),
)


def get_group_lnd1(data):
    """Map a detailed ``lnd_us_sttn_nm`` value to its broad category.

    Args:
        data: A single cell value (a string, or NaN for missing).

    Returns:
        The category name, or '필드값없음' when ``data`` matches no known value.
    """
    for category, members in _LND1_GROUPS:
        if data in members:
            return category
    return '필드값없음'

# Variant 2: values that match no group (including nulls) become '주거용'.
_LND2_GROUPS = (
    ('공공용지', ("고속도로휴게소", "공원등", "도로등", "여객자동차터미널", "운동장등",
                  "위험시설", "유해.혐오시설", "주차장등", "하천등")),
    ('특수토지', ("골프장 대중제", "골프장 회원제", "공원묘지", "스키장", "유원지",
                  "콘도미니엄", "특수기타")),
    ('공업용', ("공업기타", "공업나지", "공업용", "발전소")),
    ('전', ("과수원", "전", "전기타")),
    ('기타', ("기타",)),
    ('주거용', ("다세대", "단독", "아파트", "연립", "주거기타", "주거나지")),
    ('답', ("답", "답기타")),
    ('임야', ("목장용지", "임야기타", "자연림", "조림", "토지임야")),
    ('상업업무용', ("상업기타", "상업나지", "상업용", "업무용")),
    ('주상복합용', ("주상기타", "주상나지", "주상용")),
)


def get_group_lnd2(data):
    """Map a detailed ``lnd_us_sttn_nm`` value to its broad category.

    Args:
        data: A single cell value (a string, or NaN for missing).

    Returns:
        The category name, or '주거용' when ``data`` matches no known value.
    """
    for category, members in _LND2_GROUPS:
        if data in members:
            return category
    return '주거용'

def pre_lnd_us_sttn(df):
    """Replace ``lnd_us_sttn_nm`` with two label-encoded category columns.

    ``lnd_us_sttn_cat`` uses ``get_group_lnd1`` and ``lnd_us_sttn_cat2`` uses
    ``get_group_lnd2``; the two differ only in how unmatched or missing
    values are labelled. The original column is dropped.

    Args:
        df: DataFrame containing ``lnd_us_sttn_nm``; modified in place.

    Returns:
        The same ``df`` object.
    """
    land_use = df['lnd_us_sttn_nm']
    df['lnd_us_sttn_cat'] = land_use.apply(get_group_lnd1)
    df['lnd_us_sttn_cat2'] = land_use.apply(get_group_lnd2)

    for grouped_col in ('lnd_us_sttn_cat', 'lnd_us_sttn_cat2'):
        encode_value(df, grouped_col)

    df.drop(columns='lnd_us_sttn_nm', inplace=True)
    return df

In [25]:
# Preprocess ttl_ar.
def pre_ttl_ar(df):
    """Replace ``ttl_ar`` with a coarse log-scale bin, ``log_ttl_ar``.

    Each value x becomes ``int(log(x + 1))`` (truncated toward zero). Bins
    6 to 8 are then merged into 6 and everything above 8 becomes 7.

    Args:
        df: DataFrame containing ``ttl_ar``; modified in place.

    Returns:
        The same ``df`` object.
    """
    log_trunc = df['ttl_ar'].apply(lambda area: int(np.log(area + 1)))

    # Both masks are computed on the unmerged bins, so they cannot cascade.
    merged = log_trunc.mask(log_trunc.between(6, 8), 6).mask(log_trunc > 8, 7)

    df['log_ttl_ar'] = merged
    df.drop(columns='ttl_ar', inplace=True)
    return df

In [26]:
# Preprocess bldng_ar.
def pre_bldng_ar(df):
    """Replace ``bldng_ar`` with a coarse log-scale bin, ``log_bldng_ar``.

    Each value x becomes ``int(log(x + 1))`` (truncated toward zero). Bins
    5 and 6 are then merged into 5, bin 7 becomes 6, and everything from 8
    upwards becomes 7.

    Args:
        df: DataFrame containing ``bldng_ar``; modified in place.

    Returns:
        The same ``df`` object.
    """
    log_trunc = df['bldng_ar'].apply(lambda area: int(np.log(area + 1)))

    # All masks are computed on the unmerged bins, so they cannot cascade.
    merged = (
        log_trunc
        .mask(log_trunc.isin([5, 6]), 5)
        .mask(log_trunc == 7, 6)
        .mask(log_trunc >= 8, 7)
    )

    df['log_bldng_ar'] = merged
    df.drop(columns='bldng_ar', inplace=True)
    return df

In [27]:
# Preprocess ttl_grnd_flr.
def pre_ttl_grnd_flr(df):
    """Bin ``ttl_grnd_flr`` and fill missing values.

    Values in [6, 30) become 6, values of 30 or more become 30, and missing
    values become 1. Smaller values are left unchanged.

    Args:
        df: DataFrame containing ``ttl_grnd_flr``; modified in place.

    Returns:
        The same ``df`` object.
    """
    df.loc[((df['ttl_grnd_flr']>=6) & (df['ttl_grnd_flr']<30)), 'ttl_grnd_flr'] = 6
    df.loc[df['ttl_grnd_flr']>=30, 'ttl_grnd_flr'] = 30
    # Assign the filled column back: fillna(inplace=True) on df[...] is a
    # chained call that does not reach df under pandas Copy-on-Write.
    df['ttl_grnd_flr'] = df['ttl_grnd_flr'].fillna(1)

    return df

In [28]:
# Preprocess ttl_dwn_flr.
def pre_ttl_dwn_flr(df):
    """Cap ``ttl_dwn_flr`` at 10 and fill missing values with 0.

    Args:
        df: DataFrame containing ``ttl_dwn_flr``; modified in place.

    Returns:
        The same ``df`` object.
    """
    df.loc[df['ttl_dwn_flr']>=10,'ttl_dwn_flr'] = 10
    # Assign the filled column back: fillna(inplace=True) on df[...] is a
    # chained call that does not reach df under pandas Copy-on-Write.
    df['ttl_dwn_flr'] = df['ttl_dwn_flr'].fillna(0)

    return df

In [29]:
# Preprocess sft_emrgnc_bll_dstnc.
def get_group_sft_bll(data):
    """Bucket a number into 50-wide bins labelled by their lower bound.

    Args:
        data: Number to bucket.

    Returns:
        0 for values below 50, then 50, 100, 150, 200 or 250 for each further
        50-wide interval, and 300 for everything else (values of 300 or more,
        and NaN, for which every comparison is false).
    """
    for upper_bound in (50, 100, 150, 200, 250, 300):
        if data < upper_bound:
            return upper_bound - 50
    return 300

def pre_sft_emrgnc_bll_dstnc(df):
    """Replace ``sft_emrgnc_bll_dstnc`` with a bucketed square root.

    Each value is square-rooted and passed through ``get_group_sft_bll``; the
    result is stored as ``sqrt_sft_emrgnc_bll_dstnc`` and the original column
    is dropped.

    Args:
        df: DataFrame containing ``sft_emrgnc_bll_dstnc``; modified in place.

    Returns:
        The same ``df`` object.
    """
    distance = df['sft_emrgnc_bll_dstnc']
    df['sqrt_sft_emrgnc_bll_dstnc'] = distance.apply(
        lambda d: get_group_sft_bll(np.sqrt(d))
    )
    df.drop(columns='sft_emrgnc_bll_dstnc', inplace=True)
    return df

In [30]:
# Preprocess no_tbc_zn_dstnc.
def get_group_no_tbc(data):
    """Bucket a number into the bins [<1, <3, <5, <7, <9, rest].

    Args:
        data: Number to bucket.

    Returns:
        0, 1, 3, 5 or 7 (the lower bound of the matching bin), or 9 for
        everything else (values of 9 or more, and NaN, for which every
        comparison is false).
    """
    for upper_bound, label in ((1, 0), (3, 1), (5, 3), (7, 5), (9, 7)):
        if data < upper_bound:
            return label
    return 9

def pre_no_tbc_zn_dstnc(df):
    """Replace ``no_tbc_zn_dstnc`` with a bucketed ``log(x + 1)``.

    Each value x is transformed to ``log(x + 1)`` and passed through
    ``get_group_no_tbc``; the result is stored as ``log_no_tbc_zn_dstnc``
    and the original column is dropped.

    Args:
        df: DataFrame containing ``no_tbc_zn_dstnc``; modified in place.

    Returns:
        The same ``df`` object.
    """
    distance = df['no_tbc_zn_dstnc']
    df['log_no_tbc_zn_dstnc'] = distance.apply(
        lambda d: get_group_no_tbc(np.log(d + 1))
    )
    df.drop(columns='no_tbc_zn_dstnc', inplace=True)
    return df

#### 2.2.2. 전처리 통합 함수 정의

In [31]:
def preprocessing(df):
    """Run every per-feature preprocessing step on ``df``, in order.

    The order matters: ``pre_hmdt`` needs the ``month`` / ``hour`` columns
    created by ``pre_dt_of_fr``; ``pre_bldng_ar_prc`` and
    ``pre2_dt_of_athrztn`` need the encoded ``emd_nm`` from ``pre_emd_nm``;
    ``pre2_dt_of_athrztn`` needs the ages computed by ``pre_dt_of_athrztn``.

    NOTE: this function's name rebinds the ``preprocessing`` module imported
    from sklearn at the top of the notebook.

    Args:
        df: Feature frame after ``remove_feat``; modified in place.

    Returns:
        The same ``df`` object.
    """
    steps = (
        pre_bldng_archtctr,
        pre_bldng_us_clssfctn,
        pre_rd_sd_nm,
        pre_fr_wthr_fclt_dstnc,
        pre_dt_of_fr,
        pre_hmdt,
        pre_emd_nm,
        pre_bldng_ar_prc,
        pre_dt_of_athrztn,
        pre2_dt_of_athrztn,
        pre_rgnl_ar_nm,
        pre_lnd_us_sttn,
        pre_ttl_ar,
        pre_bldng_ar,
        pre_ttl_grnd_flr,
        pre_ttl_dwn_flr,
        pre_sft_emrgnc_bll_dstnc,
        pre_no_tbc_zn_dstnc,
    )
    for step in steps:
        step(df)

    return df

### 2.3. 함수 적용

In [32]:
# Run the preprocessing pipeline on both frames. Each call modifies its frame
# in place and returns it; the return value of the last call (df_test) is what
# the cell displays.
preprocessing(df_train)
preprocessing(df_test)

Unnamed: 0,bldng_archtctr,dt_of_athrztn,ttl_grnd_flr,ttl_dwn_flr,bldng_us_clssfctn,hmdt,rd_sd_nm,emd_nm,bldng_ar_prc,fr_wthr_fclt_dstnc,...,fr_yn,hour,rgnl_ar_nm_cat,rgnl_ar_nm_cat2,lnd_us_sttn_cat,lnd_us_sttn_cat2,log_ttl_ar,log_bldng_ar,sqrt_sft_emrgnc_bll_dstnc,log_no_tbc_zn_dstnc
0,0,21,1.0,0.0,0,20.0,6,1,1078,11.0,...,,15,7,2,9,6,0,0,100,7
1,12,18,1.0,0.0,2,62.0,9,8,851,29.0,...,,18,2,2,1,1,6,5,50,7
2,0,27,6.0,0.0,0,97.0,6,14,953,5.0,...,,6,1,1,1,1,7,7,0,5
3,0,27,3.0,0.0,0,63.0,6,14,953,25.0,...,,23,1,1,1,1,6,5,50,7
4,0,28,3.0,0.0,0,28.0,6,16,344,9.0,...,,11,2,2,5,5,6,5,50,7
5,16,19,2.0,0.0,7,66.0,9,4,1275,25.0,...,,17,6,6,6,6,5,4,0,5
6,16,27,4.0,0.0,7,93.0,4,0,723,0.0,...,,4,6,6,6,6,6,4,0,3
7,11,89,1.0,0.0,7,35.0,6,13,816,21.0,...,,13,6,6,6,6,4,4,0,5
8,8,18,1.0,0.0,7,72.0,7,1,709,12.0,...,,18,6,6,6,6,4,4,50,7
9,16,24,2.0,0.0,6,51.0,6,15,891,13.0,...,,15,6,6,6,6,5,5,0,5


In [33]:
df_train.head()

Unnamed: 0,bldng_archtctr,dt_of_athrztn,ttl_grnd_flr,ttl_dwn_flr,bldng_us_clssfctn,hmdt,rd_sd_nm,emd_nm,bldng_ar_prc,fr_wthr_fclt_dstnc,...,fr_yn,hour,rgnl_ar_nm_cat,rgnl_ar_nm_cat2,lnd_us_sttn_cat,lnd_us_sttn_cat2,log_ttl_ar,log_bldng_ar,sqrt_sft_emrgnc_bll_dstnc,log_no_tbc_zn_dstnc
0,9,42,1.0,0.0,7,96.0,7,201,580,13.0,...,1,5,3,3,7,7,4,4,150,3
1,0,18,1.0,0.0,0,74.0,6,1,123,49.0,...,0,8,2,2,7,7,3,3,100,7
2,16,19,5.0,0.0,7,21.0,11,201,618,14.0,...,1,14,8,8,7,7,6,5,150,3
3,11,83,1.0,0.0,7,91.0,7,174,720,158.0,...,0,5,8,8,7,7,3,3,50,5
4,0,38,1.0,0.0,0,89.0,7,201,580,60.0,...,0,5,7,7,5,5,0,0,150,5


In [34]:
df_test.head()

Unnamed: 0,bldng_archtctr,dt_of_athrztn,ttl_grnd_flr,ttl_dwn_flr,bldng_us_clssfctn,hmdt,rd_sd_nm,emd_nm,bldng_ar_prc,fr_wthr_fclt_dstnc,...,fr_yn,hour,rgnl_ar_nm_cat,rgnl_ar_nm_cat2,lnd_us_sttn_cat,lnd_us_sttn_cat2,log_ttl_ar,log_bldng_ar,sqrt_sft_emrgnc_bll_dstnc,log_no_tbc_zn_dstnc
0,0,21,1.0,0.0,0,20.0,6,1,1078,11.0,...,,15,7,2,9,6,0,0,100,7
1,12,18,1.0,0.0,2,62.0,9,8,851,29.0,...,,18,2,2,1,1,6,5,50,7
2,0,27,6.0,0.0,0,97.0,6,14,953,5.0,...,,6,1,1,1,1,7,7,0,5
3,0,27,3.0,0.0,0,63.0,6,14,953,25.0,...,,23,1,1,1,1,6,5,50,7
4,0,28,3.0,0.0,0,28.0,6,16,344,9.0,...,,11,2,2,5,5,6,5,50,7


In [35]:
df_train.isna().sum()

bldng_archtctr               0
dt_of_athrztn                0
ttl_grnd_flr                 0
ttl_dwn_flr                  0
bldng_us_clssfctn            0
hmdt                         0
rd_sd_nm                     0
emd_nm                       0
bldng_ar_prc                 0
fr_wthr_fclt_dstnc           0
mlt_us_yn                    0
cctv_in_100m                 0
fr_yn                        0
hour                         0
rgnl_ar_nm_cat               0
rgnl_ar_nm_cat2              0
lnd_us_sttn_cat              0
lnd_us_sttn_cat2             0
log_ttl_ar                   0
log_bldng_ar                 0
sqrt_sft_emrgnc_bll_dstnc    0
log_no_tbc_zn_dstnc          0
dtype: int64

In [36]:
df_test.isna().sum()

bldng_archtctr                  0
dt_of_athrztn                   0
ttl_grnd_flr                    0
ttl_dwn_flr                     0
bldng_us_clssfctn               0
hmdt                            0
rd_sd_nm                        0
emd_nm                          0
bldng_ar_prc                    0
fr_wthr_fclt_dstnc              0
mlt_us_yn                       0
cctv_in_100m                    0
fr_yn                        2957
hour                            0
rgnl_ar_nm_cat                  0
rgnl_ar_nm_cat2                 0
lnd_us_sttn_cat                 0
lnd_us_sttn_cat2                0
log_ttl_ar                      0
log_bldng_ar                    0
sqrt_sft_emrgnc_bll_dstnc       0
log_no_tbc_zn_dstnc             0
dtype: int64

In [37]:
df_train.info(True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59199 entries, 0 to 59198
Data columns (total 22 columns):
bldng_archtctr               59199 non-null int32
dt_of_athrztn                59199 non-null int32
ttl_grnd_flr                 59199 non-null float64
ttl_dwn_flr                  59199 non-null float64
bldng_us_clssfctn            59199 non-null int32
hmdt                         59199 non-null float64
rd_sd_nm                     59199 non-null int32
emd_nm                       59199 non-null int32
bldng_ar_prc                 59199 non-null int32
fr_wthr_fclt_dstnc           59199 non-null float64
mlt_us_yn                    59199 non-null int64
cctv_in_100m                 59199 non-null int64
fr_yn                        59199 non-null int64
hour                         59199 non-null int64
rgnl_ar_nm_cat               59199 non-null int32
rgnl_ar_nm_cat2              59199 non-null int32
lnd_us_sttn_cat              59199 non-null int32
lnd_us_sttn_cat2             5919

In [38]:
df_test.isna().sum()

bldng_archtctr                  0
dt_of_athrztn                   0
ttl_grnd_flr                    0
ttl_dwn_flr                     0
bldng_us_clssfctn               0
hmdt                            0
rd_sd_nm                        0
emd_nm                          0
bldng_ar_prc                    0
fr_wthr_fclt_dstnc              0
mlt_us_yn                       0
cctv_in_100m                    0
fr_yn                        2957
hour                            0
rgnl_ar_nm_cat                  0
rgnl_ar_nm_cat2                 0
lnd_us_sttn_cat                 0
lnd_us_sttn_cat2                0
log_ttl_ar                      0
log_bldng_ar                    0
sqrt_sft_emrgnc_bll_dstnc       0
log_no_tbc_zn_dstnc             0
dtype: int64

## 3. 모델 학습

In [39]:
from sklearn.metrics import accuracy_score, precision_score , recall_score , confusion_matrix, f1_score

# Compute and print the evaluation metrics.
def get_clf_eval(y_test , pred):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    Args:
        y_test: True labels.
        pred: Predicted labels.

    Returns:
        None. The results are written to stdout.
    """
    confusion = confusion_matrix(y_test, pred)
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    scores = [scorer(y_test, pred) for scorer in scorers]

    print('오차 행렬')
    print(confusion)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f} F1:{3:.4f}'.format(*scores))

In [40]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Separate the label from the features. df_train is rebound to the
# feature-only frame, so this cell cannot be re-run without re-running the
# preprocessing cells first.
target = df_train['fr_yn']
df_train = df_train.drop(['fr_yn'], axis=1, inplace=False)

smote = SMOTE(random_state=0)
# Hold out 16% of the rows; only the remaining training part is oversampled.
X_train, X_test, y_train, y_test = train_test_split(df_train, target, test_size=0.16, random_state=156)
# fit_resample is the supported SMOTE entry point; fit_sample was a
# deprecated alias that newer imbalanced-learn releases no longer provide.
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)
lgbm_wrapper = LGBMClassifier(n_estimators=500)
# NOTE(review): the same hold-out set drives early stopping and the final
# evaluation below, so the reported metrics are likely optimistic.
evals = [(X_test, y_test)]
# NOTE(review): newer LightGBM releases take early stopping and logging as
# callbacks instead of these keyword arguments - confirm against the
# installed version.
lgbm_wrapper.fit(X_train_over, y_train_over, early_stopping_rounds=100, eval_metric='logloss', eval_set=evals, verbose=True)
preds = lgbm_wrapper.predict(X_test)
get_clf_eval(y_test, preds)

[1]	valid_0's binary_logloss: 0.636475
Training until validation scores don't improve for 100 rounds
[2]	valid_0's binary_logloss: 0.589589
[3]	valid_0's binary_logloss: 0.54976
[4]	valid_0's binary_logloss: 0.516224
[5]	valid_0's binary_logloss: 0.487948
[6]	valid_0's binary_logloss: 0.462606
[7]	valid_0's binary_logloss: 0.441176
[8]	valid_0's binary_logloss: 0.422163
[9]	valid_0's binary_logloss: 0.404677
[10]	valid_0's binary_logloss: 0.389113
[11]	valid_0's binary_logloss: 0.375673
[12]	valid_0's binary_logloss: 0.363443
[13]	valid_0's binary_logloss: 0.352765
[14]	valid_0's binary_logloss: 0.343021
[15]	valid_0's binary_logloss: 0.334884
[16]	valid_0's binary_logloss: 0.326816
[17]	valid_0's binary_logloss: 0.3196
[18]	valid_0's binary_logloss: 0.312789
[19]	valid_0's binary_logloss: 0.305475
[20]	valid_0's binary_logloss: 0.29988
[21]	valid_0's binary_logloss: 0.294256
[22]	valid_0's binary_logloss: 0.290269
[23]	valid_0's binary_logloss: 0.285424
[24]	valid_0's binary_logloss: 

[204]	valid_0's binary_logloss: 0.232806
[205]	valid_0's binary_logloss: 0.232874
[206]	valid_0's binary_logloss: 0.232879
[207]	valid_0's binary_logloss: 0.232856
[208]	valid_0's binary_logloss: 0.232861
[209]	valid_0's binary_logloss: 0.232797
[210]	valid_0's binary_logloss: 0.232864
[211]	valid_0's binary_logloss: 0.232871
[212]	valid_0's binary_logloss: 0.232878
[213]	valid_0's binary_logloss: 0.23281
[214]	valid_0's binary_logloss: 0.2328
[215]	valid_0's binary_logloss: 0.232819
[216]	valid_0's binary_logloss: 0.232837
[217]	valid_0's binary_logloss: 0.232851
[218]	valid_0's binary_logloss: 0.232824
[219]	valid_0's binary_logloss: 0.232898
[220]	valid_0's binary_logloss: 0.232909
[221]	valid_0's binary_logloss: 0.232861
[222]	valid_0's binary_logloss: 0.232942
[223]	valid_0's binary_logloss: 0.232952
[224]	valid_0's binary_logloss: 0.232927
[225]	valid_0's binary_logloss: 0.232917
[226]	valid_0's binary_logloss: 0.233049
[227]	valid_0's binary_logloss: 0.233035
[228]	valid_0's bin

## 4. 예측

In [43]:
# Remove the label column from the test frame before predicting, so it has
# the same feature columns the model was trained on (the training cell
# dropped `fr_yn` as well).
df_test = df_test.drop(columns=['fr_yn'])

preds = lgbm_wrapper.predict(df_test)
preds

array([0, 1, 0, ..., 1, 0, 1], dtype=int64)

## 5. 제출

In [47]:
# Turn the 0/1 predictions back into the 'N'/'Y' labels used by the dataset.
binary_str = {0:'N', 1:'Y'}

df_pred = pd.DataFrame({'fr_yn': preds})
df_pred['fr_yn'] = df_pred['fr_yn'].map(binary_str)

df_pred

Unnamed: 0,fr_yn
0,N
1,Y
2,N
3,N
4,N
5,Y
6,N
7,Y
8,Y
9,Y


In [48]:
# Write the predictions to the submission file; mode='w' overwrites an
# existing file. No `index` argument is passed, so the row index is written
# as the first, unnamed column.
df_pred.to_csv('./data/PJT002_submission.csv', mode='w')