In [1]:
import numpy as np
import pandas as pd
import keras
import warnings
warnings.filterwarnings(action='ignore')
import sys

from keras.models import Sequential
from keras.layers import Dense,Dropout
from sklearn.preprocessing import LabelEncoder

### Data 불러오기

In [2]:
# Load the preprocessed dataset from the working directory and display it
# (pandas truncates the rendering of large frames).
data = pd.read_csv('data_preprocessing.csv')
data

Unnamed: 0,year,month,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,SEX,region_diff,AGE,CSTMR_CNT,CNT,AMT
0,2019,1,강원,강릉시,건강보조식품 소매업,1,1,20,4,4,311200
1,2019,1,강원,강릉시,건강보조식품 소매업,1,1,30,7,8,1374500
2,2019,1,강원,강릉시,건강보조식품 소매업,1,1,40,7,8,2764300
3,2019,1,강원,강릉시,건강보조식품 소매업,1,1,50,21,21,3690830
4,2019,1,강원,강릉시,건강보조식품 소매업,1,1,60,19,20,1433500
...,...,...,...,...,...,...,...,...,...,...,...
2691680,2020,3,충북,충주시,휴양콘도 운영업,1,1,30,3,4,43300
2691681,2020,3,충북,충주시,휴양콘도 운영업,1,1,40,3,3,35000
2691682,2020,3,충북,충주시,휴양콘도 운영업,1,1,50,4,6,188000
2691683,2020,3,충북,충주시,휴양콘도 운영업,1,1,60,3,3,194000


### Train 구분

In [3]:
# Training window: rows from year 2020 whose month is 1, 2 or 3.
Train_data = data.loc[(data['year'] == 2020) & data['month'].isin([1, 2, 3])]

# Feature frame: same rows without the target (AMT) and the unused columns.
Train_data_X = Train_data.drop(columns=['AMT', 'year', 'CNT', 'CSTMR_CNT'])
# Training frame: keeps AMT as the target, drops the unused columns.
Train_data = Train_data.drop(columns=['CNT', 'year', 'CSTMR_CNT'])

print('Train_data :', Train_data.shape)
print('Train_data_X :', Train_data_X.shape)

Train_data : (521750, 8)
Train_data_X : (521750, 7)


### SET Pred_Dataframe

In [4]:
def Pred_data_def(raw_data, months=(4, 7)):
    """Build the frame of feature combinations to predict on.

    For every ``CARD_SIDO_NM`` value, only the ``CARD_CCG_NM`` values that
    occur together with it in ``raw_data`` are used. Each such pair is then
    crossed with every unique ``STD_CLSS_NM``, ``AGE``, ``SEX`` and
    ``region_diff`` value found in ``raw_data`` and with every entry of
    ``months``.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Must contain the columns ``CARD_SIDO_NM``, ``CARD_CCG_NM``,
        ``STD_CLSS_NM``, ``SEX``, ``region_diff`` and ``AGE``.
    months : iterable, optional
        Month values to generate rows for. Defaults to ``(4, 7)``, the
        values that used to be hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        Columns ``month, CARD_SIDO_NM, CARD_CCG_NM, STD_CLSS_NM, SEX,
        region_diff, AGE`` (in that order), one row per combination.
        The rows go through ``np.array``, so when numbers and strings are
        mixed every value is coerced to a string; callers must cast the
        numeric columns back themselves. An empty 7-column frame is
        returned when there are no combinations.
    """
    columns = ['month', 'CARD_SIDO_NM', 'CARD_CCG_NM',
               'STD_CLSS_NM', 'SEX', 'region_diff', 'AGE']

    # Materialise once: the values are iterated many times below and
    # ``months`` may be a one-shot iterator.
    months = list(months)
    sex_values = raw_data['SEX'].unique()
    region_values = raw_data['region_diff'].unique()
    age_values = raw_data['AGE'].unique()
    sido_values = raw_data['CARD_SIDO_NM'].unique()
    std_values = raw_data['STD_CLSS_NM'].unique()

    # Loop nesting (outer -> inner) matches the original implementation so the
    # row order is unchanged: sido, ccg, std, age, sex, region, month.
    rows = [
        [month, sido, ccg, std, sex, region, age]
        for sido in sido_values
        for ccg in raw_data.loc[raw_data['CARD_SIDO_NM'] == sido, 'CARD_CCG_NM'].unique()
        for std in std_values
        for age in age_values
        for sex in sex_values
        for region in region_values
        for month in months
    ]

    if not rows:
        # np.array([]) is 1-D and cannot be laid out over 7 columns.
        return pd.DataFrame(columns=columns)

    Pred_data = pd.DataFrame(data=np.array(rows), columns=columns)
    return Pred_data

### Pred_Data

In [5]:
# Build the prediction frame from the unique feature values found in
# Train_data_X and print its shape as a sanity check.
Pred_data = Pred_data_def(Train_data_X)
print('Pred_data :',Pred_data.shape)

Pred_data : (861000, 7)


### Type Change

In [6]:
# Cast the numeric feature columns of the prediction frame to int64 in one
# pass; the remaining columns are left as they are.
Pred_data = Pred_data.astype({
    'month': 'int64',
    'SEX': 'int64',
    'region_diff': 'int64',
    'AGE': 'int64',
})

### Train_data Encoding

In [7]:
# Fit one LabelEncoder per object-dtype column of Train_data, keyed by
# column name, then build df_num: a copy of Train_data in which those
# columns are replaced by their integer codes.
dtypes = Train_data.dtypes
encoders = {
    column: LabelEncoder().fit(Train_data[column])
    for column in Train_data.columns
    if str(dtypes[column]) == 'object'
}

df_num = Train_data.copy()
for column, encoder in encoders.items():
    df_num[column] = encoder.transform(Train_data[column])
df_num

Unnamed: 0,month,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,SEX,region_diff,AGE,AMT
2169935,1,0,3,0,1,1,30,1903450
2169936,1,0,3,0,1,1,40,1845900
2169937,1,0,3,0,1,1,50,6132260
2169938,1,0,3,0,1,1,60,3162700
2169939,1,0,3,0,1,1,70,554000
...,...,...,...,...,...,...,...,...
2691680,3,16,202,40,1,1,30,43300
2691681,3,16,202,40,1,1,40,35000
2691682,3,16,202,40,1,1,50,188000
2691683,3,16,202,40,1,1,60,194000


### Pred_data Encoding

In [8]:
# Encode the prediction frame with the SAME encoders that were fitted on
# Train_data (the `encoders` dict built in the train-encoding cell).
# Fitting fresh LabelEncoders on Pred_data only yields matching integer codes
# when both frames happen to contain exactly the same category sets; reusing
# the fitted encoders makes the codes identical by construction, and
# `transform` raises on any label the model never saw instead of silently
# assigning it a shifted code.
ptypes = Pred_data.dtypes
pred_object_columns = [column for column in Pred_data.columns
                       if str(ptypes[column]) == 'object']

# Every object-dtype column of Pred_data needs a train-fitted encoder.
unencoded_columns = [column for column in pred_object_columns
                     if column not in encoders]
if unencoded_columns:
    raise ValueError('No train-fitted encoder for columns: %s' % unencoded_columns)

p_num = Pred_data.copy()
for column in pred_object_columns:
    p_num[column] = encoders[column].transform(Pred_data[column])
p_num

Unnamed: 0,month,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,SEX,region_diff,AGE
0,4,0,3,0,1,1,30
1,7,0,3,0,1,1,30
2,4,0,3,0,1,2,30
3,7,0,3,0,1,2,30
4,4,0,3,0,1,3,30
...,...,...,...,...,...,...,...
860995,7,16,202,30,2,1,10
860996,4,16,202,30,2,2,10
860997,7,16,202,30,2,2,10
860998,4,16,202,30,2,3,10


### Model

In [None]:
# Training target (AMT) as a float32 column vector; features are every other
# column of the encoded training frame. X_test is the encoded prediction frame.
y_train = df_num['AMT'].to_numpy(dtype=np.float32).reshape(-1, 1)
X_train = df_num.drop(['AMT'], axis=1).to_numpy(dtype=np.float32)
X_test = p_num.to_numpy(dtype=np.float32)

num_hidden_units = 150
input_dim = X_train.shape[1]

# Architecture (unchanged): six hidden Dense(150, relu) layers. The first uses
# the default kernel initializer, the rest use 'normal'. Dropout(0.2) follows
# each of the first five hidden layers; the sixth feeds the single linear
# output unit directly.
model = Sequential()
model.add(Dense(num_hidden_units, input_dim=input_dim, activation='relu'))
model.add(Dropout(0.2))
for _ in range(4):
    model.add(Dense(num_hidden_units, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.2))
model.add(Dense(num_hidden_units, kernel_initializer='normal', activation='relu'))
model.add(Dense(1))

# 'accuracy' counts exact matches and is meaningless for a continuous target,
# so mean absolute error is tracked instead, next to the percentage-error loss.
model.compile(loss='MeanAbsolutePercentageError', optimizer='adam', metrics=['mae'])

# Keep the History object so the loss/metric curves can be inspected later.
history = model.fit(X_train, y_train, epochs=555, batch_size=100, verbose=1)

Epoch 1/555
Epoch 2/555
Epoch 3/555
Epoch 4/555
Epoch 5/555
Epoch 6/555
Epoch 7/555
Epoch 8/555
Epoch 9/555
Epoch 10/555
Epoch 11/555
Epoch 12/555
Epoch 13/555
Epoch 14/555
Epoch 15/555
Epoch 16/555
Epoch 17/555
Epoch 18/555
Epoch 19/555
Epoch 20/555
Epoch 21/555
Epoch 22/555
Epoch 23/555
Epoch 24/555
Epoch 25/555
Epoch 26/555
Epoch 27/555
Epoch 28/555
Epoch 29/555
Epoch 30/555
Epoch 31/555
Epoch 32/555
Epoch 33/555
Epoch 34/555
Epoch 35/555
Epoch 36/555
Epoch 37/555
Epoch 38/555
Epoch 39/555
Epoch 40/555
Epoch 41/555
Epoch 42/555
Epoch 43/555
Epoch 44/555
Epoch 45/555
Epoch 46/555
Epoch 47/555
Epoch 48/555
Epoch 49/555
Epoch 50/555
Epoch 51/555
Epoch 52/555
Epoch 53/555
Epoch 54/555
Epoch 55/555
Epoch 56/555
Epoch 57/555
Epoch 58/555
Epoch 59/555
Epoch 60/555
Epoch 61/555
Epoch 62/555
Epoch 63/555
Epoch 64/555
Epoch 65/555
Epoch 66/555
Epoch 67/555
Epoch 68/555
Epoch 69/555
Epoch 70/555
Epoch 71/555
Epoch 72/555
Epoch 73/555
Epoch 74/555
Epoch 75/555
Epoch 76/555
Epoch 77/555
Epoch 78

Epoch 150/555
Epoch 151/555
Epoch 152/555
Epoch 153/555
Epoch 154/555
Epoch 155/555
Epoch 156/555
Epoch 157/555
Epoch 158/555
Epoch 159/555
Epoch 160/555
Epoch 161/555
Epoch 162/555
Epoch 163/555
Epoch 164/555
Epoch 165/555
Epoch 166/555
Epoch 167/555
Epoch 168/555

### Predict

In [None]:
# Score every row of the prediction frame. The model ends in Dense(1), so the
# result has shape (n, 1); flatten it before storing it as a column.
pred = model.predict(X_test)
Pred_data['AMT'] = pred.ravel()

# Year-month key, e.g. month 4 -> 202004. Computed numerically so that
# two-digit months would also work (string concatenation with '20200'
# would produce 2020010 for October).
Pred_data['REG_YYMM'] = 202000 + Pred_data['month'].astype('int64')

# Aggregate ONLY the predicted amount. Summing every column depends on the
# pandas version: non-numeric CARD_CCG_NM is either silently dropped,
# string-concatenated, or raises, and the helper columns had to be dropped
# afterwards anyway.
# NOTE: this cell rebinds Pred_data to the aggregated frame, so it cannot be
# re-run without first re-running the cells that build Pred_data.
Pred_data = (
    Pred_data
    .groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'])['AMT']
    .sum()
    .reset_index(drop=False)
)
Pred_data

### Save to csv

In [None]:
# Replace the AMT column of the existing submission file with the aggregated
# predictions, matched on (REG_YYMM, CARD_SIDO_NM, STD_CLSS_NM). The left join
# keeps every submission row; rows without a matching prediction get NaN.
merge_keys = ['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']
submission = (
    pd.read_csv('submission_mingsu.csv', index_col=0)
    .drop(columns=['AMT'])
    .merge(Pred_data, left_on=merge_keys, right_on=merge_keys, how='left')
)
submission.index.name = 'id'

# Written back to the same path that was read above.
submission.to_csv('submission_mingsu.csv', encoding='utf-8-sig')
submission.head(10)