In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

!pip install bayesian-optimization

from sklearn.model_selection import KFold,StratifiedKFold   # K-fold CV 
from bayes_opt import BayesianOptimization  # 베이지안 최적화 라이브러리
from functools import partial               # 함수 변수 고정
import lightgbm as lgb                      # LightGBM 라이브러리
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error

import warnings
warnings.filterwarnings(action = 'ignore')

  import pandas.util.testing as tm


Collecting bayesian-optimization
  Downloading https://files.pythonhosted.org/packages/bb/7a/fd8059a3881d3ab37ac8f72f56b73937a14e8bb14a9733e68cc8b17dbe3c/bayesian-optimization-1.2.0.tar.gz
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-cp36-none-any.whl size=11685 sha256=f9b45a5710470d61d477dd518a807aca1b6e907e720b4e1c7bca7588280efd98
  Stored in directory: /root/.cache/pip/wheels/5a/56/ae/e0e3c1fc1954dc3ec712e2df547235ed072b448094d8f94aec
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0


In [2]:
data=pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/데이콘/제주/201901-202003.csv')
sub = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/데이콘/제주/submission.csv')

# Base Line 1
- Dacon Base Line + LightGBM 활용

In [None]:
def grap_year(data):
    data = str(data)
    return int(data[:4])

def grap_month(data):
    data = str(data)
    return int(data[4:])

In [None]:
# 날짜 처리
data = data.fillna('')
data['year'] = data['REG_YYMM'].apply(lambda x: grap_year(x))
data['month'] = data['REG_YYMM'].apply(lambda x: grap_month(x))
data = data.drop(['REG_YYMM'], axis=1)

In [None]:
# 데이터 정제
df = data.copy()
df = df.drop(['CARD_CCG_NM', 'HOM_CCG_NM'], axis=1)

columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month']
df = df.groupby(columns).sum().reset_index(drop=False)

In [None]:
df

In [None]:
# 인코딩
dtypes = df.dtypes
encoders = {}
for column in df.columns:
    if str(dtypes[column]) == 'object':
        encoder = LabelEncoder()
        encoder.fit(df[column])
        encoders[column] = encoder
        
df_num = df.copy()        
for column in encoders.keys():
    encoder = encoders[column]
    df_num[column] = encoder.transform(df[column])

In [None]:
df_num

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT
0,0,0,0,1,1,1,2019,1,4,311200,4
1,0,0,0,1,1,1,2019,2,3,605000,3
2,0,0,0,1,1,1,2019,6,3,139000,3
3,0,0,0,1,1,1,2019,8,3,27500,3
4,0,0,0,1,1,1,2019,9,3,395500,3
...,...,...,...,...,...,...,...,...,...,...,...
1057389,16,40,16,6,2,5,2019,3,3,148000,4
1057390,16,40,16,6,2,5,2019,5,5,329800,7
1057391,16,40,16,6,2,5,2019,10,7,557800,7
1057392,16,40,16,6,2,5,2019,12,3,247800,3


In [None]:
# feature, target 설정
train_num = df_num.sample(frac=1, random_state=0)
train_features = train_num.drop(['CSTMR_CNT', 'AMT', 'CNT'], axis=1)
train_target = np.log1p(train_num['AMT'])

In [None]:
def lgb_cv(num_leaves, learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha,reg_lambda, x_data=None, y_data=None, n_splits=5, output='model'):
    score = 0
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=201)
    models = []
    for train_index, valid_index in skf.split(x_data,x_data['month']):
       #print(train_index)
       x_train, y_train = x_data.iloc[train_index], y_data.iloc[train_index]
       x_valid, y_valid = x_data.iloc[valid_index], y_data.iloc[valid_index]
        
       model = lgb.LGBMRegressor(
               
           num_leaves = int(num_leaves), 
           learning_rate = learning_rate, 
           n_estimators = int(n_estimators), 
           subsample = np.clip(subsample, 0, 1), 
           colsample_bytree = np.clip(colsample_bytree, 0, 1), 
           reg_alpha = reg_alpha, 
           reg_lambda = reg_lambda,
           )
       
       model.fit(x_train, y_train)
       models.append(model)
        
       pred = model.predict(x_valid)
       true = y_valid
       RMSE = (mean_squared_error(true, pred)**0.5)*(-1)
       score += RMSE/n_splits
    
    if output == 'score':
        return score
    if output == 'model':
        return models

In [None]:
func_fixed = partial(lgb_cv, x_data=train_features, y_data=train_target, n_splits=5, output='score') 

lgbBO = BayesianOptimization(
    func_fixed, 
    {
        'num_leaves': (4, 16),
        'learning_rate': (0.0001, 0.1),
        'n_estimators': (4, 16),
        'subsample': (0.01, 1),
        'colsample_bytree': (0.01, 1),
        'reg_alpha': (0.01, 10), 
        'reg_lambda': (0.01, 50),
    }, 
    random_state=4321
)
lgbBO.maximize(init_points=15, n_iter=5) #30

|   iter    |  target   | colsam... | learni... | n_esti... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-2.484   [0m | [0m 0.08009 [0m | [0m 0.08152 [0m | [0m 13.21   [0m | [0m 7.436   [0m | [0m 1.939   [0m | [0m 48.95   [0m | [0m 0.4122  [0m |
| [0m 2       [0m | [0m-2.498   [0m | [0m 0.7602  [0m | [0m 0.009006[0m | [0m 7.719   [0m | [0m 11.43   [0m | [0m 4.605   [0m | [0m 10.92   [0m | [0m 0.6669  [0m |
| [95m 3       [0m | [95m-2.421   [0m | [95m 0.6819  [0m | [95m 0.09504 [0m | [95m 7.375   [0m | [95m 11.44   [0m | [95m 3.839   [0m | [95m 20.02   [0m | [95m 0.9432  [0m |
| [95m 4       [0m | [95m-2.416   [0m | [95m 0.9306  [0m | [95m 0.09484 [0m | [95m 8.506   [0m | [95m 8.108   [0m | [95m 6.651   [0m | [95m 2.126   [0m | [95m 0.2399  [0m |
| [0m 5       [0m | [0m-2.499  

In [None]:
params = lgbBO.max['params']
models = lgb_cv(
    params['num_leaves'],   
    params['learning_rate'], 
    params['n_estimators'], 
    params['subsample'], 
    params['colsample_bytree'], 
    params['reg_alpha'], 
    params['reg_lambda'], 
    x_data=train_features, y_data=train_target, n_splits=5, output='model')

In [None]:
# 예측 템플릿 만들기
CARD_SIDO_NMs = df_num['CARD_SIDO_NM'].unique()
STD_CLSS_NMs  = df_num['STD_CLSS_NM'].unique()
HOM_SIDO_NMs  = df_num['HOM_SIDO_NM'].unique()
AGEs          = df_num['AGE'].unique()
SEX_CTGO_CDs  = df_num['SEX_CTGO_CD'].unique()
FLCs          = df_num['FLC'].unique()
years         = [2020]
months        = [4, 7]

temp = []
for CARD_SIDO_NM in CARD_SIDO_NMs:
    for STD_CLSS_NM in STD_CLSS_NMs:
        for HOM_SIDO_NM in HOM_SIDO_NMs:
            for AGE in AGEs:
                for SEX_CTGO_CD in SEX_CTGO_CDs:
                    for FLC in FLCs:
                        for year in years:
                            for month in months:
                                temp.append([CARD_SIDO_NM, STD_CLSS_NM, HOM_SIDO_NM, AGE, SEX_CTGO_CD, FLC, year, month])
temp = np.array(temp)
temp = pd.DataFrame(data=temp, columns=train_features.columns)

In [None]:
preds = []
for model in models:
    pred = model.predict(temp)
    preds.append(pred)

pred = np.mean(preds, axis=0)

In [None]:
temp['AMT'] = pred

In [None]:
temp['AMT'] = np.round(pred, 0)
temp['REG_YYMM'] = temp['year']*100 + temp['month']
temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

In [None]:
# 디코딩 
temp['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(temp['CARD_SIDO_NM'])
temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

In [None]:
sub = sub.drop(['AMT'], axis=1)
sub = sub.merge(temp, left_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], right_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], how='left')
sub.index.name = 'id'
sub.to_csv('/content/drive/My Drive/Colab Notebooks/data/데이콘/제주/submission_baseline_1(6).csv', encoding='utf-8-sig')
sub.head()

Unnamed: 0_level_0,id,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,202004,강원,건강보조식품 소매업,16660.0
1,1,202004,강원,골프장 운영업,16660.0
2,2,202004,강원,과실 및 채소 소매업,16660.0
3,3,202004,강원,관광 민예품 및 선물용품 소매업,16660.0
4,4,202004,강원,그외 기타 분류안된 오락관련 서비스업,16660.0


오버피팅이 심하게 되는 것으로 보임. 독립변수들을 원핫인코딩으로 바꿔주는 것이 좋을 것 같음.

# Base Line 2

In [None]:
data.head()

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,HOM_SIDO_NM,HOM_CCG_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT
0,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,20s,1,1,4,311200,4
1,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,1,2,7,1374500,8
2,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,2,2,6,818700,6
3,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,3,4,1717000,5
4,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,4,3,1047300,3


In [None]:
data.drop(['CARD_CCG_NM','HOM_CCG_NM','CSTMR_CNT','CNT'],axis=1,inplace=True)

In [None]:
data=data.groupby(['REG_YYMM','CARD_SIDO_NM','STD_CLSS_NM','HOM_SIDO_NM','AGE','SEX_CTGO_CD','FLC'])['AMT'].sum().reset_index()

In [None]:
data['year']=data['REG_YYMM'].apply(lambda x: int(str(x)[:4]))
data['month'] = data['REG_YYMM'].apply(lambda x: int(str(x)[4:]))
data.drop('REG_YYMM',axis=1,inplace=True)
data.head()

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,AMT,year,month
0,강원,건강보조식품 소매업,강원,20s,1,1,311200,2019,1
1,강원,건강보조식품 소매업,강원,30s,1,2,5169295,2019,1
2,강원,건강보조식품 소매업,강원,30s,2,2,8618550,2019,1
3,강원,건강보조식품 소매업,강원,40s,1,2,454420,2019,1
4,강원,건강보조식품 소매업,강원,40s,1,3,6441165,2019,1


In [None]:
data.loc[(data['month']>=3) & (data['month']<=5),'season'] = 'spring'
data.loc[(data['month']>=6) & (data['month']<8),'season'] = 'summer'
data.loc[(data['month']>=9) & (data['month']<=11),'season'] = 'fall'
data.loc[(data['month']==12) | (data['month']<=2),'season'] = 'winter'

In [None]:
for col in data.columns:
  if col !='AMT':
    data[col] = data[col].astype('category')
data.dtypes

CARD_SIDO_NM    category
STD_CLSS_NM     category
HOM_SIDO_NM     category
AGE             category
SEX_CTGO_CD     category
FLC             category
AMT                int64
year            category
month           category
season          category
dtype: object

In [None]:
x_data = data.drop('AMT',axis=1)
y_data = np.log1p(data['AMT'])

In [None]:
def lgb_cv(max_depth,num_leaves, learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha,reg_lambda, x_data=None, y_data=None, n_splits=5, output='model'):
    score = 0
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=201)
    models = []
    for train_index, valid_index in skf.split(x_data,x_data['STD_CLSS_NM']):
       #print(train_index)
       x_train, y_train = x_data.iloc[train_index], y_data.iloc[train_index]
       x_valid, y_valid = x_data.iloc[valid_index], y_data.iloc[valid_index]
        
       model = lgb.LGBMRegressor(
               
           num_leaves = int(num_leaves), 
           learning_rate = learning_rate, 
           n_estimators = int(n_estimators), 
           subsample = np.clip(subsample, 0, 1), 
           colsample_bytree = np.clip(colsample_bytree, 0, 1), 
           reg_alpha = reg_alpha, 
           reg_lambda = reg_lambda,
           )
       
       model.fit(x_train, y_train)
       models.append(model)
        
       pred = model.predict(x_valid)
       true = y_valid
       RMSLE = (mean_squared_log_error(true, pred))*(-1)
       score += RMSLE/n_splits
    
    if output == 'score':
        return score
    if output == 'model':
        return models

In [None]:
func_fixed = partial(lgb_cv, x_data=x_data, y_data=y_data, n_splits=5, output='score') 

lgbBO = BayesianOptimization(
    func_fixed, 
    {
        'max_depth' : (4,8),
        'num_leaves': (4, 8),
        'learning_rate': (0.0001, 0.1),
        'n_estimators': (4, 8),
        'subsample': (0.01, 1),
        'colsample_bytree': (0.01, 1),
        'reg_alpha': (0.01, 10), 
        'reg_lambda': (0.01, 50),
    }, 
    random_state=4321
)
lgbBO.maximize(init_points=15, n_iter=5) #30  

|   iter    |  target   | colsam... | learni... | max_depth | n_esti... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.02636 [0m | [0m 0.08009 [0m | [0m 0.08152 [0m | [0m 7.072   [0m | [0m 5.145   [0m | [0m 4.772   [0m | [0m 9.789   [0m | [0m 20.32   [0m | [0m 0.7602  [0m |
| [0m 2       [0m | [0m-0.02636 [0m | [0m 0.09826 [0m | [0m 0.03106 [0m | [0m 6.476   [0m | [0m 5.84    [0m | [0m 4.873   [0m | [0m 6.639   [0m | [0m 33.94   [0m | [0m 0.9508  [0m |
| [95m 3       [0m | [95m-0.02623 [0m | [95m 0.2884  [0m | [95m 0.06202 [0m | [95m 5.533   [0m | [95m 5.601   [0m | [95m 7.771   [0m | [95m 9.3     [0m | [95m 47.42   [0m | [95m 0.3817  [0m |
| [95m 4       [0m | [95m-0.02621 [0m | [95m 0.3489  [0m | [95m 0.06651 [0m | [95m 4.169   [0m | [95m 4.929   [0m | [95m 5.72  

In [None]:
params = lgbBO.max['params']
models = lgb_cv(
    params['max_depth'],
    params['num_leaves'],   
    params['learning_rate'], 
    params['n_estimators'], 
    params['subsample'], 
    params['colsample_bytree'], 
    params['reg_alpha'], 
    params['reg_lambda'], 
    x_data=x_data, y_data=y_data, n_splits=5, output='model')

In [None]:
# data 템플릿 만드기
CARD_SIDO_NMs = data['CARD_SIDO_NM'].unique()
STD_CLSS_NMs  = data['STD_CLSS_NM'].unique()
HOM_SIDO_NMs  = data['HOM_SIDO_NM'].unique()
AGEs          = data['AGE'].unique()
SEX_CTGO_CDs  = data['SEX_CTGO_CD'].unique()
FLCs          = data['FLC'].unique()
years           = [2020]
months           = [4,7]
#seasons         = ['spring','summer']

temp = []
for CARD_SIDO_NM in CARD_SIDO_NMs:
    for STD_CLSS_NM in STD_CLSS_NMs:
        for HOM_SIDO_NM in HOM_SIDO_NMs:
            for AGE in AGEs:
                for SEX_CTGO_CD in SEX_CTGO_CDs:
                    for FLC in FLCs:
                      for year in years:
                        for month in months:
                          if month ==4:
                            season ='spring'
                          else:
                            season ='summer'                         
                          
                          temp.append([CARD_SIDO_NM, STD_CLSS_NM, HOM_SIDO_NM, AGE, SEX_CTGO_CD, FLC,year,month,season])

temp = np.array(temp)
temp = pd.DataFrame(data=temp, columns=['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC','year','month','season'])

In [None]:
for col in temp.columns:
  temp[col] = temp[col].astype('category')

In [None]:
preds = []
for model in models:
    pred = model.predict(temp)
    preds.append(pred)

pred = np.mean(preds, axis=0)

temp['AMT'] = pred

In [None]:
temp['REG_YYMM']= temp['year'].astype('object') +temp['month'].astype('object')
temp['REG_YYMM'] = temp['REG_YYMM'].astype('int64')

In [None]:
temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

In [None]:
temp = temp.reset_index()
temp['AMT'] = np.round(temp['AMT'], 0)
temp['CARD_SIDO_NM'] = temp['CARD_SIDO_NM'].astype('object')
temp['STD_CLSS_NM'] = temp['STD_CLSS_NM'].astype('object')
temp['AMT'] = temp['AMT'].astype('int64')
# temp.to_csv('/content/drive/My Drive/Colab Notebooks/data/데이콘/제주/submission_baseline_3(7).csv', encoding='utf-8-sig',index=False)
# temp.head()

In [None]:
sub = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/데이콘/제주/submission.csv')
sub = sub.drop(['AMT'], axis=1)
sub.merge(temp,on=['REG_YYMM','CARD_SIDO_NM','STD_CLSS_NM'])

Unnamed: 0,id,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,index,AMT


In [None]:
sub['AMT'] = temp['AMT']
sub.head()

Unnamed: 0,id,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
0,0,202004,강원,건강보조식품 소매업,16817
1,1,202004,강원,골프장 운영업,17144
2,2,202004,강원,과실 및 채소 소매업,16711
3,3,202004,강원,관광 민예품 및 선물용품 소매업,16544
4,4,202004,강원,그외 기타 분류안된 오락관련 서비스업,16544


In [None]:
sub.to_csv('/content/drive/My Drive/Colab Notebooks/data/데이콘/제주/submission_baseline_3(10).csv', encoding='utf-8-sig',index=False)