<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#2.-여름" data-toc-modified-id="2.-여름-1">2. 여름</a></span></li><li><span><a href="#3.-가을" data-toc-modified-id="3.-가을-2">3. 가을</a></span></li><li><span><a href="#4.-겨울" data-toc-modified-id="4.-겨울-3">4. 겨울</a></span></li></ul></div>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'Malgun Gothic'
import seaborn as sns
%matplotlib inline
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import ExtraTreesRegressor
import pickle
import joblib

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('./data/total_data.csv')
print(data.shape)
data.head(3)

(2056899, 17)


Unnamed: 0,계절,월,일,요일,공휴일,성별,연령대,대분류명,소분류명,평균일강수량(mm),평균풍속(km/h),평균습도(%rh),일조합,체감온도(℃),일 미세먼지 농도(㎍/㎥),10만건당 건수,구매건수
0,겨울,1,1,0,1.0,F,20,식품,가공란,0.0,6.84,51.0,8.7,-2.810026,32.962963,0.480964,37
1,겨울,1,1,0,1.0,F,30,식품,가공란,0.0,6.84,51.0,8.7,-2.810026,32.962963,0.480964,16
2,겨울,1,1,0,1.0,F,40,식품,가공란,0.0,6.84,51.0,8.7,-2.810026,32.962963,0.480964,9


In [3]:
beauty = data[data['대분류명']=='뷰티']
beauty.drop(['대분류명','평균습도(%rh)','일조합','구매건수'], axis=1, inplace=True)
beauty.dropna(inplace=True)
beauty.reset_index(drop=True, inplace=True)
print(beauty.shape)
beauty.head(3)

(695969, 13)


Unnamed: 0,계절,월,일,요일,공휴일,성별,연령대,소분류명,평균일강수량(mm),평균풍속(km/h),체감온도(℃),일 미세먼지 농도(㎍/㎥),10만건당 건수
0,겨울,1,1,0,1.0,F,20,기능성 링클케어 화장품,0.0,6.84,-2.810026,32.962963,12.154295
1,겨울,1,1,0,1.0,F,40,기능성 링클케어 화장품,0.0,6.84,-2.810026,32.962963,12.154295
2,겨울,1,1,0,1.0,F,20,기능성 모공관리 화장품,0.0,6.84,-2.810026,32.962963,36.000828


대분류명, 평균습도(다중공선성 문제), 일조합(추후 데이터 수집 문제), 구매건수 변수 제외 및 결측치 제거

In [4]:
봄 = beauty[beauty['계절']=='봄'].reset_index(drop=True).drop('계절', axis=1)
여름 = beauty[beauty['계절']=='여름'].reset_index(drop=True).drop('계절', axis=1)
가을 = beauty[beauty['계절']=='가을'].reset_index(drop=True).drop('계절', axis=1)
겨울 = beauty[beauty['계절']=='겨울'].reset_index(drop=True).drop('계절', axis=1)

# 2. 여름

In [5]:
# 이상치 확인
def outliers(data):
    q1, q3 = np.percentile(data, [25,75])
    iqr = q3 - q1
    upper_bound = q3 + (iqr * 1.5)
    lower_bound = q1 - (iqr * 1.5)
    return np.where((data > upper_bound) | (data < lower_bound))

In [9]:
def preprocessing(data):
    
    # 이상치 처리
    강수량_idx = outliers(data['평균일강수량(mm)'])[0]
    미세먼지_idx = outliers(data['일 미세먼지 농도(㎍/㎥)'])[0]
    풍속_idx = outliers(data['평균풍속(km/h)'])[0]

    강수량_mean = data['평균일강수량(mm)'].mean()
    data.loc[강수량_idx, '평균일강수량(mm)'] = 강수량_mean

    미세먼지_mean = data['일 미세먼지 농도(㎍/㎥)'].mean()
    data.loc[미세먼지_idx, '일 미세먼지 농도(㎍/㎥)'] = 미세먼지_mean

    풍속_mean = data['평균풍속(km/h)'].mean()
    data.loc[풍속_idx, '평균풍속(km/h)'] = 풍속_mean
    
    # 타깃 값 로그 변환
    data['10만건당 건수'] = np.log1p(data["10만건당 건수"])
    
    # 범주형 변수 레이블 인코딩
    le = LabelEncoder()
    le = le.fit(data['성별'])
    data['성별'] = le.transform(data['성별'])

    le2 = LabelEncoder()
    le2 = le2.fit(data['소분류명'])
    data['소분류명'] = le2.transform(data['소분류명'])

    label_df = data[['성별','소분류명']]
    
    # 연속형 변수 스케일링
    nu = data.drop(['성별','소분류명','10만건당 건수'], axis=1)

    scaler = StandardScaler()
    scaler = scaler.fit(nu)
    scaled = scaler.transform(nu)
    scaled_df = pd.DataFrame(scaled, columns=nu.columns)
    
    # 데이터셋 분리
    X = pd.concat([scaled_df, label_df], axis=1)
    y = data['10만건당 건수']

    return le, le2, X, y

In [10]:
def etr_modeling(data, model_name):
    
    le, le2, X, y = preprocessing(data)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        shuffle=True, random_state=11)
    
    # Extra Trees Regressor 모델 학습
    param_grid = {'n_estimators':[100,200],'max_features':[4,8,11],'min_samples_split':[2,4,8]}
    grid = GridSearchCV(ExtraTreesRegressor(), param_grid=param_grid,
                        scoring=['neg_mean_squared_error'], refit='neg_mean_squared_error',
                        return_train_score=True, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    print('Best Params : ', grid.best_params_)
    print('RMSE : {0:.3f}'.format(np.sqrt(-1 * grid.best_score_)))
    
    model = grid.best_estimator_

    # 예측 모델 저장
    joblib.dump(model, f'./model/ExtraTreesRegressor({model_name})') 

In [7]:
etr_modeling(여름, '여름')

Best Params :  {'max_features': 11, 'min_samples_split': 2, 'n_estimators': 100}
RMSE : 0.082


# 3. 가을

In [8]:
etr_modeling(가을, '가을')

Best Params :  {'max_features': 11, 'min_samples_split': 2, 'n_estimators': 200}
RMSE : 0.076


# 4. 겨울

In [11]:
etr_modeling(겨울, '겨울')

Best Params :  {'max_features': 11, 'min_samples_split': 4, 'n_estimators': 200}
RMSE : 0.082
