RandomForest Regression - 계절 별

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
data = pd.read_csv('./data/total_data.csv')
data = data[data['대분류명']=='뷰티']
data.head()

Unnamed: 0,계절,월,일,요일,공휴일,성별,연령대,대분류명,소분류명,평균일강수량(mm),평균풍속(km/h),평균습도(%rh),일조합,체감온도(℃),일 미세먼지 농도(㎍/㎥),10만건당 건수,구매건수
389,겨울,1,1,0,1.0,F,20,뷰티,기능성 링클케어 화장품,0.0,6.84,51.0,8.7,-2.810026,32.962963,12.154295,2
390,겨울,1,1,0,1.0,F,40,뷰티,기능성 링클케어 화장품,0.0,6.84,51.0,8.7,-2.810026,32.962963,12.154295,1
391,겨울,1,1,0,1.0,F,20,뷰티,기능성 모공관리 화장품,0.0,6.84,51.0,8.7,-2.810026,32.962963,36.000828,7
392,겨울,1,1,0,1.0,F,30,뷰티,기능성 모공관리 화장품,0.0,6.84,51.0,8.7,-2.810026,32.962963,36.000828,3
393,겨울,1,1,0,1.0,F,40,뷰티,기능성 모공관리 화장품,0.0,6.84,51.0,8.7,-2.810026,32.962963,36.000828,1


타깃 변수 로그 변환

In [3]:
data['구매건수'] = np.log1p(data['구매건수'])

In [4]:
data.shape

(697853, 17)

In [5]:
data = data.drop(['대분류명','평균습도(%rh)','일조합'], axis=1)

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 697853 entries, 389 to 2056849
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   계절              697853 non-null  object 
 1   월               697853 non-null  int64  
 2   일               697853 non-null  int64  
 3   요일              697853 non-null  int64  
 4   공휴일             697853 non-null  float64
 5   성별              697853 non-null  object 
 6   연령대             697853 non-null  int64  
 7   소분류명            697853 non-null  object 
 8   평균일강수량(mm)      697853 non-null  float64
 9   평균풍속(km/h)      697853 non-null  float64
 10  체감온도(℃)         697853 non-null  float64
 11  일 미세먼지 농도(㎍/㎥)  695969 non-null  float64
 12  10만건당 건수        697853 non-null  float64
 13  구매건수            697853 non-null  float64
dtypes: float64(7), int64(4), object(3)
memory usage: 79.9+ MB


In [6]:
data.isnull().sum()

계절                   0
월                    0
일                    0
요일                   0
공휴일                  0
성별                   0
연령대                  0
소분류명                 0
평균일강수량(mm)           0
평균풍속(km/h)           0
체감온도(℃)              0
일 미세먼지 농도(㎍/㎥)    1884
10만건당 건수             0
구매건수                 0
dtype: int64

라벨인코딩

In [7]:
data = data.dropna().reset_index(drop=True)
data['성별'] = data['성별'].replace(['F','M'],[0,1])
data['계절'] = data['계절'].replace(['봄','여름','가을','겨울'],[0,1,2,3])
encoder = LabelEncoder()
data['소분류명'] = encoder.fit_transform(data['소분류명'])

In [8]:
봄 = data[data['계절']==0].reset_index(drop=True)
여름 = data[data['계절']==1].reset_index(drop=True)
가을 = data[data['계절']==2].reset_index(drop=True)
겨울 = data[data['계절']==3].reset_index(drop=True)
print(봄.shape, 여름.shape, 가을.shape, 겨울.shape)

(178079, 14) (177458, 14) (169428, 14) (171004, 14)


이상치 대체 함수

In [9]:
def remove_outlier(data,column):
    df = data[column]
    # 1분위수
    quan_25 = np.percentile(df.values, 25)    
    # 3분위수
    quan_75 = np.percentile(df.values, 75)    
    iqr = quan_75 - quan_25   
    lowest = quan_25 - iqr * 1.5
    highest = quan_75 + iqr * 1.5
    outlier_index = df[(df < lowest) | (df > highest)].index
    print('outlier의 수 : ' , len(outlier_index))
    # 평균으로 대체
    df[outlier_index]=df.mean()    
    return data

In [10]:
print(봄.shape, 여름.shape, 가을.shape, 겨울.shape)
candidate = ['평균일강수량(mm)', '평균풍속(km/h)', '일 미세먼지 농도(㎍/㎥)']
for cand in candidate:  
    봄 = remove_outlier(봄,cand)
    여름 = remove_outlier(여름,cand)
    가을 = remove_outlier(가을,cand)
    겨울 = remove_outlier(겨울,cand)
print(봄.shape, 여름.shape, 가을.shape, 겨울.shape)

(178079, 14) (177458, 14) (169428, 14) (171004, 14)
outlier의 수 :  34801
outlier의 수 :  25012
outlier의 수 :  28961
outlier의 수 :  35392
outlier의 수 :  5767
outlier의 수 :  4690
outlier의 수 :  10356
outlier의 수 :  947
outlier의 수 :  7724
outlier의 수 :  0
outlier의 수 :  10530
outlier의 수 :  4708
(178079, 14) (177458, 14) (169428, 14) (171004, 14)


rmsle 계산 함수

In [11]:
def rmsle(y_pred, y_test) :
    assert len(y_test) == len(y_pred)
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_test))**2))

In [12]:
df = pd.concat([봄,여름,가을,겨울]).reset_index(drop=True)
df.shape

(695969, 14)

In [13]:
봄 = data[data['계절']==0].reset_index(drop=True)
여름 = data[data['계절']==1].reset_index(drop=True)
가을 = data[data['계절']==2].reset_index(drop=True)
겨울 = data[data['계절']==3].reset_index(drop=True)
print(봄.shape, 여름.shape, 가을.shape, 겨울.shape)

(178079, 14) (177458, 14) (169428, 14) (171004, 14)


봄 RandomForest

In [19]:
spring_X = 봄.drop(['구매건수'], axis=1, inplace=False)
spring_y = 봄['구매건수']

spring_X_train, spring_X_test, spring_y_train, spring_y_test = train_test_split(spring_X, spring_y,
                                                                        test_size=0.3,
                                                                        random_state=42)

In [20]:
spring_rf = RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)
spring_rf.fit(spring_X_train,spring_y_train)
spring_pred = spring_rf.predict(spring_X_test)
print('RMSE :', mean_squared_error(spring_pred, spring_y_test)**0.5)

RMSE : 0.4724293849522659


In [23]:
print('RMSLE :', rmsle(spring_pred, spring_y_test))

RMSLE : 0.1701834719104395


여름 RandomForest

In [21]:
summer_X = 여름.drop(['구매건수'], axis=1, inplace=False)
summer_y = 여름['구매건수']

summer_X_train, summer_X_test, summer_y_train, summer_y_test = train_test_split(summer_X, summer_y,
                                                                        test_size=0.3,
                                                                        random_state=42)

In [22]:
summer_rf = RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)
summer_rf.fit(summer_X_train,summer_y_train)
summer_pred = summer_rf.predict(summer_X_test)
print('RMSE :', mean_squared_error(summer_pred, summer_y_test)**0.5)

RMSE : 0.43179087558703316


In [24]:
print('RMSLE :', rmsle(summer_pred, summer_y_test))

RMSLE : 0.16126974114203585


가을 RandomForest

In [15]:
fall_X = 가을.drop(['구매건수'], axis=1, inplace=False)
fall_y = 가을['구매건수']

fall_X_train, fall_X_test, fall_y_train, fall_y_test = train_test_split(fall_X, fall_y,
                                                                        test_size=0.3,
                                                                        random_state=42)

In [16]:
fall_rf = RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)
fall_rf.fit(fall_X_train,fall_y_train)
fall_pred = fall_rf.predict(fall_X_test)
print('RMSE :', mean_squared_error(fall_pred, fall_y_test)**0.5)

RMSE : 0.4258987449465712


In [25]:
print('RMSLE :', rmsle(fall_pred, fall_y_test))

RMSLE : 0.15903517298887615


겨울 RandomForest

In [17]:
winter_X = 겨울.drop(['구매건수'], axis=1, inplace=False)
winter_y = 겨울['구매건수']

winter_X_train, winter_X_test, winter_y_train, winter_y_test = train_test_split(winter_X, winter_y,
                                                                        test_size=0.3,
                                                                        random_state=42)

In [18]:
winter_rf = RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)
winter_rf.fit(winter_X_train,winter_y_train)
winter_pred = winter_rf.predict(winter_X_test)
print('RMSE :', mean_squared_error(winter_pred, winter_y_test)**0.5)

RMSE : 0.4381070739255093


In [26]:
print('RMSLE :', rmsle(winter_pred, winter_y_test))

RMSLE : 0.16208348509198328
