# 1. 라이브러리 선언

In [1]:
import pandas as pd
import numpy as np

In [2]:
# 분석알고리즘 DecisionTree 구현 라이브러리
from sklearn.tree import DecisionTreeRegressor # 설명력이 뛰어남

In [3]:
# 과거데이터를 8:2, 7:3 으로 자동으로 나누어주는 함수
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
# Regression  평가 대표 지표 MAE, RMSE
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [7]:
from sklearn.ensemble import RandomForestRegressor

In [9]:
featuresData = \
    pd.read_csv("../../dataset/feature_regression_example.csv")

In [10]:
featuresData.dtypes

REGIONID         object
PRODUCTGROUP     object
PRODUCT          object
ITEM             object
YEARWEEK          int64
YEAR              int64
WEEK              int64
QTY               int64
HOLIDAY          object
HCLUS             int64
PROMOTION        object
PRO_PERCENT     float64
dtype: object

# 1. 데이터 전처리

##### 1-1 타입 통합 / 특성 숫자컬럼 추가

##### 1-1-1 데이터 통합

In [11]:
featuresData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   REGIONID      105 non-null    object 
 1   PRODUCTGROUP  105 non-null    object 
 2   PRODUCT       105 non-null    object 
 3   ITEM          105 non-null    object 
 4   YEARWEEK      105 non-null    int64  
 5   YEAR          105 non-null    int64  
 6   WEEK          105 non-null    int64  
 7   QTY           105 non-null    int64  
 8   HOLIDAY       105 non-null    object 
 9   HCLUS         105 non-null    int64  
 10  PROMOTION     105 non-null    object 
 11  PRO_PERCENT   105 non-null    float64
dtypes: float64(1), int64(5), object(6)
memory usage: 10.0+ KB


In [12]:
# 주의할 사항은 모든 컬럼에 대해서 고정시키는걸 고려하는게 나을수도 있다.
featuresData.QTY = featuresData.QTY.astype(float)

##### 1-1-2 특성 값 숫자컬럼 변경

In [13]:
featuresData.columns

Index(['REGIONID', 'PRODUCTGROUP', 'PRODUCT', 'ITEM', 'YEARWEEK', 'YEAR',
       'WEEK', 'QTY', 'HOLIDAY', 'HCLUS', 'PROMOTION', 'PRO_PERCENT'],
      dtype='object')

In [14]:
holiEn = LabelEncoder()

In [16]:
featuresData["HOLIDAY_EN"] = \
    holiEn.fit_transform(featuresData.HOLIDAY)

In [12]:
# featuresData["HOLIDAY_NEW"] = \
#     np.where(featuresData.HOLIDAY == "Y",1,0)

# pd.DataFrame( featuresData.HOLIDAY.drop_duplicates() ).reset_index().\
#     rename(columns = {"index" : "HOLIDAY_NEW"})

In [17]:
featuresData["HOLIDAY_LABEL_EN"] = holiEn.fit_transform(featuresData.HOLIDAY)

In [18]:
featuresData["HOLIDAY_DE"] = \
    holiEn.inverse_transform(featuresData.HOLIDAY_LABEL_EN)

In [19]:
featuresData.loc[featuresData.HOLIDAY != featuresData.HOLIDAY_DE]

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HOLIDAY_EN,HOLIDAY_LABEL_EN,HOLIDAY_DE


##### 1-2-1 특성 선정/ 데이터 분리

In [16]:
# 상관갯수 보는 함수.
corrDf = featuresData.corr()  # 기본 값은 피어쓴으로 되어있음.

In [17]:
standardLimit = 0.5

In [18]:
# 문제지
features = \
    list(corrDf.loc[ (abs(corrDf.QTY) > standardLimit) &
                    (corrDf.QTY != 1)].index )

In [19]:
label = ["QTY"]  # 정답지

##### 1-2-2 데이터 분리

In [20]:
featuresData.shape

(105, 15)

In [27]:
standardIndex = 0.8

In [28]:
# 과거 데이터를 7:3 or 8:2 로 분리를 해야함.
sortKey = ["REGIONID", "ITEM", "YEARWEEK"]

In [29]:
sortedData = featuresData.sort_values(sortKey, ignore_index=True)

In [31]:
standardIndex = int(list( sortedData.shape)[0]* standardIndex)

In [32]:
yearweekStd = sortedData.loc[standardIndex].YEARWEEK

In [33]:
yearweekStd

201632

In [40]:
# 훈련데이터와 테스트데이터를 (문제지와 정답지로 구분해서 정의한다.)
trainingDataFeatures = \
    sortedData.loc[sortedData.YEARWEEK <= yearweekStd, features]
trainingDataLabel = \
    sortedData.loc[sortedData.YEARWEEK <= yearweekStd, label]
testDataFeatures = \
    sortedData.loc[sortedData.YEARWEEK <= yearweekStd, features]
testDataLabel = \
    sortedData.loc[sortedData.YEARWEEK <= yearweekStd, label]

In [42]:
trainingDataLabel

Unnamed: 0,QTY
0,1225.0
1,968.0
2,1209.0
3,1810.0
4,1773.0
...,...
80,1616.0
81,1318.0
82,2240.0
83,1522.0


### 2.모델 적용

##### 2-1-1 학습

In [43]:
# random_state 를 고정시키지 않으면 예측 값이 계속 바뀜
model = DecisionTreeRegressor(random_state=10)

In [45]:
model.fit(X = trainingDataFeatures, y = trainingDataLabel)

DecisionTreeRegressor(random_state=10)

### 3. 예측

In [48]:
predictValue = model.predict(testDataFeatures)

In [49]:
predictDf = \
    pd.DataFrame( list(predictValue ), columns = ["PREDICT"])

Unnamed: 0,QTY,PREDICT
0,1225.0,1305.750000
1,968.0,900.500000
2,1209.0,1009.666667
3,1810.0,1810.000000
4,1773.0,1009.666667
...,...,...
80,1616.0,1434.600000
81,1318.0,1434.600000
82,2240.0,1708.750000
83,1522.0,1708.750000


### 4. 데이터 정리

In [56]:
validateDf = \
    pd.concat( [testDataLabel, predictDf], axis=1)

In [58]:
### 5. 정확도 검증

In [63]:
MAE = mean_absolute_error(y_true=validateDf.QTY,
                   y_pred=validateDf.PREDICT)

In [64]:
RMSE = np.sqrt( mean_squared_error(y_true=validateDf.QTY,
                           y_pred=validateDf.PREDICT))

In [None]:
### DecisionTree => 과거의 겸험치를 그대로 반영
### q   변동성이 큰 데이터에서 강하다
### 오버피팅  [너무 과거에 얽매여서] 다른 일을 못잡음
### randomForest -> 장점: DecisionTree 오버피팅 해결하고
###  단점 : 설명력이 좀 어렵다

In [60]:
validateDf

Unnamed: 0,QTY,PREDICT
0,1225.0,1305.750000
1,968.0,900.500000
2,1209.0,1009.666667
3,1810.0,1810.000000
4,1773.0,1009.666667
...,...,...
80,1616.0,1434.600000
81,1318.0,1434.600000
82,2240.0,1708.750000
83,1522.0,1708.750000
