### Knock71: Directory 生成して Data 読み込みの準備をしよう
まず初回に考えること => Directory 構造

In [1]:
# Create the directory layout used by this notebook
import os

data_dir = 'data'
# Input, output and master data each get their own sub-directory under data_dir
input_dir, output_dir, master_dir = (
    os.path.join(data_dir, _sub_dir) for _sub_dir in ('0_input', '1_output', '99_master')
)
model_dir = 'models'

# exist_ok=True keeps this cell re-runnable when the directories already exist
for _path in (input_dir, output_dir, master_dir, model_dir):
    os.makedirs(_path, exist_ok=True)

### Knock72: 予測したい新規 Data を読み込もう

In [2]:
# Load the master tables (area and store) from the master directory
import pandas as pd

m_area_file, m_store_file = 'm_area.csv', 'm_store.csv'
m_area, m_store = (
    pd.read_csv(os.path.join(master_dir, _file_name))
    for _file_name in (m_area_file, m_store_file)
)

新規 Data の読み込みには、簡易的な **Data check 機構** を入れる。

In [3]:
# Load the new data, then check that every order falls inside the target month
tg_ym = '202003'
target_file = f'tbl_order_{tg_ym}.csv'
target_data = pd.read_csv(os.path.join(input_dir, target_file))

import datetime

# Parse the acceptance dates once and reuse them for both extremes
order_accept_dates = pd.to_datetime(target_data['order_accept_date'])
max_date = order_accept_dates.max()
min_date = order_accept_dates.min()
max_str_date = max_date.strftime('%Y%m')
min_str_date = min_date.strftime('%Y%m')

# Both the earliest and the latest order must be in the target year-month
if min_str_date != tg_ym or max_str_date != tg_ym:
    raise Exception("日付が一致しません")
print("日付が一致しました")

日付が一致しました


### Knock73: 新規 Data を店舗別で集計しよう
- 機械学習で予測するためには、新規 Data を Model 構築時の説明変数 X と同じ形式に加工する必要がある。
- 新規 Data は未知な Data のため、目的変数の作成や紐づけは不要
- まずは集計を行なう。

In [4]:
# 店舗別集計を行なうための関数
def calc_delta(t):
    """Return the elapsed time, in minutes, between a pair of timestamps.

    Args:
        t: A two-element sequence ``(start, end)``; ``end - start`` must
            produce an object with ``total_seconds()``.

    Returns:
        ``end - start`` expressed in minutes as a float (negative when
        ``end`` precedes ``start``).
    """
    start, end = t
    return (end - start).total_seconds() / 60


def _count_orders_by_store(orders, column_name):
    """Count non-null ``order_id`` values per ``store_name``.

    Returns a one-column DataFrame indexed by ``store_name`` whose single
    column is named ``column_name``.
    """
    # Select the column *before* aggregating so only ``order_id`` is counted.
    counts = orders.groupby('store_name')[['order_id']].count()
    counts.columns = [column_name]
    return counts


def data_processing(order_data):
    """Aggregate raw order rows into one row of features per store.

    The module-level master tables ``m_store`` and ``m_area`` are left-joined
    onto the orders (on ``store_id`` and ``area_cd`` respectively).

    Args:
        order_data: DataFrame of orders. The columns read here are
            ``order_id``, ``store_id``, ``takeout_flag``, ``status``,
            ``order_accept_date`` and ``delivered_date``.

    Returns:
        DataFrame indexed by ``store_name`` with the columns ``order``,
        ``order_fin``, ``order_cancel``, ``order_delivery``, ``order_takeout``,
        ``order_weekday``, ``order_weekend``, one ``order_time_<hour>`` column
        per acceptance hour present in the data, and ``delta_avg``. A store
        that has no rows in a category gets NaN in that column because the
        partial results are combined with an outer join.
    """
    # Rows with store_id 999 are excluded before joining the master tables.
    order_data = order_data.loc[order_data['store_id'] != 999]
    order_data = pd.merge(order_data, m_store, on='store_id', how='left')
    order_data = pd.merge(order_data, m_area, on='area_cd', how='left')

    # Human-readable labels; codes that are not listed stay NaN.
    order_data['takeout_name'] = order_data['takeout_flag'].map({0: 'デリバリー', 1: 'お持ち帰り'})
    order_data['status_name'] = order_data['status'].map(
        {0: '受付', 1: 'お支払済', 2: 'お渡し済', 9: 'キャンセル'})

    order_data['order_accept_datetime'] = pd.to_datetime(order_data['order_accept_date'])
    order_data['delivered_datetime'] = pd.to_datetime(order_data['delivered_date'])
    # Minutes from acceptance to delivery, computed column-wise instead of
    # with a row-by-row apply.
    order_data['delta'] = (
        order_data['delivered_datetime'] - order_data['order_accept_datetime']
    ).dt.total_seconds() / 60
    order_data['order_accept_hour'] = order_data['order_accept_datetime'].dt.hour
    order_data['order_accept_weekday'] = order_data['order_accept_datetime'].dt.weekday
    # weekday 5 and 6 (Saturday, Sunday) are labelled as holidays.
    order_data.loc[order_data['order_accept_weekday'] >= 5, 'weekday_info'] = '休日'
    order_data.loc[order_data['order_accept_weekday'] < 5, 'weekday_info'] = '平日'

    is_finished = (
        (order_data['status_name'] == 'お渡し済')
        | (order_data['status_name'] == 'お支払済')
    )
    store_data = _count_orders_by_store(order_data, 'order')
    store_f = _count_orders_by_store(order_data.loc[is_finished], 'order_fin')
    store_c = _count_orders_by_store(
        order_data.loc[order_data['status_name'] == 'キャンセル'], 'order_cancel')
    store_d = _count_orders_by_store(
        order_data.loc[order_data['takeout_name'] == 'デリバリー'], 'order_delivery')
    store_t = _count_orders_by_store(
        order_data.loc[order_data['takeout_name'] == 'お持ち帰り'], 'order_takeout')
    store_weekday = _count_orders_by_store(
        order_data.loc[order_data['weekday_info'] == '平日'], 'order_weekday')
    store_weekend = _count_orders_by_store(
        order_data.loc[order_data['weekday_info'] == '休日'], 'order_weekend')

    # One count column per acceptance hour, in order of first appearance.
    store_time = pd.concat(
        [
            _count_orders_by_store(
                order_data.loc[order_data['order_accept_hour'] == hour], f"order_time_{hour}")
            for hour in order_data['order_accept_hour'].unique()
        ],
        axis=1,
    )

    # Average delivery time over non-cancelled orders. Selecting ``delta``
    # before ``mean()`` keeps the aggregation away from the string columns,
    # which raise TypeError under pandas >= 2.0.
    store_delta = (
        order_data.loc[order_data['status_name'] != 'キャンセル']
        .groupby('store_name')[['delta']]
        .mean()
    )
    store_delta.columns = ['delta_avg']

    store_data = pd.concat(
        [store_data, store_f, store_c, store_d, store_t, store_weekday, store_weekend, store_time, store_delta],
        axis=1)
    return store_data

In [5]:
# Run the per-store aggregation; reset_index turns the store_name index back into a column
store_data = data_processing(target_data).reset_index(drop=False)
# Keep an untouched copy of the aggregated data before further columns are added
actual_data = store_data.copy()

### Knock74: 新規 Data の Categorical 変数対応をしよう
One-hot-encoding

In [6]:
# One-hot encode the store name
# One dummy column is dropped — presumably to mirror the encoding used at model-building time (TODO confirm)
category_data = pd.get_dummies(
    store_data['store_name'], prefix='store', prefix_sep='_'
).drop(columns='store_麻生店')
store_data = pd.concat([store_data, category_data], axis=1)
store_data.head(3)

Unnamed: 0,store_name,order,order_fin,order_cancel,order_delivery,order_takeout,order_weekday,order_weekend,order_time_11,order_time_12,...,store_駒沢店,store_駒込店,store_高円寺店,store_高島平店,store_高崎店,store_高座店,store_高津店,store_高田馬場店,store_鴻巣店,store_鶴見店
0,あきる野店,1186,958,228,858,328,839,347,104,107,...,0,0,0,0,0,0,0,0,0,0
1,さいたま南店,1553,1266,287,1145,408,1103,450,165,136,...,0,0,0,0,0,0,0,0,0,0
2,さいたま緑店,1063,875,188,807,256,754,309,113,82,...,0,0,0,0,0,0,0,0,0,0


### Knock75: Model に投入する直前の形式に整えよう

In [7]:
# Load the names of the explanatory variables that the model uses
X_cols_name = 'X_cols.csv'
# Only the 'X_cols' column of the file is kept
X_cols = pd.read_csv(os.path.join(model_dir, X_cols_name))['X_cols']

In [8]:
# Narrow the aggregated data down to the explanatory-variable columns
X = store_data.loc[:, list(X_cols)].copy()
X.head(3)

Unnamed: 0,order,order_fin,order_cancel,order_delivery,order_takeout,order_weekday,order_weekend,order_time_11,order_time_12,order_time_13,...,store_駒沢店,store_駒込店,store_高円寺店,store_高島平店,store_高崎店,store_高座店,store_高津店,store_高田馬場店,store_鴻巣店,store_鶴見店
0,1186,958,228,858,328,839,347,104,107,109,...,0,0,0,0,0,0,0,0,0,0
1,1553,1266,287,1145,408,1103,450,165,136,140,...,0,0,0,0,0,0,0,0,0,0
2,1063,875,188,807,256,754,309,113,82,79,...,0,0,0,0,0,0,0,0,0,0


説明変数の Data を準備しておき、Model 構築時にその Data を出力し、そこから絞り込みを行なうようにしておくと、もし Model の見直し等で説明変数が変わっても、ある程度対応が可能となる。

### Knock76: Model File を読み込んでみよう

In [9]:
# Load the pickled weekday / weekend models
import pickle

model_weekday_name = 'model_y_weekday_GradientBoosting.pickle'
model_weekend_name = 'model_y_weekend_GradientBoosting.pickle'

model_weekday_path, model_weekend_path = (
    os.path.join(model_dir, _file_name)
    for _file_name in (model_weekday_name, model_weekend_name)
)


def _load_pickled_model(path):
    """Return the object unpickled from the file at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code — only load model files from a trusted source.
    with open(path, mode='rb') as model_file:
        return pickle.load(model_file)


model_weekday = _load_pickled_model(model_weekday_path)
model_weekend = _load_pickled_model(model_weekend_path)

print(model_weekday)
print(model_weekend)

GradientBoostingClassifier(random_state=0)
GradientBoostingClassifier(random_state=0)


定義した Model を print で出力すると Model 構築を行なった際の Parameter 情報が出力される。

### Knock77: 新規 Data を予測してみよう

In [10]:
# Predicted class labels for the new data
pred_weekday = model_weekday.predict(X)
# The misspelled name is kept as an alias because a later cell still reads it
pred_weekdary = pred_weekday
pred_weekend = model_weekend.predict(X)
pred_weekend[:10]

array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0.])

`predict_proba()` を用いると確率を表示できる。

In [11]:
# Predicted class probabilities (one column per class)
pred_proba_weekday, pred_proba_weekend = (
    _model.predict_proba(X) for _model in (model_weekday, model_weekend)
)
pred_proba_weekend[:10]

array([[0.71866699, 0.28133301],
       [0.61231504, 0.38768496],
       [0.53604944, 0.46395056],
       [0.77746324, 0.22253676],
       [0.59048016, 0.40951984],
       [0.80779612, 0.19220388],
       [0.79061865, 0.20938135],
       [0.70969261, 0.29030739],
       [0.36925277, 0.63074723],
       [0.84098724, 0.15901276]])

- 左側は 0 と予測している確率、右側は 1 と予測している確率で、足すと 1 になる。
- `predict()` は、この確率が 0.5 を超えている方を出力している。

In [12]:
# Predicted probability of class 1 only (second column of predict_proba)
# Taken directly from predict_proba() instead of re-slicing the variables in place,
# so re-running this cell does not fail on an array that is already 1-D.
pred_proba_weekday = model_weekday.predict_proba(X)[:, 1]
pred_proba_weekend = model_weekend.predict_proba(X)[:, 1]
pred_proba_weekend[:10]

array([0.28133301, 0.38768496, 0.46395056, 0.22253676, 0.40951984,
       0.19220388, 0.20938135, 0.29030739, 0.63074723, 0.15901276])

こうすることで 1 である（目的変数が増加する）確率として細かく閾値等を設定することが可能になる。

In [13]:
# Collect the predicted labels and scores into a single DataFrame
pred = pd.DataFrame(
    {
        'pred_weekday': pred_weekdary,
        'pred_weekend': pred_weekend,
        'score_weekday': pred_proba_weekday,
        'score_weekend': pred_proba_weekend,
    }
).assign(
    # Not usable if the row order of the predictions differs from that of the data fed to the model
    store_name=store_data['store_name'],
    year_month=tg_ym,
)
pred.head(3)

Unnamed: 0,pred_weekday,pred_weekend,score_weekday,score_weekend,store_name,year_month
0,1.0,0.0,0.769104,0.281333,あきる野店,202003
1,1.0,0.0,0.677146,0.387685,さいたま南店,202003
2,1.0,0.0,0.842885,0.463951,さいたま緑店,202003
