In [19]:
import warnings

warnings.filterwarnings('ignore')
import pandas as pd

customer = pd.read_csv('sample_code/chapter_4/customer_join.csv')
uselog_months = pd.read_csv('dump/use_log_months.csv')

機械学習用の Data 加工は当月と１ヶ月前の Data の利用履歴のみの Data を作成する。
<small>
理由
- 過去６ヶ月分の Data から予測する場合、５ヶ月以内の退会は予測できない。
- ほんの数ヶ月で辞めてしまう顧客も多いので、過去６ヶ月分の Data からの予測では意味がない。

In [20]:
year_months = list(uselog_months['年月'].unique())  # 年月を List化
uselog = pd.DataFrame()
for i in range(1, len(year_months)):
    tmp = uselog_months.loc[uselog_months['年月'] == year_months[i]]
    tmp.rename(columns={'count': 'count_0'}, inplace=True)
    tmp_before = uselog_months.loc[uselog_months['年月'] == year_months[i - 1]]
    del tmp_before['年月']
    tmp_before.rename(columns={'count': 'count_1'}, inplace=True)
    tmp = pd.merge(tmp, tmp_before, on='customer_id', how='left')
    uselog = pd.concat([uselog, tmp], ignore_index=True)

uselog.head()

Unnamed: 0,年月,customer_id,count_0,count_1
0,201805,AS002855,5,4.0
1,201805,AS009373,4,3.0
2,201805,AS015233,7,
3,201805,AS015315,3,6.0
4,201805,AS015739,5,7.0


## 退会前月の Data を作成
退会当月の Data を用いても未然に防ぐことはできない為、前月の Data を用いて予測する。
### 理由・要因
1. 退会の予測をする目的
退会を未然に防ぐこと
2. Rule
当 Jim では、月末までに退会申請を提出することで、翌月末で退会できる。

In [21]:
from dateutil.relativedelta import relativedelta

exit_customer = customer.loc[customer['is_deleted'] == 1]
exit_customer['exit_date'] = None
exit_customer['end_date'] = pd.to_datetime(exit_customer['end_date'])
for i in range(len(exit_customer)):
    exit_customer['exit_date'].iloc[i] = exit_customer['end_date'].iloc[i] - relativedelta(months=1)
exit_customer['exit_date'] = pd.to_datetime(exit_customer['exit_date'])
exit_customer['年月'] = exit_customer['exit_date'].dt.strftime('%Y%m')
uselog['年月'] = uselog['年月'].astype(str)
exit_uselog = pd.merge(uselog, exit_customer, on=['customer_id', '年月'], how='left')
print(len(uselog))
exit_uselog.head()

33851


Unnamed: 0,年月,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date
0,201805,AS002855,5,4.0,,,,,NaT,,...,,,,,,,,,,NaT
1,201805,AS009373,4,3.0,,,,,NaT,,...,,,,,,,,,,NaT
2,201805,AS015233,7,,,,,,NaT,,...,,,,,,,,,,NaT
3,201805,AS015315,3,6.0,,,,,NaT,,...,,,,,,,,,,NaT
4,201805,AS015739,5,7.0,,,,,NaT,,...,,,,,,,,,,NaT


In [22]:
# 欠損値を除外する
exit_uselog = exit_uselog.dropna(subset=['name'])
display(len(exit_uselog))
display(len(exit_uselog['customer_id'].unique()))
exit_uselog.head()

1104

1104

Unnamed: 0,年月,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date
19,201805,AS055680,3,3.0,XXXXX,C01,M,2018-03-01,2018-06-30,CA1,...,10500.0,通常,3.0,3.0,3.0,3.0,0.0,2018-06-30,3.0,2018-05-30
57,201805,AS169823,2,3.0,XX,C01,M,2017-11-01,2018-06-30,CA1,...,10500.0,通常,3.0,3.0,4.0,2.0,1.0,2018-06-30,7.0,2018-05-30
110,201805,AS305860,5,3.0,XXXX,C01,M,2017-06-01,2018-06-30,CA1,...,10500.0,通常,3.333333,3.0,5.0,2.0,0.0,2018-06-30,12.0,2018-05-30
128,201805,AS363699,5,3.0,XXXXX,C01,M,2018-02-01,2018-06-30,CA1,...,10500.0,通常,3.333333,3.0,5.0,2.0,0.0,2018-06-30,4.0,2018-05-30
147,201805,AS417696,1,4.0,XX,C03,F,2017-09-01,2018-06-30,CA1,...,6000.0,通常,2.0,1.0,4.0,1.0,0.0,2018-06-30,9.0,2018-05-30


customer data は、end_date 以外に欠損値はないはず
    => customer data の名前列が欠損値であった場合 = 退会前月と結合できない不要な Data.

Data 件数と customer_id 列の unique 数が一致している為、顧客が辞める前月の状態を表している Data ということを確認。

# 継続顧客の Data を作成

In [23]:
conti_customer = customer.loc[customer['is_deleted'] == 0]
conti_uselog = pd.merge(uselog, conti_customer, on=['customer_id'], how='left')
display(len(conti_uselog))
conti_uselog = conti_uselog.dropna(subset=['name'])
display(len(conti_uselog))

33851

27422

Data | Data 数
--- | ---
退会 Data | 1104
継続顧客 Data | 27422

比較して退会 Data が少ない為、継続顧客の Data 全てを利用する場合、不均一な Data となってしまう。
=> Sample 数の調整が必要。

### 継続顧客も、顧客あたり１件になるよう Ander sampling を実施
Data を Shuffle して、重複を削除する方法をとる。

In [24]:
conti_uselog = conti_uselog.sample(frac=1).reset_index(drop=True)  # Data の Shuffle
conti_uselog = conti_uselog.drop_duplicates(subset='customer_id')  # customer_id が重複している Data は最初の Data のみ取得
display(len(conti_uselog))
conti_uselog.head()

2842

Unnamed: 0,年月,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,class_name,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period
0,201810,AS190883,5,5.0,XXXXX,C03,M,2017-04-01,,CA1,...,ナイト,6000.0,通常,5.833333,6.0,8.0,5.0,1.0,2019-04-30,24.0
1,201812,AS741333,6,6.0,XXXXX,C01,M,2017-08-01,,CA2,...,オールタイム,10500.0,入会費半額,5.666667,6.0,7.0,4.0,1.0,2019-04-30,20.0
2,201811,HD666936,7,8.0,XXXX,C01,M,2017-11-01,,CA1,...,オールタイム,10500.0,通常,6.333333,6.5,11.0,2.0,1.0,2019-04-30,17.0
3,201811,IK228443,5,5.0,XX,C03,M,2016-03-01,,CA1,...,ナイト,6000.0,通常,5.25,5.5,7.0,3.0,1.0,2019-04-30,37.0
4,201807,TS130392,6,6.0,XXXX,C01,M,2016-06-01,,CA1,...,オールタイム,10500.0,通常,4.666667,5.0,7.0,2.0,1.0,2019-04-30,34.0


継続顧客 Data が完成したので、退会顧客 Data と縦結合する

In [25]:
predict_data = pd.concat([conti_uselog, exit_uselog], ignore_index=True)
print(len(predict_data))
predict_data.head()

3946


Unnamed: 0,年月,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date
0,201810,AS190883,5,5.0,XXXXX,C03,M,2017-04-01,,CA1,...,6000.0,通常,5.833333,6.0,8.0,5.0,1.0,2019-04-30,24.0,NaT
1,201812,AS741333,6,6.0,XXXXX,C01,M,2017-08-01,,CA2,...,10500.0,入会費半額,5.666667,6.0,7.0,4.0,1.0,2019-04-30,20.0,NaT
2,201811,HD666936,7,8.0,XXXX,C01,M,2017-11-01,,CA1,...,10500.0,通常,6.333333,6.5,11.0,2.0,1.0,2019-04-30,17.0,NaT
3,201811,IK228443,5,5.0,XX,C03,M,2016-03-01,,CA1,...,6000.0,通常,5.25,5.5,7.0,3.0,1.0,2019-04-30,37.0,NaT
4,201807,TS130392,6,6.0,XXXX,C01,M,2016-06-01,,CA1,...,10500.0,通常,4.666667,5.0,7.0,2.0,1.0,2019-04-30,34.0,NaT


時間的要素が入った在籍期間の列を追加する。

In [26]:
predict_data['period'] = 0
predict_data['now_date'] = pd.to_datetime(predict_data['年月'], format='%Y%m')
predict_data['start_date'] = pd.to_datetime(predict_data['start_date'])
for i in range(len(predict_data)):
    delta = relativedelta(predict_data['now_date'][i], predict_data['start_date'][i])
    predict_data['period'][i] = int(delta.years * 12 + delta.months)

predict_data.head()

Unnamed: 0,年月,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date,period,now_date
0,201810,AS190883,5,5.0,XXXXX,C03,M,2017-04-01,,CA1,...,5.833333,6.0,8.0,5.0,1.0,2019-04-30,24.0,NaT,18,2018-10-01
1,201812,AS741333,6,6.0,XXXXX,C01,M,2017-08-01,,CA2,...,5.666667,6.0,7.0,4.0,1.0,2019-04-30,20.0,NaT,16,2018-12-01
2,201811,HD666936,7,8.0,XXXX,C01,M,2017-11-01,,CA1,...,6.333333,6.5,11.0,2.0,1.0,2019-04-30,17.0,NaT,12,2018-11-01
3,201811,IK228443,5,5.0,XX,C03,M,2016-03-01,,CA1,...,5.25,5.5,7.0,3.0,1.0,2019-04-30,37.0,NaT,32,2018-11-01
4,201807,TS130392,6,6.0,XXXX,C01,M,2016-06-01,,CA1,...,4.666667,5.0,7.0,2.0,1.0,2019-04-30,34.0,NaT,25,2018-07-01


In [27]:
predict_data.isna().sum()

年月                      0
customer_id             0
count_0                 0
count_1               235
name                    0
class                   0
gender                  0
start_date              0
end_date             2842
campaign_id             0
is_deleted              0
class_name              0
price                   0
campaign_name           0
mean                    0
median                  0
max                     0
min                     0
routine_flg             0
calc_date               0
membership_period       0
exit_date            2842
period                  0
now_date                0
dtype: int64

継続顧客は、end_date, exit_date は持っていないので count_1 が欠損している Data を削除する。

In [28]:
predict_data = predict_data.dropna(subset=['count_1'])
predict_data.isna().sum()

年月                      0
customer_id             0
count_0                 0
count_1                 0
name                    0
class                   0
gender                  0
start_date              0
end_date             2659
campaign_id             0
is_deleted              0
class_name              0
price                   0
campaign_name           0
mean                    0
median                  0
max                     0
min                     0
routine_flg             0
calc_date               0
membership_period       0
exit_date            2659
period                  0
now_date                0
dtype: int64

予測に使用する Data を絞り込む

In [29]:
target_col = ['campaign_name', 'class_name', 'gender', 'count_1', 'routine_flg', 'period', 'is_deleted']
predict_data = predict_data[target_col]

predict_data.head()

Unnamed: 0,campaign_name,class_name,gender,count_1,routine_flg,period,is_deleted
0,通常,ナイト,M,5.0,1.0,18,0.0
1,入会費半額,オールタイム,M,6.0,1.0,16,0.0
2,通常,オールタイム,M,8.0,1.0,12,0.0
3,通常,ナイト,M,5.0,1.0,32,0.0
4,通常,オールタイム,M,6.0,1.0,25,0.0


Categorical 変数を用いて Dummy 変数を作成

In [30]:
predict_data = pd.get_dummies(predict_data)
predict_data.head()

Unnamed: 0,count_1,routine_flg,period,is_deleted,campaign_name_入会費半額,campaign_name_入会費無料,campaign_name_通常,class_name_オールタイム,class_name_デイタイム,class_name_ナイト,gender_F,gender_M
0,5.0,1.0,18,0.0,0,0,1,0,0,1,0,1
1,6.0,1.0,16,0.0,1,0,0,1,0,0,0,1
2,8.0,1.0,12,0.0,0,0,1,1,0,0,0,1
3,5.0,1.0,32,0.0,0,0,1,0,0,1,0,1
4,6.0,1.0,25,0.0,0,0,1,1,0,0,0,1


不要な Dummy 変数の削除

In [31]:
del predict_data['campaign_name_通常']
del predict_data['class_name_ナイト']
del predict_data['gender_M']
predict_data.head()

Unnamed: 0,count_1,routine_flg,period,is_deleted,campaign_name_入会費半額,campaign_name_入会費無料,class_name_オールタイム,class_name_デイタイム,gender_F
0,5.0,1.0,18,0.0,0,0,0,0,0
1,6.0,1.0,16,0.0,1,0,1,0,0
2,8.0,1.0,12,0.0,0,0,1,0,0
3,5.0,1.0,32,0.0,0,0,0,0,0
4,6.0,1.0,25,0.0,0,0,1,0,0


機械学習 Model を構築する準備が完了

退会予測 Model を作成。
※ 教師あり学習: 決定木

In [32]:
from sklearn.tree import DecisionTreeClassifier  # 決定木を使用するための Library を Import
import sklearn.model_selection  # 学習 Data と評価 Data の分割に必要な Library

exit = predict_data.loc[predict_data['is_deleted'] == 1]
conti = predict_data.loc[predict_data['is_deleted'] == 0].sample(len(exit))  # 退会 Data と継続 Data の数を Random に抽出して合わせている。

X = pd.concat([exit, conti], ignore_index=True)
y = X['is_deleted']  # 目的変数(予測に使う変数)
del X['is_deleted']  # 説明変数(予測したい変数)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y)  # 学習 Data と評価 Data の分割

model = DecisionTreeClassifier(random_state=0)  # Model を定義
model.fit(X_train, y_train)  # 学習用 Data を指定し、Model の構築
y_test_pred = model.predict(X_test)  # 構築した Model を用いて評価 Data の予測
print('1: 退会 / 0: 継続')
print(y_test_pred)

1: 退会 / 0: 継続
[1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0.
 0. 1. 1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1.
 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 1. 1. 1. 1. 0. 0. 1. 1. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 1. 1.
 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1.
 0. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1.
 0. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1.
 1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0.
 0. 0. 0. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1.
 1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 1. 1. 1. 0.
 0. 0. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1. 0.
 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 1.
 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1. 1. 

In [33]:
print('正解との比較')
results_test = pd.DataFrame({'y_test': y_test, 'y_pred': y_test_pred})
results_test.head()

正解との比較


Unnamed: 0,y_test,y_pred
513,1.0,1.0
2096,0.0,0.0
976,1.0,1.0
1169,0.0,0.0
59,1.0,1.0


Model の Tuning

In [34]:
# 正解率を算出
# 正解 Data 数(y_test 列と y_pred 列が一致している) / 全体 Data 数
correct = len(results_test.loc[results_test['y_test'] == results_test['y_pred']])
data_count = len(results_test)
score_test = correct / data_count
print(score_test)

0.8878326996197718


In [35]:
# 学習用 Data で予測した精度と評価用 Data で予測した精度を比較
print(model.score(X_test, y_test))
print(model.score(X_train, y_train))

0.8878326996197718
0.9790874524714829


Data | 正解率
--- | ---
評価用 Data | 89%
学習用 Data | 98%

学習用 Data に適合しすぎており**過学習傾向**にあり。

In [37]:
X = pd.concat([exit, conti], ignore_index=True)
y = X['is_deleted']
del X['is_deleted']
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y)

model = DecisionTreeClassifier(random_state=0, max_depth=4) # 決定木の深さを max_depth で調整
model.fit(X_train, y_train)
print(model.score(X_test, y_test))
print(model.score(X_train, y_train))

0.9258555133079848
0.917617237008872


Model を簡易化するおｋとで、未知の Data にも対応できる Model が完成。
