# 5章 顧客の退会を予測する１０本ノック

引き続き、スポーツジムの会員データを使って顧客の行動を分析していきます。  
3章では顧客の全体像を把握し、4章では数ヶ月利用している顧客の来月の利用回数の予測を行いました。  
ここでは、教師あり学習の分類を用いて、顧客の退会予測を取り扱います。

### ノック41：データを読み込んで利用データを整形しよう

In [262]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning,
# deprecation notices) that can point at real problems in the cells that follow.
warnings.filterwarnings('ignore')

In [263]:
import pandas as pd
# Customer master data; later cells read is_deleted, start_date, end_date,
# name and the class/campaign columns from it.
customer = pd.read_csv('customer_join.csv')
# Monthly usage log; later cells read the columns 年月, customer_id and count.
uselog_months = pd.read_csv('use_log_months.csv')

In [264]:
# Distinct 年月 (year-month) values, in the order they first appear in uselog_months.
# NOTE(review): the code that pairs each month with the previous one assumes
# this order is chronological -- confirm the CSV is sorted by 年月.
year_months = list(uselog_months['年月'].unique())
year_months

[201804,
 201805,
 201806,
 201807,
 201808,
 201809,
 201810,
 201811,
 201812,
 201901,
 201902,
 201903]

In [265]:
# Aggregate usage counts: one row per (month, customer) holding that month's
# visit count (count_0) next to the previous month's count (count_1).
monthly_frames = []
# Walk over consecutive month pairs; the first month has no predecessor and
# therefore produces no rows of its own.
for prev_month, this_month in zip(year_months, year_months[1:]):
    # rename()/drop() return new frames, so the slices of uselog_months are
    # never modified in place (no SettingWithCopy hazard).
    current = (
        uselog_months.loc[uselog_months['年月'] == this_month]
        .rename(columns = {'count': 'count_0'})
    )
    previous = (
        uselog_months.loc[uselog_months['年月'] == prev_month]
        .drop(columns = '年月')
        .rename(columns = {'count': 'count_1'})
    )
    # Left join: customers without a record in the previous month keep their
    # row and get NaN in count_1.
    monthly_frames.append(pd.merge(current, previous, on = 'customer_id', how = 'left'))

# Concatenate once at the end instead of growing a DataFrame inside the loop;
# fall back to an empty frame when there are fewer than two months.
if monthly_frames:
    uselog = pd.concat(monthly_frames, ignore_index = True)
else:
    uselog = pd.DataFrame()

uselog.tail()

Unnamed: 0,年月,customer_id,count_0,count_1
33846,201903,TS995853,8,11.0
33847,201903,TS998593,8,7.0
33848,201903,TS999079,3,2.0
33849,201903,TS999231,6,6.0
33850,201903,TS999855,4,4.0


### ノック42：退会前月の退会顧客データを作成しよう

In [266]:
# relativedelta is not needed in this cell any more, but the import is kept
# because a later cell (period calculation) uses it.
from dateutil.relativedelta import relativedelta

# Churned customers only. .copy() gives an independent frame, so the column
# assignments below do not write into a slice of `customer`.
exit_customer = customer.loc[customer["is_deleted"] == 1].copy()
exit_customer["end_date"] = pd.to_datetime(exit_customer["end_date"])
# exit_date = one calendar month before end_date, computed for the whole
# column at once instead of a per-row chained assignment (which does not
# update the frame under pandas Copy-on-Write).
exit_customer["exit_date"] = exit_customer["end_date"] - pd.DateOffset(months=1)
# Year-month key of the month before the membership ended, as a 'YYYYMM' string.
exit_customer["年月"] = exit_customer["exit_date"].dt.strftime("%Y%m")
# The usage log's 年月 is converted to str so both merge keys have the same type.
uselog["年月"] = uselog["年月"].astype(str)
# Left join keeps every usage row; only rows matching (customer_id, 年月) of a
# churned customer receive customer columns, the rest get NaN/NaT.
exit_uselog = pd.merge(uselog, exit_customer, on=["customer_id", "年月"], how="left")
print(len(uselog))
exit_uselog.head()

33851


Unnamed: 0,年月,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date
0,201805,AS002855,5,4.0,,,,,NaT,,...,,,,,,,,,,NaT
1,201805,AS009373,4,3.0,,,,,NaT,,...,,,,,,,,,,NaT
2,201805,AS015233,7,,,,,,NaT,,...,,,,,,,,,,NaT
3,201805,AS015315,3,6.0,,,,,NaT,,...,,,,,,,,,,NaT
4,201805,AS015739,5,7.0,,,,,NaT,,...,,,,,,,,,,NaT


In [267]:
# Keep only the usage rows that matched a churned customer in the left merge;
# unmatched rows carry NaN in `name`.
matched = exit_uselog['name'].notna()
exit_uselog = exit_uselog[matched]
# Row count, then the number of distinct customers among those rows.
print(exit_uselog.shape[0])
print(exit_uselog['customer_id'].nunique(dropna = False))
exit_uselog.head()

1104
1104


Unnamed: 0,年月,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date
19,201805,AS055680,3,3.0,XXXXX,C01,M,2018-03-01,2018-06-30,CA1,...,10500.0,通常,3.0,3.0,3.0,3.0,0.0,2018-06-30,3.0,2018-05-30
57,201805,AS169823,2,3.0,XX,C01,M,2017-11-01,2018-06-30,CA1,...,10500.0,通常,3.0,3.0,4.0,2.0,1.0,2018-06-30,7.0,2018-05-30
110,201805,AS305860,5,3.0,XXXX,C01,M,2017-06-01,2018-06-30,CA1,...,10500.0,通常,3.333333,3.0,5.0,2.0,0.0,2018-06-30,12.0,2018-05-30
128,201805,AS363699,5,3.0,XXXXX,C01,M,2018-02-01,2018-06-30,CA1,...,10500.0,通常,3.333333,3.0,5.0,2.0,0.0,2018-06-30,4.0,2018-05-30
147,201805,AS417696,1,4.0,XX,C03,F,2017-09-01,2018-06-30,CA1,...,6000.0,通常,2.0,1.0,4.0,1.0,0.0,2018-06-30,9.0,2018-05-30


### ノック43：継続顧客のデータを作成しよう

In [268]:
# Customers who are still members (is_deleted == 0).
still_member = customer['is_deleted'] == 0
conti_customer = customer.loc[still_member]
# Attach customer attributes to every usage row; rows of other customers get NaN.
conti_uselog = uselog.merge(conti_customer, on = ['customer_id'], how = 'left')
print(conti_uselog.shape[0])
# Discard the rows that did not match a continuing customer.
conti_uselog = conti_uselog[conti_uselog['name'].notna()]
print(conti_uselog.shape[0])

33851
27422


In [269]:
# Shuffle the rows, then keep only the first row seen per customer, which
# leaves one randomly chosen month for each continuing customer.
# NOTE(review): sample() is called without random_state, so the selected
# months differ from run to run.
shuffled = conti_uselog.sample(frac = 1).reset_index(drop = True)
conti_uselog = shuffled.drop_duplicates(subset = 'customer_id', keep = 'first')
print(conti_uselog.shape[0])
conti_uselog.head()

2842


Unnamed: 0,年月,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,class_name,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period
0,201901,AS946582,4,4.0,XXXXXX,C02,F,2016-01-01,,CA1,...,デイタイム,7500.0,通常,4.916667,5.0,7.0,2.0,1.0,2019-04-30,39.0
1,201810,AS790905,5,4.0,XXX,C03,M,2016-01-01,,CA1,...,ナイト,6000.0,通常,4.166667,4.0,7.0,2.0,1.0,2019-04-30,39.0
2,201808,HD745359,8,7.0,XXXXXX,C02,M,2018-03-01,,CA1,...,デイタイム,7500.0,通常,7.916667,7.5,10.0,6.0,1.0,2019-04-30,13.0
3,201806,OA776086,3,2.0,XXX,C03,F,2015-06-01,,CA1,...,ナイト,6000.0,通常,3.5,3.5,5.0,2.0,1.0,2019-04-30,46.0
4,201812,HI963277,6,3.0,XXX,C03,M,2015-08-01,,CA1,...,ナイト,6000.0,通常,3.833333,4.0,6.0,2.0,1.0,2019-04-30,44.0


In [270]:
# Stack continuing-customer rows on top of churned-customer rows and renumber
# the index from 0; columns missing on one side (e.g. exit_date) become NaN/NaT.
frames = [conti_uselog, exit_uselog]
predict_data = pd.concat(frames, ignore_index = True)
print(predict_data.shape[0])
predict_data.head()

3946


Unnamed: 0,年月,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date
0,201901,AS946582,4,4.0,XXXXXX,C02,F,2016-01-01,,CA1,...,7500.0,通常,4.916667,5.0,7.0,2.0,1.0,2019-04-30,39.0,NaT
1,201810,AS790905,5,4.0,XXX,C03,M,2016-01-01,,CA1,...,6000.0,通常,4.166667,4.0,7.0,2.0,1.0,2019-04-30,39.0,NaT
2,201808,HD745359,8,7.0,XXXXXX,C02,M,2018-03-01,,CA1,...,7500.0,通常,7.916667,7.5,10.0,6.0,1.0,2019-04-30,13.0,NaT
3,201806,OA776086,3,2.0,XXX,C03,F,2015-06-01,,CA1,...,6000.0,通常,3.5,3.5,5.0,2.0,1.0,2019-04-30,46.0,NaT
4,201812,HI963277,6,3.0,XXX,C03,M,2015-08-01,,CA1,...,6000.0,通常,3.833333,4.0,6.0,2.0,1.0,2019-04-30,44.0,NaT


### ノック44：予測する月の在籍期間を作成しよう

In [271]:
# Membership length in whole months as of the month being predicted.
# now_date is the first day of the row's 年月; start_date is the join date.
now_date = pd.to_datetime(predict_data['年月'], format = '%Y%m')
start_date = pd.to_datetime(predict_data['start_date'])
# Build the whole column in one assignment. This avoids the chained
# `predict_data['period'][i] = ...` write (which does not update the frame
# under pandas Copy-on-Write) and does not depend on the index being 0..n-1.
# relativedelta is kept so the month arithmetic is unchanged.
predict_data['period'] = [
    delta.years * 12 + delta.months
    for delta in (
        relativedelta(now, start) for now, start in zip(now_date, start_date)
    )
]
predict_data['now_date'] = now_date
predict_data['start_date'] = start_date
predict_data.head()

Unnamed: 0,年月,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date,period,now_date
0,201901,AS946582,4,4.0,XXXXXX,C02,F,2016-01-01,,CA1,...,4.916667,5.0,7.0,2.0,1.0,2019-04-30,39.0,NaT,36,2019-01-01
1,201810,AS790905,5,4.0,XXX,C03,M,2016-01-01,,CA1,...,4.166667,4.0,7.0,2.0,1.0,2019-04-30,39.0,NaT,33,2018-10-01
2,201808,HD745359,8,7.0,XXXXXX,C02,M,2018-03-01,,CA1,...,7.916667,7.5,10.0,6.0,1.0,2019-04-30,13.0,NaT,5,2018-08-01
3,201806,OA776086,3,2.0,XXX,C03,F,2015-06-01,,CA1,...,3.5,3.5,5.0,2.0,1.0,2019-04-30,46.0,NaT,36,2018-06-01
4,201812,HI963277,6,3.0,XXX,C03,M,2015-08-01,,CA1,...,3.833333,4.0,6.0,2.0,1.0,2019-04-30,44.0,NaT,40,2018-12-01


### ノック45：欠損値を除去しよう

In [272]:
# isnull and isna are the same.
# Both exist by pandas' design (pandas is modelled on R, and R has both).
# Count the missing values in each column.
predict_data.isna().sum()

年月                      0
customer_id             0
count_0                 0
count_1               262
name                    0
class                   0
gender                  0
start_date              0
end_date             2842
campaign_id             0
is_deleted              0
class_name              0
price                   0
campaign_name           0
mean                    0
median                  0
max                     0
min                     0
routine_flg             0
calc_date               0
membership_period       0
exit_date            2842
period                  0
now_date                0
dtype: int64

In [273]:
# Drop rows without a previous-month count (count_1), then re-check the
# per-column missing-value totals.
has_previous_count = predict_data['count_1'].notna()
predict_data = predict_data[has_previous_count]
predict_data.isna().sum()

年月                      0
customer_id             0
count_0                 0
count_1                 0
name                    0
class                   0
gender                  0
start_date              0
end_date             2632
campaign_id             0
is_deleted              0
class_name              0
price                   0
campaign_name           0
mean                    0
median                  0
max                     0
min                     0
routine_flg             0
calc_date               0
membership_period       0
exit_date            2632
period                  0
now_date                0
dtype: int64

### ノック46：文字列型の変数を処理できるように整形しよう

In [274]:
# Columns used for modelling: the explanatory variables plus the label
# (is_deleted). Everything else is discarded.
target_col = [
    'campaign_name',
    'class_name',
    'gender',
    'count_1',
    'routine_flg',
    'period',
    'is_deleted',
]
predict_data = predict_data.loc[:, target_col]
predict_data.head()

Unnamed: 0,campaign_name,class_name,gender,count_1,routine_flg,period,is_deleted
0,通常,デイタイム,F,4.0,1.0,36,0.0
1,通常,ナイト,M,4.0,1.0,33,0.0
2,通常,デイタイム,M,7.0,1.0,5,0.0
3,通常,ナイト,F,2.0,1.0,36,0.0
4,通常,ナイト,M,3.0,1.0,40,0.0


In [275]:
# One-hot encode the string columns (campaign_name, class_name, gender);
# numeric columns pass through unchanged.
# dtype is pinned so the dummy columns are 0/1 integers on every pandas
# version: the default was uint8 before pandas 2.0 and is bool from 2.0 on.
predict_data = pd.get_dummies(predict_data, dtype = 'uint8')
predict_data.head()

Unnamed: 0,count_1,routine_flg,period,is_deleted,campaign_name_入会費半額,campaign_name_入会費無料,campaign_name_通常,class_name_オールタイム,class_name_デイタイム,class_name_ナイト,gender_F,gender_M
0,4.0,1.0,36,0.0,0,0,1,0,1,0,1,0
1,4.0,1.0,33,0.0,0,0,1,0,0,1,0,1
2,7.0,1.0,5,0.0,0,0,1,0,1,0,0,1
3,2.0,1.0,36,0.0,0,0,1,0,0,1,1,0
4,3.0,1.0,40,0.0,0,0,1,0,0,1,0,1


In [276]:
# Remove one dummy per categorical variable; the removed category is then
# represented by all remaining dummies of that variable being 0.
baseline_dummies = ['campaign_name_通常', 'class_name_ナイト', 'gender_M']
predict_data = predict_data.drop(columns = baseline_dummies)
predict_data.head()

Unnamed: 0,count_1,routine_flg,period,is_deleted,campaign_name_入会費半額,campaign_name_入会費無料,class_name_オールタイム,class_name_デイタイム,gender_F
0,4.0,1.0,36,0.0,0,0,0,1,1
1,4.0,1.0,33,0.0,0,0,0,0,0
2,7.0,1.0,5,0.0,0,0,0,1,0
3,2.0,1.0,36,0.0,0,0,0,0,1
4,3.0,1.0,40,0.0,0,0,0,0,0


### ノック47：決定木を用いて退会予測モデルを作成してみよう

In [277]:
from sklearn.tree import DecisionTreeClassifier
import sklearn.model_selection

# Balance the classes: take every churned row and an equally sized random
# sample of continuing rows.
# NOTE: `exit` shadows the builtin of the same name; the name is kept because
# a later cell reads `exit` and `conti` again.
churned_mask = predict_data['is_deleted'] == 1
exit = predict_data.loc[churned_mask]
conti = predict_data.loc[predict_data['is_deleted'] == 0].sample(len(exit))

# Features X and label y; pop() removes is_deleted from X and returns it.
X = pd.concat([exit, conti], ignore_index = True)
y = X.pop('is_deleted')
# Random train/test split with scikit-learn's default proportions.
split = sklearn.model_selection.train_test_split(X, y)
X_train, X_test, y_train, y_test = split

In [278]:
# Fit an unconstrained decision tree on the training split (fit() returns the
# estimator itself) and predict the held-out rows.
model = DecisionTreeClassifier(random_state = 0).fit(X_train, y_train)
y_pred = model.predict(X_test)
print(y_pred)

[1. 1. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 1. 0. 1.
 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1.
 0. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 1. 1. 1. 0. 1.
 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 1. 1. 1. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 1.
 1. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1.
 1. 0. 1. 1. 0. 0. 1. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1.
 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0.
 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0.
 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 0. 0. 1.
 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0.
 1. 1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 1. 1. 1. 0. 1. 1. 0. 0. 0.
 0. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 0. 0. 1. 1. 1. 0. 1.
 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0.

In [279]:
# Side-by-side table of true labels and predictions; the row index comes from
# y_test, and y_pred is attached positionally.
results_test = pd.DataFrame({'y_test': y_test})
results_test['y_pred'] = y_pred
results_test.head()

Unnamed: 0,y_test,y_pred
657,1.0,1.0
981,1.0,1.0
1785,0.0,1.0
1048,1.0,1.0
523,1.0,1.0


### ノック48：予測モデルの評価を行ない、モデルのチューニングをしてみよう

In [280]:
# Accuracy by hand: the number of rows where the prediction equals the true
# label, divided by the total number of rows.
hits = results_test['y_test'] == results_test['y_pred']
correct = int(hits.sum())
data_count = len(results_test)
score_test = correct / data_count
print(score_test)

0.8973384030418251


In [281]:
# Accuracy on the test split first, then on the training split; a large gap
# between the two indicates overfitting.
for features, labels in ((X_test, y_test), (X_train, y_train)):
    print(model.score(features, labels))

0.8973384030418251
0.9778200253485425


In [282]:
# Rebuild the balanced data set and re-split it, then fit a depth-limited
# tree to reduce the train/test accuracy gap.
X = pd.concat([exit, conti], ignore_index = True)
y = X['is_deleted']
del X['is_deleted']
# Use the fully qualified name: only `import sklearn.model_selection` is in
# scope, so a bare `train_test_split` raises NameError on a fresh kernel.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y)

# max_depth = 5 caps the depth of the tree.
model = DecisionTreeClassifier(random_state = 0, max_depth = 5)
model.fit(X_train, y_train)

# Test accuracy first, training accuracy second.
print(model.score(X_test, y_test))
print(model.score(X_train, y_train))

0.9201520912547528
0.9302915082382763


### ノック49：モデルに寄与している変数を確認しよう

In [283]:
# Pair each feature column with the fitted tree's feature importance.
# NOTE: the column is labelled 'coefficient', but the values come from
# model.feature_importances_.
importance_columns = {
    'feature_names': X.columns,
    'coefficient': model.feature_importances_,
}
importance = pd.DataFrame(importance_columns)
importance

Unnamed: 0,feature_names,coefficient
0,count_1,0.38854
1,routine_flg,0.123109
2,period,0.487396
3,campaign_name_入会費半額,0.0
4,campaign_name_入会費無料,0.0
5,class_name_オールタイム,0.0
6,class_name_デイタイム,0.000837
7,gender_F,0.000118


### ノック50：顧客の退会を予測しよう

In [300]:
# Hypothetical customer to score.
# Numeric inputs: previous-month visit count, routine flag, months of membership.
count_1, routine_flg, period = 3, 1, 13
# Categorical inputs, given as the raw labels used in the customer data.
campaign_name, class_name, gender = "入会費無料", "オールタイム", "M"

In [301]:
# Translate the categorical inputs into the dummy-variable layout of the
# training data. The baseline categories whose dummy columns were removed
# (通常 / ナイト / M) are encoded as all zeros.
campaign_name_codes = {
    "入会費半額": [1, 0],
    "入会費無料": [0, 1],
    "通常": [0, 0],
}
class_name_codes = {
    "オールタイム": [1, 0],
    "デイタイム": [0, 1],
    "ナイト": [0, 0],
}
gender_codes = {
    "F": [1],
    "M": [0],
}

# Fail loudly on an unknown label. With a bare if/elif chain an unrecognised
# value would leave the *_list variable undefined, or silently keep the value
# from a previous run of this cell.
for field, value, codes in (
    ("campaign_name", campaign_name, campaign_name_codes),
    ("class_name", class_name, class_name_codes),
    ("gender", gender, gender_codes),
):
    if value not in codes:
        raise ValueError("unknown {}: {!r}".format(field, value))

# Copy the lists so the lookup tables are never mutated by later code.
campaign_name_list = list(campaign_name_codes[campaign_name])
class_name_list = list(class_name_codes[class_name])
gender_list = list(gender_codes[gender])

In [302]:
# Assemble the feature vector in the same column order as the training data:
# count_1, routine_flg, period, campaign dummies, class dummies, gender dummy.
input_data = [
    count_1,
    routine_flg,
    period,
    *campaign_name_list,
    *class_name_list,
    *gender_list,
]

In [303]:
# The model expects a 2-D input, so the single sample is wrapped in a list.
single_sample = [input_data]
# Predicted class, then the per-class probabilities.
print(model.predict(single_sample))
print(model.predict_proba(single_sample))

[1.]
[[0.3 0.7]]
