In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_formats = {'jpg', 'retina'}

In [None]:
# Apply seaborn's "darkgrid" style, then switch to the "paper" plotting context.
sns.set(style="darkgrid")
sns.set_context("paper")

In [None]:
# Load the bank-marketing training CSV (path is relative to the current working directory).
df_bank = pd.read_csv("data/bank_marketing_train.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
df_bank.head()

In [None]:
# Count the missing values in each column.
df_bank.isna().sum()

In [None]:
# Bucket age into decades: floor-divide by 10 so that e.g. 30-39 all become 3.
df_bank["age"] = df_bank["age"].floordiv(10)
df_bank.head()

In [None]:
# Turn the age bucket into a categorical variable by storing it with object dtype.
df_bank = df_bank.astype({"age": object})
df_bank.head()

In [None]:
# Frequency of each distinct `previous` value.
df_bank['previous'].value_counts()

In [None]:
# Few rows have `previous` of 1 or more, so instead of keeping it as a
# quantitative 0, 1, 2, ... count it is converted to a qualitative
# "previously contacted: yes / no" flag.
#
# The flag is derived with a `> 0` comparison rather than an explicit 0-7 lookup
# table: with a lookup, any count above 7 would silently become NaN and the row
# would end up with neither a previous_yes nor a previous_no dummy.
# Genuinely missing values are kept as NaN rather than being reported as "no".
previous_count = df_bank["previous"]
df_bank["previous"] = (
    previous_count.gt(0)
    .map({True: "yes", False: "no"})
    .where(previous_count.notna())
)
df_bank.head()

In [None]:
# Convert all categorical variables to dummy (one-hot) columns in a single call.
df_bank = pd.get_dummies(df_bank)

In [None]:
# Preview the frame after dummy encoding.
df_bank.head()

In [None]:
# Dummy encoding split the target y into two columns; only "yes" is of interest,
# so y_no is removed. previous_no is removed for the same reason.
redundant_cols = ['y_no', 'previous_no']
df_bank = df_bank.drop(columns=redundant_cols)

In [None]:
# Preview the frame once the redundant dummy columns are gone.
df_bank.head()

In [None]:
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_curve
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import ClusterCentroids

In [None]:
# Columns kept out of the feature matrix: the target itself plus a hand-picked
# set of dummy columns (the *_unknown levels, every day_of_week dummy, and a few
# single levels such as age_9, contact_cellular and poutcome_nonexistent).
exclude_cols = [
    "y_yes", 'age_9', 'job_unknown', 'marital_unknown', 'education_unknown',
    'day_of_week_fri', 'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue', 'day_of_week_wed',
    'default_unknown', 'housing_unknown', 'loan_unknown', 'contact_cellular', 'poutcome_nonexistent',
]

# Everything else, in the frame's own column order, is a feature.
feature_cols = [name for name in df_bank.columns if name not in exclude_cols]

y = df_bank["y_yes"]       # target variable
X = df_bank[feature_cols]  # explanatory variables

In [None]:
# Display the list of selected feature column names.
feature_cols

In [None]:
# Preview only the selected feature columns.
df_bank[feature_cols].head()

In [None]:
# Number of rows per target value in the full dataset.
class_counts = Counter(y)
print('Original dataset shape %s' % (class_counts,))

In [None]:
# Split into 70% training data (X_train, y_train) and 30% held-out data (X_val, y_val).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=1234
)
print('Sampled dataset shape %s' % (Counter(y_train),))

In [None]:
# Under-sample the training split with ClusterCentroids.
# `fit_resample` is used instead of the old `fit_sample` alias, which has been
# removed from imbalanced-learn and raises AttributeError on current releases.
X_cc, y_cc = ClusterCentroids(random_state=1234).fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_cc))

In [None]:
# Over-sample with SMOTE so that the positive and negative classes in
# X_train / y_train end up with the same number of rows.
# `fit_resample` is used instead of the old `fit_sample` alias, which has been
# removed from imbalanced-learn and raises AttributeError on current releases.
sm = SMOTE(random_state=1234)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print('Resampled dataset shape %s' % Counter(y_train_res))

In [None]:
# Combined resampling with SMOTEENN (over-sampling and under-sampling in one step).
# `fit_resample` is used instead of the old `fit_sample` alias, which has been
# removed from imbalanced-learn and raises AttributeError on current releases.
X_smoteenn, y_smoteenn = SMOTEENN(random_state=1234).fit_resample(X_train, y_train)

print('Resampled dataset shape %s' % Counter(y_smoteenn))

## ロジスティック回帰

In [None]:
# Logistic regression fitted on the under-sampled training data.
lr3 = LogisticRegression(random_state=1234)
lr3.fit(X_cc, y_cc)
y_pred_lr3 = lr3.predict(X_val)

# Accuracy and confusion matrix on the held-out split.
cm_lr3 = confusion_matrix(y_val, y_pred_lr3)
acc_lr3 = accuracy_score(y_val, y_pred_lr3)
print('Confusion matrix(test):\n{}'.format(cm_lr3))
print('Accuracy(test) : %.5f' % acc_lr3)

# Precision, recall and their harmonic mean, derived from the matrix cells.
tn, fp, fn, tp = cm_lr3.ravel()
precision_lr3 = tp / (tp + fp)
recall_lr3 = tp / (tp + fn)
print('precision : %.4f' % precision_lr3)
print('recall    : %.4f' % recall_lr3)
print("f-measure : %.4f" % (2 * precision_lr3 * recall_lr3 / (precision_lr3 + recall_lr3)))

In [None]:
# ROC curve for the model trained on under-sampled data.
# The import stays in this cell so that `auc` is the sklearn function even if
# another cell has rebound that name to a number.
from sklearn.metrics import roc_curve, auc

# Rank the held-out rows by predicted probability of the second class column
# (the positive class for a 0/1 target). Passing hard 0/1 predictions instead
# would give roc_curve a single operating point, so the "curve" would be two
# straight segments and the area would not reflect the model's ranking quality.
y_score_lr3 = lr3.predict_proba(X_val)[:, 1]

# False positive rate and true positive rate at every threshold.
fpr, tpr, thresholds = roc_curve(y_val, y_score_lr3, drop_intermediate=False)

# Area under the curve; stored under its own name so the `auc` function is not shadowed.
roc_auc_lr3 = auc(fpr, tpr)

# Plot the curve together with the chance diagonal.
plt.plot(fpr, tpr, color="purple", label="ROC curve (area = %.3f)" % roc_auc_lr3)
plt.plot([0,1],[0,1], color="black", linestyle="--")

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="best")

In [None]:
# Fit a logistic regression on the over-sampled (SMOTE) training data.
lr = LogisticRegression(random_state=1234)
lr.fit(X_train_res, y_train_res)

In [None]:
# Predict class labels for the held-out split.
y_pred_lr = lr.predict(X_val)

In [None]:
# Accuracy and confusion matrix of the over-sampled model on the held-out split.
cm_lr = confusion_matrix(y_val, y_pred_lr)
acc_lr = accuracy_score(y_val, y_pred_lr)
print('Confusion matrix(test):\n{}'.format(cm_lr))
print('Accuracy(test) : %.5f' % acc_lr)

In [None]:
# Precision, recall and f-measure of the over-sampled model.
tn, fp, fn, tp = confusion_matrix(y_val, y_pred_lr).ravel()
precision_lr = tp / (tp + fp)
recall_lr = tp / (tp + fn)
print('precision : %.4f' % precision_lr)
print('recall    : %.4f' % recall_lr)
print("f-measure : %.4f" % (2 * precision_lr * recall_lr / (precision_lr + recall_lr)))

In [None]:
# ROC curve for the model trained on over-sampled (SMOTE) data.
# The import stays in this cell so that `auc` is the sklearn function even if
# another cell has rebound that name to a number.
from sklearn.metrics import roc_curve, auc

# Rank the held-out rows by predicted probability of the second class column
# (the positive class for a 0/1 target). Passing hard 0/1 predictions instead
# would give roc_curve a single operating point, so the "curve" would be two
# straight segments and the area would not reflect the model's ranking quality.
y_score_lr = lr.predict_proba(X_val)[:, 1]

# False positive rate and true positive rate over the thresholds.
fpr, tpr, thresholds = roc_curve(y_val, y_score_lr)

# Area under the curve; stored under its own name so the `auc` function is not shadowed.
roc_auc_lr = auc(fpr, tpr)

# Plot the curve together with the chance diagonal.
plt.plot(fpr, tpr, color="purple", label="ROC curve (area = %.3f)" % roc_auc_lr)
plt.plot([0,1],[0,1], color="black", linestyle="--")

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="best")

In [None]:
# Fit a logistic regression on the SMOTEENN-resampled training data and
# predict class labels for the held-out split.
lr2 = LogisticRegression(random_state=1234)
lr2.fit(X_smoteenn, y_smoteenn)
y_pred_lr2 = lr2.predict(X_val)

In [None]:
# Accuracy and confusion matrix of the SMOTEENN model on the held-out split.
cm_lr2 = confusion_matrix(y_val, y_pred_lr2)
acc_lr2 = accuracy_score(y_val, y_pred_lr2)
print('Confusion matrix(test):\n{}'.format(cm_lr2))
print('Accuracy(test) : %.5f' % acc_lr2)

In [None]:
# Precision, recall and f-measure of the SMOTEENN model.
tn, fp, fn, tp = confusion_matrix(y_val, y_pred_lr2).ravel()
precision_lr2 = tp / (tp + fp)
recall_lr2 = tp / (tp + fn)
print('precision : %.4f' % precision_lr2)
print('recall    : %.4f' % recall_lr2)
print("f-measure : %.4f" % (2 * precision_lr2 * recall_lr2 / (precision_lr2 + recall_lr2)))

In [None]:
# ROC curve for the model trained on SMOTEENN-resampled data.
# The import stays in this cell so that `auc` is the sklearn function even if
# another cell has rebound that name to a number.
from sklearn.metrics import roc_curve, auc

# Rank the held-out rows by predicted probability of the second class column
# (the positive class for a 0/1 target). Passing hard 0/1 predictions instead
# would give roc_curve a single operating point, so the "curve" would be two
# straight segments and the area would not reflect the model's ranking quality.
y_score_lr2 = lr2.predict_proba(X_val)[:, 1]

# False positive rate and true positive rate at every threshold.
fpr, tpr, thresholds = roc_curve(y_val, y_score_lr2, drop_intermediate=False)

# Area under the curve; stored under its own name so the `auc` function is not shadowed.
roc_auc_lr2 = auc(fpr, tpr)

# Plot the curve together with the chance diagonal.
plt.plot(fpr, tpr, color="purple", label="ROC curve (area = %.3f)" % roc_auc_lr2)
plt.plot([0,1],[0,1], color="black", linestyle="--")

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="best")

In [None]:
# Class probabilities from the SMOTEENN-trained model on the held-out split.
prob = lr2.predict_proba(X_val)
# Precision / recall over the thresholds, scored on the second probability column
# with label 1 treated as the positive class.
precision, recall, thresholds = precision_recall_curve(y_val, prob[:,1], pos_label=1)

In [None]:
# Tabulate precision and recall against their threshold. The last element of
# precision and recall is dropped so that all three columns have equal length.
report_columns = {
    'precision': precision[:-1],
    'recall': recall[:-1],
    'threshold': thresholds,
}
precision_recall_report = pd.DataFrame(report_columns)

# Show only the rows whose threshold lies strictly between 0.1 and 0.13.
threshold_col = precision_recall_report["threshold"]
precision_recall_report[(threshold_col > 0.1) & (threshold_col < 0.13)]

## テストデータへの適用

In [None]:
# Load the bank-marketing test CSV (path is relative to the current working directory).
df_bank_test = pd.read_csv("data/bank_marketing_test.csv")

In [None]:
# Preview the first rows of the raw test frame.
df_bank_test.head()

In [None]:
# Apply to the test frame the same preprocessing steps that were applied to the
# training frame.
df_bank_test["age"] = df_bank_test["age"] // 10            # decade bucket
df_bank_test['age'] = df_bank_test['age'].astype(object)   # treat the bucket as categorical

# "Previously contacted" flag. A `> 0` comparison is used rather than an explicit
# 0-7 lookup table, which would turn any count above 7 into NaN and leave the row
# without a previous_yes / previous_no dummy. Genuinely missing values stay NaN.
previous_count_test = df_bank_test["previous"]
df_bank_test["previous"] = (
    previous_count_test.gt(0)
    .map({True: "yes", False: "no"})
    .where(previous_count_test.notna())
)

df_bank_test = pd.get_dummies(df_bank_test)
df_bank_test.drop(columns=['y_no', 'previous_no'], inplace=True)

# Use exactly the columns, in exactly the order, that the models were fitted on.
# Re-deriving the list from the test frame is unsafe: a category level that is
# absent from (or only present in) the test file changes the set or order of
# dummy columns, and the model would then be fed misaligned features.
feature_cols = list(X_train.columns)

y = df_bank_test["y_yes"]  # target variable
# Dummy columns missing from the test frame are added as all-zero; columns the
# model never saw are left out.
X = df_bank_test.reindex(columns=feature_cols, fill_value=0)  # explanatory variables

In [None]:
# Predict class labels for the test features with the SMOTEENN-trained model.
y_pred_lr_test = lr2.predict(X)

In [None]:
# Accuracy and confusion matrix on the test data.
cm_test = confusion_matrix(y, y_pred_lr_test)
acc_test = accuracy_score(y, y_pred_lr_test)
print('Confusion matrix(test):\n{}'.format(cm_test))
print('Accuracy(test) : %.5f' % acc_test)

# Precision, recall and f-measure derived from the matrix cells.
tn, fp, fn, tp = cm_test.ravel()
precision_test = tp / (tp + fp)
recall_test = tp / (tp + fn)
print('precision : %.4f' % precision_test)
print('recall    : %.4f' % recall_test)
print("f-measure : %.4f" % (2 * precision_test * recall_test / (precision_test + recall_test)))

In [None]:
# Class probabilities from the SMOTEENN-trained model on the test features.
prob = lr2.predict_proba(X)
# Precision / recall over the thresholds, scored on the second probability column
# with label 1 treated as the positive class.
precision, recall, thresholds = precision_recall_curve(y, prob[:,1], pos_label=1)

In [None]:
# Tabulate precision and recall against their threshold for the test data.
# The last element of precision and recall is dropped so that all three
# columns have equal length.
precision_recall_report = pd.DataFrame({
    'precision':precision[:-1],
    'recall':recall[:-1],
    'threshold':thresholds},)
precision_recall_report

In [None]:
# Move the decision threshold to 0.1: a row is predicted positive when its
# second-column probability is at least 0.1.
y_pred_lr_test_new_threshold = prob[:, 1] >= 0.1

# Accuracy and confusion matrix at the new threshold.
cm_new_threshold = confusion_matrix(y, y_pred_lr_test_new_threshold)
acc_new_threshold = accuracy_score(y, y_pred_lr_test_new_threshold)
print('Confusion matrix(test):\n{}'.format(cm_new_threshold))
print('Accuracy(test) : %.5f' % acc_new_threshold)

# Precision, recall and f-measure at the new threshold.
tn, fp, fn, tp = cm_new_threshold.ravel()
precision_new_threshold = tp / (tp + fp)
recall_new_threshold = tp / (tp + fn)
print('precision : %.4f' % precision_new_threshold)
print('recall    : %.4f' % recall_new_threshold)
print("f-measure : %.4f" % (
    2 * precision_new_threshold * recall_new_threshold
    / (precision_new_threshold + recall_new_threshold)
))

In [None]:
# NOTE(review): 5000 looks like the gain per true positive and 500 the cost per
# predicted positive (tp + fp) - confirm these business figures.
# tp and fp are whatever the most recently executed confusion-matrix cell left behind.
gain_from_hits = 5000 * tp
cost_of_contacts = 500 * (tp + fp)
ROI = gain_from_hits - cost_of_contacts
ROI

In [None]:
# ROC curve for the SMOTEENN-trained model on the test data.
# The import stays in this cell so that `auc` is the sklearn function even if
# another cell has rebound that name to a number.
from sklearn.metrics import roc_curve, auc

# Rank the test rows by predicted probability of the second class column
# (the positive class for a 0/1 target). Passing hard 0/1 predictions instead
# would give roc_curve a single operating point, so the "curve" would be two
# straight segments and the area would not reflect the model's ranking quality.
# The scores are recomputed here so the cell does not depend on a `prob`
# variable left behind by another cell.
y_score_test = lr2.predict_proba(X)[:, 1]

# False positive rate and true positive rate at every threshold.
fpr, tpr, thresholds = roc_curve(y, y_score_test, drop_intermediate=False)

# Area under the curve; stored under its own name so the `auc` function is not shadowed.
roc_auc_test = auc(fpr, tpr)

# Plot the curve together with the chance diagonal.
plt.plot(fpr, tpr, color="purple", label="ROC curve (area = %.3f)" % roc_auc_test)
plt.plot([0,1],[0,1], color="black", linestyle="--")

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="best")