In [116]:
from __future__ import print_function
import numpy as np
import pandas as pd
np.random.seed(1)
import sys
import sklearn
import sklearn.ensemble
%load_ext autoreload
%autoreload 2
from anchor import anchor_tabular
from sklearn.model_selection import train_test_split
import re
import copy
import json

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [117]:
import random
import pandas as pd
import matplotlib.pyplot as plt 

In [118]:
## 定義一個 class
class Bunch(object):
    """bla"""
    def __init__(self, adict):
        self.__dict__.update(adict)
dataset = Bunch({})

In [119]:
# 原本的 code
seed = 42
random.seed(seed)

df = pd.read_csv('BankChurners.csv')
df = df[df.columns[:-2]]
df = df.fillna(0)
df['Attrition_Flag'] = df['Attrition_Flag'].replace({'Attrited Customer':1, 'Existing Customer':0})
df['Gender'] = df['Gender'].replace({'M':1, 'F':0})
df = df.drop('CLIENTNUM',axis=1)
df = pd.get_dummies(data=df, columns=['Education_Level', 'Marital_Status', 
                                      'Income_Category', 'Card_Category']) # one-hot encoding


In [120]:
churn_index = df.index[df.Attrition_Flag == 1].tolist()
non_churn_index = df.index[df.Attrition_Flag == 0].tolist()
non_churn_index = random.sample(non_churn_index, len(churn_index)) # 讓 churn : non_churn 資料個數為 1 : 1
new_idx = churn_index + non_churn_index
df = df.loc[new_idx]

In [121]:
# 把原本的 pandas.DataFrame 轉成 np.array 
y = df['Attrition_Flag'].to_numpy()
df_no_y = df.drop('Attrition_Flag', axis=1)
x = df_no_y.to_numpy()
print(y.shape)
print(x.shape)

(3254,)
(3254, 36)


In [122]:
# 模型預測
c = sklearn.ensemble.RandomForestClassifier(n_estimators=50, n_jobs=5)
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=100)
c.fit(train_x, train_y)
print('Train', sklearn.metrics.accuracy_score(train_y, c.predict(train_x)))
print('Test', sklearn.metrics.accuracy_score(test_y, c.predict(test_x)))

Train 1.0
Test 0.9416282642089093


In [123]:
# 定義 anchor 中的參數
dataset.train = train_x # 訓練資料 (numpy)
dataset.test = test_x  # 測試資料 (numpy)
dataset.categorical_names = {} # label encoding前的內容 (dict) 
dataset.class_names = ['Existing Customer','Attrited Customer'] # label的名稱 (list) 
dataset.feature_names = np.array(df_no_y.columns.to_list()) # feature的名稱 (numpy)

In [124]:
# anchor explainer 
explainer = anchor_tabular.AnchorTabularExplainer(
    dataset.class_names,
    dataset.feature_names,
    dataset.train,
    dataset.categorical_names)

In [125]:
dataset.train.shape

(2603, 36)

In [128]:
'''
input 內容:
    1. start -> 從哪一個客戶開始找 anchor
    2. number -> 要從幾個客戶身上找 anchor
    3. dataset -> 訓練模型時使用的 dataset
    4. df -> 原始表格
    5. model -> 要解釋的模型
    6. seed -> 亂數種子(預設為1) 


重要變數:
    1. anchors_list : 所有 anchor 集合
    2. anchors : 做過 regular expression 的所有 anchor 集合
    3. anchors_info : anchor 詳細資料  
        (1) anchors 規則
        (2) anchors 所包含的客戶數量(原本兆豐所有客戶中所包含的客戶數量)
        (3) anchors 信心水準 
    4. potential_customer : 潛在客戶名單
    
'''
def customer_list(dataset, df, model,number=float('inf'), seed = 1):
    
    # 設定種子
    np.random.seed(seed)

    # 製作 Anchors 解釋器
    explainer = anchor_tabular.AnchorTabularExplainer(
            dataset.class_names,
            dataset.feature_names,
            dataset.train,
            dataset.categorical_names)

    # 變數定義
    potential_customer = pd.DataFrame() # 潛在客戶名單
    anchors_list = [] # anchors 規則清單
    anchors_list_confidence = [] # anchors 規則信心水準清單
    anchors = [] # 經過 regular expression 的 anchors 規則
    anchors_info = [] # anchors 詳細清單
    
    # dicts
    dicts = {}
    for i in range(len(dataset.feature_names)):
        dicts[dataset.feature_names[i]] = i

    # 從 anchors 抓取資訊
    current_number = 0
    data = np.vstack([dataset.test,dataset.train])

    while(current_number < number):
        if (data.shape[0] == 0):
            break
        i = data[0]
        cond = [True]*data.shape[0]
        print(current_number," : ", data.shape[0])
        if(model.predict(i.reshape(1, -1))[0]):
            exp = explainer.explain_instance(i, model.predict, threshold=0.90)
            str_split = [re.split('( <= | >= | < | > )',j) for j in exp.names()]
            
            for idx,vle in enumerate(str_split):
                if len(vle) == 5:
                    vle[1] = vle[1].replace('>','<').replace('<','>')   
                    str_split.append(vle[2:])
                    str_split.append(vle[:3][::-1])
                    del str_split[idx]
            
            for ii in str_split:
                if (ii[1] == ' >= '):
                    idx = data[:,dicts[ii[0]]] >= float(ii[2])
                elif (ii[1] == ' <= '):
                    idx = data[:,dicts[ii[0]]] <= float(ii[2])
                elif (ii[1] == ' > '):
                    idx = data[:,dicts[ii[0]]] > float(ii[2])
                elif (ii[1] == ' < '):
                    idx = data[:,dicts[ii[0]]] < float(ii[2])
                cond = cond & idx  
            cond = np.logical_not(cond)  
            data = data[cond]  
            
            anchors.append(str_split)
            anchors_list.append(exp.names()) # anchors 結果
            anchors_list_confidence.append(exp.precision()) # anchors 精確度
            current_number += 1

        else: 
            data = np.delete(data ,0 ,0)

    for i,j in enumerate(anchors_list):
            anchors_info.append(
                {
                    "condition" : j,
                    "Quantity covered" : 0,
                    "confidence interval" : anchors_list_confidence[i]
                }
            ) 
   
  
    ## 從找到的規則中尋找潛在客戶  
    '''
    演算法 :
    
    1. 從全部的 anchors 中，取出一個 anchor，後續步驟以下分述之:

        (1) 根據該 anchor 中的條件取出一個條件進行篩選，並將篩選出是否符合客戶資料之結果，放入 idx 變數中。
        (2) 使用 idx 變數篩選符合該條件的客戶，存到 temp_df變數中。若有其他條件需要篩選 -> 回到(1)；否則則進到(3)。
        (3) 將 temp_df 中 label 為 0 (非目前客戶)的客戶，放入潛在客戶名單。若有其他 anchor 未計算 -> 回到 1.；否則則進到 2.。 
    
    2. 將資料輸出，輸出內容以下分述之:
        
        (1) anchor 名單
        (2) 潛在客戶名單
    '''

    for n,anchor in enumerate(anchors):
        
        temp_df = df 
        for i in anchor:
            if (i[1] == ' >= '):
                idx = temp_df.loc[:,i[0]] >= float(i[2])
            elif (i[1] == ' <= '):
                idx = temp_df.loc[:,i[0]] <= float(i[2])
            elif (i[1] == ' > '):
                idx = temp_df.loc[:,i[0]] > float(i[2])
            elif (i[1] == ' < '):
                idx = temp_df.loc[:,i[0]] < float(i[2])
            else:
                continue
             
            temp_df = temp_df[idx]
            anchors_info[n]['Quantity covered'] = temp_df.shape[0]
            
        else: 
            idx = temp_df.loc[:,'Attrition_Flag'] == 0
            temp_df = temp_df[idx] 
            potential_customer = pd.concat([potential_customer,temp_df], join='outer')
            print(potential_customer.shape[0])

    potential_customer.drop_duplicates()
    
    with open("anchors.json", "w", encoding='utf-8') as f:
        json.dump(anchors_list, f, indent = 4)

    with open("anchors_info.json", "w", encoding='utf-8') as f:
        json.dump(anchors_info, f, indent = 4)

    potential_customer.to_excel("potential_customer.xlsx")
    
    return(potential_customer)
    

In [129]:
# anchor 
number = 10
print(customer_list(dataset, df, c))

0  :  3260
1  :  3179
1  :  3178
1  :  3177
1  :  3176
1  :  3175
2  :  2807
3  :  2490
3  :  2489
3  :  2488
4  :  2134
5  :  2034
5  :  2033
5  :  2032
5  :  2031
5  :  2030
6  :  1997
7  :  1960
7  :  1959
8  :  1953
8  :  1952
8  :  1951
8  :  1950
8  :  1949
8  :  1948
8  :  1947
8  :  1946
8  :  1945
8  :  1944
8  :  1943
8  :  1942
9  :  1893
10  :  1848
10  :  1847
10  :  1846
10  :  1845
11  :  1822
11  :  1821
11  :  1820
12  :  1812
12  :  1811
13  :  1789
13  :  1788
13  :  1787
14  :  1748
14  :  1747
14  :  1746
14  :  1745
14  :  1744
14  :  1743
14  :  1742
14  :  1741
14  :  1740
14  :  1739
14  :  1738
14  :  1737
14  :  1736
14  :  1735
15  :  1728
15  :  1727
15  :  1726
15  :  1725
15  :  1724
15  :  1723
15  :  1722
15  :  1721
15  :  1720
15  :  1719
15  :  1718
15  :  1717
15  :  1716
16  :  1700
16  :  1699
16  :  1698
17  :  1679
17  :  1678
17  :  1677
17  :  1676
17  :  1675
17  :  1674
17  :  1673
17  :  1672
17  :  1671
17  :  1670
17  :  1669
18  :  1639


In [None]:
myArray = np.array([[1, 2, 3, 4, 5], [11, 12, 13, 14, 15], [21, 22, 23, 24, 25]])


modifiedArray = np.delete(myArray, [0,1,2], 0)
#print(modifiedArray)
modifiedArray.shape

(0, 5)

In [None]:
int(10) < float('inf')

True