In [2]:
import numpy as np
import pandas as pd
# Load the raw churn dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='customer_churn.csv')
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Drop the columns that are not used as features: only the ten listed below are kept
# (RowNumber, CustomerId, Surname and the Exited target are left out).
FEATURE_COLUMNS = [
    'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure',
    'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
]
# Take an explicit copy: X is modified column-by-column later on, and an independent
# copy guarantees those assignments neither touch df nor raise SettingWithCopyWarning.
X = df.loc[:, FEATURE_COLUMNS].copy()
X.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.8,3,1,0,113931.57
3,699,France,Female,39,1,0.0,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


In [4]:
# Target vector: the binary `Exited` column of the raw frame.
y = df['Exited']
y.head()

0    1
1    0
2    1
3    0
4    0
Name: Exited, dtype: int64

In [5]:
# Convert the categorical (text) columns into integer codes.
# Geography and Gender are each encoded with their own LabelEncoder via fit_transform.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

labelencoder1 = LabelEncoder()
X['Geography'] = labelencoder1.fit_transform(X['Geography'])

labelencoder2 = LabelEncoder()
X['Gender'] = labelencoder2.fit_transform(X['Gender'])

X.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,0,0,42,2,0.0,1,1,1,101348.88
1,608,2,0,41,1,83807.86,1,0,1,112542.58
2,502,0,0,42,8,159660.8,3,1,0,113931.57
3,699,0,0,39,1,0.0,2,0,0,93826.63
4,850,2,0,43,2,125510.82,1,1,1,79084.1


In [6]:
# Remove the artificial ordering implied by the integer Geography codes by one-hot
# encoding that column (positional index 1 of X).
# `OneHotEncoder(categorical_features=...)` was deprecated in scikit-learn 0.20 and
# removed in 0.22; ColumnTransformer is the supported way to encode a single column.
from sklearn.compose import ColumnTransformer

onehotencoder = ColumnTransformer(
    transformers=[('geography', OneHotEncoder(), [1])],
    remainder='passthrough',  # keep all other columns, in their original order, after the dummies
    sparse_threshold=0,       # always return a dense ndarray (replaces the former .toarray() call)
)
X = onehotencoder.fit_transform(X)
X[0]

array([1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 6.1900000e+02,
       0.0000000e+00, 4.2000000e+01, 2.0000000e+00, 0.0000000e+00,
       1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0134888e+05])

In [7]:
# Avoid the "dummy variable trap": drop the first column (index 0) of the
# one-hot encoded feature matrix.
X = np.delete(X, 0, axis=1)
X[0]

array([0.0000000e+00, 0.0000000e+00, 6.1900000e+02, 0.0000000e+00,
       4.2000000e+01, 2.0000000e+00, 0.0000000e+00, 1.0000000e+00,
       1.0000000e+00, 1.0000000e+00, 1.0134888e+05])

In [8]:
# Turn y into a column vector of shape (n_samples, 1).
# Multi-dimensional indexing directly on a pandas Series (y[:, np.newaxis]) is
# deprecated since pandas 1.0 and raises in pandas 2.x, so go through NumPy instead.
y = np.asarray(y).reshape(-1, 1)
y

array([[1],
       [0],
       [1],
       ...,
       [1],
       [1],
       [0]], dtype=int64)

In [9]:
# One-hot encode the target: the result has two columns,
# one for customers who stayed and one for customers who churned.
onehotencoder = OneHotEncoder()
y = onehotencoder.fit(y).transform(y).toarray()
y

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [1., 0.]])

In [10]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
len(X_train)

8000

In [11]:
# Number of rows held out for testing.
X_test.shape[0]

2000

In [12]:
# Feature standardization
from sklearn.preprocessing import StandardScaler

# Only the feature matrix is standardized; the labels must be left untouched.
# The training and test sets have to go through the same transformation, so the
# scaler is fitted on the training set only and then reused, unchanged, to
# transform the test set.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
X_train

array([[-0.5698444 ,  1.74309049,  0.16958176, ...,  0.64259497,
        -1.03227043,  1.10643166],
       [ 1.75486502, -0.57369368, -2.30455945, ...,  0.64259497,
         0.9687384 , -0.74866447],
       [-0.5698444 , -0.57369368, -1.19119591, ...,  0.64259497,
        -1.03227043,  1.48533467],
       ...,
       [-0.5698444 , -0.57369368,  0.9015152 , ...,  0.64259497,
        -1.03227043,  1.41231994],
       [-0.5698444 ,  1.74309049, -0.62420521, ...,  0.64259497,
         0.9687384 ,  0.84432121],
       [ 1.75486502, -0.57369368, -0.28401079, ...,  0.64259497,
        -1.03227043,  0.32472465]])

In [13]:
# Decision tree baseline
from sklearn import tree

# Fix random_state so the fitted tree and its predictions are reproducible:
# scikit-learn randomly permutes the features at each split, so an unseeded tree
# can differ between runs. 0 matches the seed used for train_test_split.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [0., 1.]])

In [14]:
# Evaluation: precision / recall / F1 of the current predictions against the test labels.
from sklearn.metrics import classification_report

report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

             precision    recall  f1-score   support

          0       0.89      0.86      0.87      1595
          1       0.50      0.57      0.53       405

avg / total       0.81      0.80      0.80      2000



In [15]:
# Deep learning: a small fully connected network built with tflearn.
import tflearn

# Derive the layer widths from the data instead of hard-coding them, so the network
# stays in sync with the preprocessing (on the current data: 11 features, 2 classes).
n_features = X_train.shape[1]
n_classes = y_train.shape[1]
# Engineering rule of thumb: hidden width = (input neurons + output neurons) // 2,
# which gives 6 for the current data.
n_hidden = (n_features + n_classes) // 2

net = tflearn.input_data(shape=[None, n_features])
for layer_idx in range(3):  # three identical ReLU hidden layers
    net = tflearn.fully_connected(net, n_hidden, activation="relu")
net = tflearn.fully_connected(net, n_classes, activation='softmax')
net = tflearn.regression(net)
model = tflearn.DNN(net)
model.fit(X_train, y_train, n_epoch=30, batch_size=32, show_metric=True)

Training Step: 7499  | total loss: [1m[32m0.42247[0m[0m | time: 0.850s
| Adam | epoch: 030 | loss: 0.42247 - acc: 0.8470 -- iter: 7968/8000
Training Step: 7500  | total loss: [1m[32m0.40428[0m[0m | time: 0.854s
| Adam | epoch: 030 | loss: 0.40428 - acc: 0.8561 -- iter: 8000/8000
--


In [16]:
import os

# TF_CPP_MIN_LOG_LEVEL controls TensorFlow's C++ log filtering:
#   '0' - show everything (the default)
#   '1' - hide INFO
#   '2' - hide INFO and WARNING
#   '3' - hide INFO, WARNING and ERROR
# Earlier revisions assigned '1', '2' and '3' in turn; only the last assignment has
# any effect, so the value is set once.
# NOTE(review): the variable is presumably read when TensorFlow is first loaded, so
# this likely has to run before `import tflearn` to take effect - confirm.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'


In [21]:
# First (standardized) test sample.
X_test[0, :]

array([ 1.75486502, -0.57369368, -0.55204276, -1.09168714, -0.36890377,
        1.04473698,  0.8793029 , -0.92159124,  0.64259497,  0.9687384 ,
        1.61085707])

In [24]:
# Network outputs (softmax scores per class) for every test sample.
# Note: this rebinds y_pred, replacing the decision-tree predictions.
y_pred = model.predict(X_test)
y_pred[0, :]

array([0.743274, 0.256726], dtype=float32)

In [25]:
# One-hot label of the first test sample, for comparison with the prediction.
y_test[0, :]

array([1., 0.])

In [27]:
# Evaluate the network on the held-out test set and report the first returned
# score as a percentage.
score = model.evaluate(X_test, y_test)
accuracy_pct = score[0] * 100
print('Test accuarcy: %0.4f%%' % accuracy_pct)

Test accuarcy: 84.2500%


In [None]:
# TODO: TensorBoard cannot be opened.
# The cause is still unknown.
