# 1、项目背景
国内通信市场逐渐成熟，三大运营商营销模式业务趋同，竞争压力比较大，高新增用户已经成为过去时，用户流失率提高已经成为普遍现象。在通信市场上，联通面临着移动和电信的挑战，如何提高用户的满意度，降低流失率成为了主要问题。

![](./运营商数据.png)

# 2、需求拆解
用户流失的原因：有可能是信号不稳定，网速慢，价格不划算
交付：每周提交一次流失概率较高的用户名单（附带置信概率）

# 3、收集数据
价格不划算：套餐价格（对比竞价）
超出套餐部分的通话费用比较贵：对即将超出套餐的提醒，并合理推荐套餐
超出套餐部分的流量费用比较贵：对即将超出套餐的提醒，并合理推荐套餐

In [31]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [10]:
# Load the customer survival spreadsheet into a DataFrame and preview the first rows.
data = pd.read_excel('./联通用户流失数据/CustomerSurvival.xlsx')
data.head()

Unnamed: 0,ID,套餐金额,额外通话时长,额外流量,改变行为,服务合约,关联购买,集团用户,使用月数,流失用户
0,1,1,792.833333,-10.450067,0,0,0,0,25,0
1,2,1,121.666667,-21.141117,0,0,0,0,25,0
2,3,1,-30.0,-25.655273,0,0,0,0,2,1
3,4,1,241.5,-288.341254,0,1,0,1,25,0
4,5,1,1629.666667,-23.655505,0,0,0,1,25,0


# 4、数据清洗、数据预处理(特征工程)

### 缺失值

In [12]:
# Missing values: info() lists the non-null count and dtype of every column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4975 entries, 0 to 4974
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      4975 non-null   int64  
 1   套餐金额    4975 non-null   int64  
 2   额外通话时长  4975 non-null   float64
 3   额外流量    4975 non-null   float64
 4   改变行为    4975 non-null   int64  
 5   服务合约    4975 non-null   int64  
 6   关联购买    4975 non-null   int64  
 7   集团用户    4975 non-null   int64  
 8   使用月数    4975 non-null   int64  
 9   流失用户    4975 non-null   int64  
dtypes: float64(2), int64(8)
memory usage: 388.8 KB


### 异常值

In [15]:
# Outliers: minimum, quartiles and maximum of the extra call time column
data['额外通话时长'].quantile([0, 0.25, 0.5, 0.75, 1])

0.00   -2828.333333
0.25    -126.666667
0.50      13.500000
0.75     338.658333
1.00    4314.000000
Name: 额外通话时长, dtype: float64

In [16]:
(data['额外通话时长'] > 339).sum()

1241

In [17]:
# Business rule (from the original note): an extra call time of 3000 or more
# in either direction -- overage above 3000 or more than 3000 left unused --
# is treated as an outlier and dropped.
# The two bounds are applied as one mask, and the result is copied explicitly
# so that the later in-place edits (del / column assignment) operate on an
# independent frame instead of on a slice of the loaded data.
data = data[(data['额外通话时长'] > -3000) & (data['额外通话时长'] < 3000)].copy()

In [20]:
# Extra data traffic: derive the 1.5 * IQR outlier fences from the quartiles
q1, q3 = data['额外流量'].quantile([0.25, 0.75]).to_numpy()
IQR = q3 - q1
min_val, max_val = q1 - 1.5 * IQR, q3 + 1.5 * IQR
max_val, min_val
# Author's conclusion: these fences are not a reasonable criterion here

(46.94712357559965, -147.03199225116833)

In [22]:
# Actual (max, min) of the extra traffic column
data['额外流量'].max(), data['额外流量'].min()
# Author's conclusion: no outliers are removed from the traffic column

(2568.7042927742, -2189.87598559645)

## 特征工程

In [23]:
data  # preview only; the ID column is still present at this point

Unnamed: 0,ID,套餐金额,额外通话时长,额外流量,改变行为,服务合约,关联购买,集团用户,使用月数,流失用户
0,1,1,792.833333,-10.450067,0,0,0,0,25,0
1,2,1,121.666667,-21.141117,0,0,0,0,25,0
2,3,1,-30.000000,-25.655273,0,0,0,0,2,1
3,4,1,241.500000,-288.341254,0,1,0,1,25,0
4,5,1,1629.666667,-23.655505,0,0,0,1,25,0
...,...,...,...,...,...,...,...,...,...,...
4970,4971,1,1109.333333,49.843215,0,1,0,1,25,0
4971,4972,1,197.833333,-34.987142,0,1,0,0,21,1
4972,4973,1,162.833333,71.369162,0,1,0,0,25,0
4973,4974,1,358.166667,26.315733,0,1,0,0,21,1


In [24]:
del data['ID']  # drop the ID column in place

In [25]:
data

Unnamed: 0,套餐金额,额外通话时长,额外流量,改变行为,服务合约,关联购买,集团用户,使用月数,流失用户
0,1,792.833333,-10.450067,0,0,0,0,25,0
1,1,121.666667,-21.141117,0,0,0,0,25,0
2,1,-30.000000,-25.655273,0,0,0,0,2,1
3,1,241.500000,-288.341254,0,1,0,1,25,0
4,1,1629.666667,-23.655505,0,0,0,1,25,0
...,...,...,...,...,...,...,...,...,...
4970,1,1109.333333,49.843215,0,1,0,1,25,0
4971,1,197.833333,-34.987142,0,1,0,0,21,1
4972,1,162.833333,71.369162,0,1,0,0,25,0
4973,1,358.166667,26.315733,0,1,0,0,21,1


In [27]:
# Snapshot of the cleaned frame before the features are discretised.
# A real copy is required: a plain `data_raw = data` only aliases the frame,
# so the in-place column assignments made to `data` afterwards would
# overwrite the "raw" values as well.
data_raw = data.copy()

In [29]:
data_raw

Unnamed: 0,套餐金额,额外通话时长,额外流量,改变行为,服务合约,关联购买,集团用户,使用月数,流失用户
0,1,792.833333,-10.450067,0,0,0,0,25,0
1,1,121.666667,-21.141117,0,0,0,0,25,0
2,1,-30.000000,-25.655273,0,0,0,0,2,1
3,1,241.500000,-288.341254,0,1,0,1,25,0
4,1,1629.666667,-23.655505,0,0,0,1,25,0
...,...,...,...,...,...,...,...,...,...
4970,1,1109.333333,49.843215,0,1,0,1,25,0
4971,1,197.833333,-34.987142,0,1,0,0,21,1
4972,1,162.833333,71.369162,0,1,0,0,25,0
4973,1,358.166667,26.315733,0,1,0,0,21,1


In [30]:
# Most columns are already discrete, so tree-based classifiers are used and
# the two continuous features are discretised as well.
# extra_time_cut = [bin edges, label for each bin]. pd.cut reads the edges and
# labels from this one variable so the rule applied here is exactly the rule
# that is stored next to the model later on.
extra_time_cut = [[-3000, -1000, 0, 1000, 3000], [2, 4, 3, 1]]
data['额外通话时长'] = pd.cut(data['额外通话时长'],
                              bins=extra_time_cut[0],
                              labels=extra_time_cut[1])
data

Unnamed: 0,套餐金额,额外通话时长,额外流量,改变行为,服务合约,关联购买,集团用户,使用月数,流失用户
0,1,3,-10.450067,0,0,0,0,25,0
1,1,3,-21.141117,0,0,0,0,25,0
2,1,4,-25.655273,0,0,0,0,2,1
3,1,3,-288.341254,0,1,0,1,25,0
4,1,1,-23.655505,0,0,0,1,25,0
...,...,...,...,...,...,...,...,...,...
4970,1,1,49.843215,0,1,0,1,25,0
4971,1,3,-34.987142,0,1,0,0,21,1
4972,1,3,71.369162,0,1,0,0,25,0
4973,1,3,26.315733,0,1,0,0,21,1


In [32]:
# Binarise extra traffic: 2 where the value is above 0, otherwise 1
data['额外流量'] = np.where(data['额外流量'] > 0, 2, 1)
data

Unnamed: 0,套餐金额,额外通话时长,额外流量,改变行为,服务合约,关联购买,集团用户,使用月数,流失用户
0,1,3,1,0,0,0,0,25,0
1,1,3,1,0,0,0,0,25,0
2,1,4,1,0,0,0,0,2,1
3,1,3,1,0,1,0,1,25,0
4,1,1,1,0,0,0,1,25,0
...,...,...,...,...,...,...,...,...,...
4970,1,1,2,0,1,0,1,25,0
4971,1,3,1,0,1,0,0,21,1
4972,1,3,2,0,1,0,0,25,0
4973,1,3,2,0,1,0,0,21,1


# 5、建立模型

In [None]:
# 单棵决策树、AdaBoost、GBDT、随机森林

In [53]:
import sklearn.tree as st
import sklearn.ensemble as se
import sklearn.model_selection as ms
import sklearn.metrics as sm

In [39]:
#tree = st.DecisionTreeClassifier(max_depth=6)
#trees1 = se.AdaBoostClassifier(tree, n_estimators=100)
#trees2 = se.GradientBoostingClassifier(max_depth=6, n_estimators=400, min_samples_split=2)
#trees3 = se.RandomForestClassifier(max_depth=10, n_estimators=1000, min_samples_split=2,
#                                   class_weight='balanced')  #这里是让样本均衡

In [40]:
# Baseline estimators created with default hyper-parameters
tree = st.DecisionTreeClassifier()
trees1 = se.AdaBoostClassifier(tree)
trees2 = se.GradientBoostingClassifier()
trees3 = se.RandomForestClassifier(class_weight='balanced')  # balance the class weights

In [42]:
x = data.iloc[:, :-1]  # features: every column except the last
y = data.iloc[:, -1]  # target: the last column

In [43]:
# Single decision tree: 5-fold grid search over max_depth
tree = st.DecisionTreeClassifier()
params = [{"max_depth": list(range(5, 9))}]
model = ms.GridSearchCV(estimator=tree, param_grid=params, cv=5).fit(x, y)

print("best_score_:", model.best_score_)
print("best_params_:\n", model.best_params_)

best_score_: 0.9863316582914573
best_params_:
 {'max_depth': 5}


In [45]:
# AdaBoost on depth-5 trees: 5-fold grid search over the number of estimators
tree = st.DecisionTreeClassifier(max_depth=5)
trees1 = se.AdaBoostClassifier(tree)
params = [{"n_estimators": list(range(100, 401, 100))}]
model = ms.GridSearchCV(estimator=trees1, param_grid=params, cv=5).fit(x, y)

print("best_score_:", model.best_score_)
print("best_params_:\n", model.best_params_)

best_score_: 0.9851256281407036
best_params_:
 {'n_estimators': 200}


In [47]:
# Gradient boosting: 5-fold grid search over tree depth and number of estimators
trees2 = se.GradientBoostingClassifier()
params = [{
    "max_depth": list(range(5, 9)),
    "n_estimators": list(range(100, 401, 100)),
}]
model = ms.GridSearchCV(estimator=trees2, param_grid=params, cv=5).fit(x, y)

print("best_score_:", model.best_score_)
print("best_params_:\n", model.best_params_)

best_score_: 0.9865326633165828
best_params_:
 {'max_depth': 6, 'n_estimators': 100}


In [48]:
# Random forest (balanced class weights): 5-fold grid search over tree depth
# and number of estimators
trees3 = se.RandomForestClassifier(class_weight='balanced')
params = [{
    "max_depth": list(range(5, 9)),
    "n_estimators": list(range(100, 401, 100)),
}]
model = ms.GridSearchCV(estimator=trees3, param_grid=params, cv=5).fit(x, y)

print("best_score_:", model.best_score_)
print("best_params_:\n", model.best_params_)

best_score_: 0.9867336683417086
best_params_:
 {'max_depth': 8, 'n_estimators': 100}


In [50]:
# Hold-out split with a fixed seed.
# NOTE(review): train_size=0.1 trains on only 10% of the rows and evaluates on
# the remaining 90% -- confirm that test_size=0.1 was not what was intended.
train_x, test_x, train_y, test_y = ms.train_test_split(x, y, train_size=0.1, random_state=7)

In [51]:
# Random forest with fixed max_depth=8 / n_estimators=100 and balanced class
# weights, fitted on the training split only
fm = se.RandomForestClassifier(max_depth=8, n_estimators=100, class_weight='balanced')
fm.fit(train_x, train_y)

In [52]:
pred_y = fm.predict(test_x)

In [54]:
print(sm.classification_report(test_y, pred_y))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       993
           1       0.99      0.99      0.99      3485

    accuracy                           0.99      4478
   macro avg       0.98      0.98      0.98      4478
weighted avg       0.99      0.99      0.99      4478



In [58]:
####老师代码
def f(name, model):
    """Fit ``model`` on the training split and print its test-split report.

    name: label printed before the report.
    model: estimator exposing ``fit`` and ``predict``.

    Reads the module-level ``train_x``, ``train_y``, ``test_x`` and ``test_y``.
    """
    print(name, "wording")
    model.fit(train_x, train_y)
    report = sm.classification_report(test_y, model.predict(test_x))
    print(report)

In [62]:
# Candidate models keyed by the name printed in front of each report.
# Only n_estimators is set explicitly; every other hyper-parameter is the default.
model_dic = {'单颗树': st.DecisionTreeClassifier(),
             'Adaboost': se.AdaBoostClassifier(st.DecisionTreeClassifier(), n_estimators=100),
             'GBDT': se.GradientBoostingClassifier(n_estimators=100),
             '随机森林': se.RandomForestClassifier(n_estimators=100)}

In [64]:
# Fit every candidate on the same training split and print its report
for name, obj in model_dic.items():
    f(name, obj)

单颗树 wording
              precision    recall  f1-score   support

           0       0.95      0.96      0.95       993
           1       0.99      0.99      0.99      3485

    accuracy                           0.98      4478
   macro avg       0.97      0.97      0.97      4478
weighted avg       0.98      0.98      0.98      4478

Adaboost wording
              precision    recall  f1-score   support

           0       0.95      0.96      0.95       993
           1       0.99      0.99      0.99      3485

    accuracy                           0.98      4478
   macro avg       0.97      0.97      0.97      4478
weighted avg       0.98      0.98      0.98      4478

GBDT wording
              precision    recall  f1-score   support

           0       0.98      0.95      0.96       993
           1       0.99      0.99      0.99      3485

    accuracy                           0.98      4478
   macro avg       0.98      0.97      0.98      4478
weighted avg       0.98      0.9

# 6、优化模型

In [67]:
# Tune the base decision tree with an exhaustive 3-fold grid search over the
# split criterion, depth and the two minimum-sample limits; fitted on all of x / y
sub_model = st.DecisionTreeClassifier()
params = {
    "criterion": ['gini', 'entropy'],
    "max_depth": np.arange(2, 9),
    "min_samples_split": np.arange(2, 21),
    "min_samples_leaf": np.arange(1, 11)
}
sub_GS = ms.GridSearchCV(sub_model, param_grid=params, cv=3)
sub_GS.fit(x, y)

In [68]:
print("best_score_:", sub_GS.best_score_)
print("best_params_:\n", sub_GS.best_params_)

best_score_: 0.9877387732665556
best_params_:
 {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 3, 'min_samples_split': 10}


In [70]:
# Boost the tuned tree with AdaBoost; candidate n_estimators are 20, 30, ..., 200
main_model = se.AdaBoostClassifier(sub_GS.best_estimator_)
params = {'n_estimators': np.arange(20, 201, 10)}

In [71]:
# 3-fold grid search for the AdaBoost ensemble, fitted on all of x / y
main_GS = ms.GridSearchCV(main_model, param_grid=params, cv=3)
main_GS.fit(x, y)

In [72]:
print("best_score_:", main_GS.best_score_)
print("best_params_:\n", main_GS.best_params_)

best_score_: 0.9849248642670639
best_params_:
 {'n_estimators': 40}


In [74]:
best_model = main_GS.best_estimator_

In [76]:
# NOTE: main_GS was fitted on the full x / y and test_x is a subset of x, so
# best_model has already seen these rows during training; a report computed
# from pred_best is therefore not a held-out estimate.
pred_best = best_model.predict(test_x)

In [77]:
print(sm.classification_report(test_y, pred_best))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       993
           1       1.00      1.00      1.00      3485

    accuracy                           0.99      4478
   macro avg       0.99      0.99      0.99      4478
weighted avg       0.99      0.99      0.99      4478



# 7、保存部署

In [78]:
import pickle

In [80]:
# Bundle what is needed at prediction time: the feature column order, the two
# discretisation rules and the tuned model.
# NOTE(review): the key '额外通话时常' differs from the column name '额外通话时长'
# (常 vs 长) -- looks like a typo; renaming it also means updating every reader
# of the pickled dict.
dict_info = {"数据结构": data.columns[:-1],
             "数据转换": {'额外通话时常': extra_time_cut,
                          '额外流量': {'条件>': 0, 'True': 2, 'False': 1}
                          },
             "模型": best_model
             }

In [81]:
# Persist the bundle to disk. The handle is named model_file rather than f so
# that the report helper f(name, model) defined earlier is not overwritten by
# a file object.
with open('联通用户流失预测模型.pickle', 'wb') as model_file:
    pickle.dump(dict_info, model_file)
print("保存成功")

保存成功


# 加载模型

In [83]:
# Load the saved bundle back into obj. The handle is named model_file rather
# than f so that the report helper f(name, model) is not overwritten.
# NOTE: pickle.load executes code embedded in the file -- only load pickles
# that were produced by a trusted source.
with open('联通用户流失预测模型.pickle', 'rb') as model_file:
    obj = pickle.load(model_file)

In [84]:
print(obj)

{'数据结构': Index(['套餐金额', '额外通话时长', '额外流量', '改变行为', '服务合约', '关联购买', '集团用户', '使用月数'], dtype='object'), '数据转换': {'额外通话时常': [[-3000, -1000, 0, 1000, 3000], [2, 4, 3, 1]], '额外流量': {'条件>': 0, 'True': 2, 'False': 1}}, '模型': AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=7,
                                                         min_samples_leaf=3,
                                                         min_samples_split=10),
                   n_estimators=40)}


In [85]:
obj['数据结构']

Index(['套餐金额', '额外通话时长', '额外流量', '改变行为', '服务合约', '关联购买', '集团用户', '使用月数'], dtype='object')

In [88]:
# Three sample rows to score; the two usage columns (2nd and 3rd values) are
# still raw numbers and get discretised before prediction
need_data = [
    [1, 1000, 500, 0, 0, 1, 0, 25],
    [2, 0, 0, 1, 1, 2, 1, 25],
    [1, -500, -500, 0, 1, 0, 1, 13]
]

In [86]:
obj['数据转换']

{'额外通话时常': [[-3000, -1000, 0, 1000, 3000], [2, 4, 3, 1]],
 '额外流量': {'条件>': 0, 'True': 2, 'False': 1}}

In [89]:
data2 = pd.DataFrame(need_data, columns=obj['数据结构'])

In [90]:
data2

Unnamed: 0,套餐金额,额外通话时长,额外流量,改变行为,服务合约,关联购买,集团用户,使用月数
0,1,1000,500,0,0,1,0,25
1,2,0,0,1,1,2,1,25
2,1,-500,-500,0,1,0,1,13


In [91]:
# Re-apply the training-time binning using the bin edges and labels stored in
# the loaded bundle, instead of repeating the literals, so scoring cannot
# drift from training.
# NOTE: the bundle stores this rule under the key '额外通话时常', which is
# spelled differently from the column name '额外通话时长'.
call_bins, call_labels = obj['数据转换']['额外通话时常']
data2['额外通话时长'] = pd.cut(data2['额外通话时长'], bins=call_bins, labels=call_labels)

In [92]:
data2

Unnamed: 0,套餐金额,额外通话时长,额外流量,改变行为,服务合约,关联购买,集团用户,使用月数
0,1,3,500,0,0,1,0,25
1,2,4,0,1,1,2,1,25
2,1,4,-500,0,1,0,1,13


In [96]:
# Binarise extra traffic with the threshold and the two output codes stored in
# the loaded bundle, instead of repeating the literals used at training time.
traffic_rule = obj['数据转换']['额外流量']
data2['额外流量'] = np.where(data2['额外流量'] > traffic_rule['条件>'],
                         traffic_rule['True'],
                         traffic_rule['False'])

In [97]:
data2

Unnamed: 0,套餐金额,额外通话时长,额外流量,改变行为,服务合约,关联购买,集团用户,使用月数
0,1,3,2,0,0,1,0,25
1,2,4,1,1,1,2,1,25
2,1,4,1,0,1,0,1,13


In [98]:
# NOTE(review): the column is named 流失概率 (churn probability) but predict()
# returns class labels. If the deliverable needs a probability,
# obj['模型'].predict_proba(data2)[:, 1] gives the probability of class 1.
data2['流失概率']=obj['模型'].predict(data2)

Unnamed: 0,套餐金额,额外通话时长,额外流量,改变行为,服务合约,关联购买,集团用户,使用月数,流失概率
0,1,3,2,0,0,1,0,25,0
1,2,4,1,1,1,2,1,25,0
2,1,4,1,0,1,0,1,13,1


In [None]:
#8、交付结果