# 移动5G套餐潜客识别 

In [1]:
# 查看当前挂载的数据集目录
!ls /home/kesci/input/5G5G3453

result_predict_A.csv  result_predict_B.csv  train_label.csv  train_set.csv


# 第一步 定义问题，明确目标
对于这个项目，我们的问题或者目标就是**预测4G用户是否更换5G套餐为任务，基于数据集构建移动5G套餐潜客识别模型**，直白的来说就是构建一个5g潜在用户识别的二分类模型，给定一个移动用户数据，判断是不是更换5G套餐。更多关于需要做什么，可以自己阅读如下的**背景描述**以及**数据说明**等

## 背景描述
近年来，社会转型加速，国家正在加强培育数据要素市场、推进治理体系现代化、推进新型基础设施建设，致力打造全新智慧城市。而5G网络的大规模连接能力、高速率传输能力正是智慧城市建设的有力支撑

5G高可靠、低时延、大带宽等特性，可高效将城市系统和服务打通、集成，提升资源运用效率，优化城市管理和服务，改善市民生活质量。加快5G用户增长与城市发展深度融合，通过信息化手段解决城镇化过程中带来的问题，既是城市可持续发展所需，也是产业新动能所在。而如何通过模型精准识别5G需求潜在用户，促进4G时代向5G时代转变，以实现基于5G深度应用的智慧城市建设至关重要

基于每月用户更换5G套餐数据，分析4G用户更换5G套餐的行为特征，从更换5G套餐的4G用户的基础信息、消费行为、超套信息、宽带信息、其他信息等维度，构建5G套餐潜客识别模型，识别出目前4G用户具有更换5G套餐的需求群体，进行5G潜客营销，作为5G智慧城市打造的先头军。

# 第二步 数据搜集
本次任务已经为我们提供了黄金标准的训练集（train set）和测试集（test set）:[移动5G套餐潜客识别 ](https://www.kesci.com/mw/dataset/60505deb5316950016ec6d1b),这里省去了我们准备数据集的烦恼，可以有更多时间去做数据挖掘和模型验证。

## 数据说明
![](https://cdn.kesci.com/upload/image/qq265pwgnx.png)
![](https://cdn.kesci.com/upload/image/qq2668rg1c.png)
![](https://cdn.kesci.com/upload/image/qq26afq1z9.png)
![](https://cdn.kesci.com/upload/image/qq26ck69sr.png)
![](https://cdn.kesci.com/upload/image/qq26ddszan.png?imageView2/0/w/960/h/960)
![](https://cdn.kesci.com/upload/image/qq26ebh71o.png?imageView2/0/w/960/h/960)
![](https://cdn.kesci.com/upload/image/qq26g7ao3p.png?imageView2/0/w/960/h/960)
![](https://cdn.kesci.com/upload/image/qq26gfk02q.png?imageView2/0/w/960/h/960)
![](https://cdn.kesci.com/upload/image/qq26hf3v1a.png)

## 数据备注
预测4G用户是否更换5G套餐为任务，该数据来自重庆移动大数据平台，总数据量超过20W，包含44列变量信息，可自行对样本数据集中样本进行抽样，构建分类模型，同时会对用户号码、用户user_id进行脱敏，训练集14W，测试集6W，其中A榜单1W条，B榜单5W条；

训练集：
train_set.csv 样本数据特征集
train_lable.csv 样本数据标签分类

5G用户标签表：
![](https://cdn.kesci.com/upload/image/qq26k0a3m3.png)

数据来源
该数据来自重庆移动大数据平台

## 问题描述
移动5G套餐潜客识别模型

# 第三步 数据预处理
由于第二步我们的数据集已经是比较标准了，所以我们这一步主要做的工作是数据加载和数据预处理

## 3.1 导入库

In [2]:
import sys 
import os
print("Python version: {}". format(sys.version))
import pandas as pd # 加载csv等表格数据
print("pandas version: {}". format(pd.__version__))

import matplotlib # 画图
print("matplotlib version: {}". format(matplotlib.__version__))

import numpy as np #数据运算
print("NumPy version: {}". format(np.__version__))

import scipy as sp #高级数学运算
print("SciPy version: {}". format(sp.__version__)) 

import IPython
from IPython import display #美化DataFrame的输出


import sklearn #机器学习算法
print("scikit-learn version: {}". format(sklearn.__version__))

#基础库
import random
import time


#忽略警告
import warnings
warnings.filterwarnings('ignore')
print('-'*25)

from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

Python version: 3.8.5 | packaged by conda-forge | (default, Aug 21 2020, 18:21:27) 
[GCC 7.5.0]
pandas version: 1.1.1
matplotlib version: 3.2.2
NumPy version: 1.18.5
SciPy version: 1.4.1
scikit-learn version: 0.23.2
-------------------------
5G5G3453



## 3.2 导入模型库

In [3]:
!pip install xgboost -i https://pypi.tuna.tsinghua.edu.cn/simple some-package

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [4]:
# 常见机器学习算法
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

# 常见函数
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#可视化
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix


#配置可视化
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8


In [5]:
import matplotlib.font_manager as font_manager
import matplotlib as mpl
font_dirs = ['/home/kesci/work/fonts/', ]
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)
font_list = font_manager.createFontList(font_files)
font_manager.fontManager.ttflist.extend(font_list)

mpl.rcParams['font.family'] = 'SimHei'
font_list

[<Font 'SimHei' (SimHei.ttf) normal normal 400 normal>]

In [6]:
# import numpy as np 
# import pandas as pd
# import matplotlib as mpl 
# import matplotlib.pyplot as plt
# import seaborn as sns
# light_palette = [
#     (0, 122, 255), # Blue
#     (255, 149, 0), # Orange
#     (52, 199, 89), # Green
#     (255, 59, 48), # Red
#     (175, 82, 222),# Purple
#     (255, 45, 85), # Pink
#     (88, 86, 214), # Indigo
#     (90, 200, 250),# Teal
#     (255, 204, 0)  # Yellow
# ]

# dark_palette = [
#     (10, 132, 255), # Blue
#     (255, 159, 10), # Orange
#     (48, 209, 88),  # Green
#     (255, 69, 58),  # Red
#     (191, 90, 242), # Purple
#     (94, 92, 230),  # Indigo
#     (255, 55, 95),  # Pink
#     (100, 210, 255),# Teal
#     (255, 214, 10)  # Yellow
# ]

# gray_light_palette = [
#     (142, 142, 147),# Gray
#     (174, 174, 178),# Gray (2)
#     (199, 199, 204),# Gray (3)
#     (209, 209, 214),# Gray (4)
#     (229, 229, 234),# Gray (5)
#     (242, 242, 247),# Gray (6)
# ]

# gray_dark_palette = [
#     (142, 142, 147),# Gray
#     (99, 99, 102),  # Gray (2)
#     (72, 72, 74),   # Gray (3)
#     (58, 58, 60),   # Gray (4)
#     (44, 44, 46),   # Gray (5)
#     (28, 28, 39),   # Gray (6)
# ]


# light_palette = np.array(light_palette)/255
# dark_palette = np.array(dark_palette)/255
# gray_light_palette = np.array(gray_light_palette)/255
# gray_dark_palette = np.array(gray_dark_palette)/255

# sns.palplot(light_palette)
# sns.palplot(gray_light_palette)

# sns.palplot(dark_palette)
# sns.palplot(gray_dark_palette)


In [7]:
# # 导入所需要的包
# import os
# import pandas as pd

# from cycler import cycler

# mpl.rcParams['axes.prop_cycle'] = cycler('color',light_palette)
# mpl.rcParams['figure.facecolor']  = gray_light_palette[-2]
# mpl.rcParams['figure.edgecolor']  = gray_light_palette[-2]
# mpl.rcParams['axes.facecolor'] =  gray_light_palette[-2]

# white_color = gray_light_palette[-2]
# mpl.rcParams['text.color'] = white_color
# mpl.rcParams['axes.labelcolor'] = white_color
# mpl.rcParams['axes.edgecolor'] = white_color
# mpl.rcParams['xtick.color'] = white_color
# mpl.rcParams['ytick.color'] = white_color

# mpl.rcParams['figure.dpi'] = 150

# mpl.rcParams['axes.spines.top'] = False
# mpl.rcParams['axes.spines.right'] = False

## 3.3 加载和查看数据

本步骤是加载和查看数据，通过表格的列名来快速了解数据，并了解一些有关它的信息。 它是什么样的（数据类型和值），是什么x（自变量/特征变量），什么是y（因变量/目标变量），相当于在输入到模型之前，我们需要搞清楚所用的数据到底是什么？

要进行逐个步骤，我们首先需要导入数据。 接下来，我们使用`info()`和`sample()`函数快速地进行数据的定性与定量。

In [8]:
pd.set_option('display.max_columns', 50) # 设置显示的最大列数

In [9]:
data_dir='/home/kesci/input/5G5G3453/' 
train_x=pd.read_csv(os.path.join(data_dir,'train_set.csv')) # 
train_y=pd.read_csv(os.path.join(data_dir,'train_label.csv'))
train_x.shape,train_y.shape

((140000, 45), (140000, 2))

In [10]:
train=pd.merge(train_x,train_y,how='inner',on='user_id')
train.head(3)

Unnamed: 0,user_id,product_no,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23,X24,X25,X26,X27,X28,X29,X30,X31,X32,X33,X34,X35,X36,X37,X38,X39,X40,X41,X42,X43,label
0,2689434779712,26231702691,女士,46.0,3.0,5,大众用户,69.3,69.0,69.0,5650.91,4233.87,3579.13,281.0,297.0,576.0,69.1,4487.97,384.67,0.0,0.0,0.0,0.0,0.0,0.0,1,0,100.0,1.0,1.0,0.0,0.0,1.0,68.0,68.0,0.0,0.0,0.0,1,,1,0,0,0,0,0
1,2697442927197,27358921188,先生,53.0,2.0,1,农村用户,19.2,13.64,12.24,0.02,0.02,0.02,250.0,198.0,370.0,15.03,0.02,272.67,13.2,7.64,6.24,0.0,0.0,0.0,0,0,,,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0,,0,0,0,0,0,0
2,2697596026162,25912868422,女士,30.0,2.0,1,校园用户,33.94,24.0,18.0,2345.84,599.83,1259.24,68.0,113.0,104.0,25.31,1401.64,95.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,,,0.0,0.0,0.0,0.0,18.0,18.0,0.0,0.0,0.0,0,,1,0,0,0,0,0


In [11]:
# 数据类型
train.dtypes

user_id         int64
product_no      int64
X1             object
X2            float64
X3            float64
X4              int64
X5             object
X6            float64
X7            float64
X8            float64
X9            float64
X10           float64
X11           float64
X12           float64
X13           float64
X14           float64
X15           float64
X16           float64
X17           float64
X18           float64
X19           float64
X20           float64
X21           float64
X22           float64
X23           float64
X24             int64
X25             int64
X26           float64
X27           float64
X28           float64
X29           float64
X30           float64
X31           float64
X32           float64
X33           float64
X34           float64
X35           float64
X36           float64
X37             int64
X38           float64
X39             int64
X40             int64
X41             int64
X42             int64
X43             int64
label     

In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140000 entries, 0 to 139999
Data columns (total 46 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   user_id     140000 non-null  int64  
 1   product_no  140000 non-null  int64  
 2   X1          140000 non-null  object 
 3   X2          140000 non-null  float64
 4   X3          133151 non-null  float64
 5   X4          140000 non-null  int64  
 6   X5          139309 non-null  object 
 7   X6          139911 non-null  float64
 8   X7          139911 non-null  float64
 9   X8          139911 non-null  float64
 10  X9          139911 non-null  float64
 11  X10         139911 non-null  float64
 12  X11         139911 non-null  float64
 13  X12         139911 non-null  float64
 14  X13         139911 non-null  float64
 15  X14         139911 non-null  float64
 16  X15         139911 non-null  float64
 17  X16         139911 non-null  float64
 18  X17         139911 non-null  float64
 19  X1

In [13]:
# 数值型特征描述
train.describe()

Unnamed: 0,user_id,product_no,X2,X3,X4,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23,X24,X25,X26,X27,X28,X29,X30,X31,X32,X33,X34,X35,X36,X37,X38,X39,X40,X41,X42,X43,label
count,140000.0,140000.0,140000.0,133151.0,140000.0,139911.0,139911.0,139911.0,139911.0,139911.0,139911.0,139911.0,139911.0,139911.0,139911.0,139911.0,139911.0,139608.0,139608.0,139608.0,139608.0,139608.0,139608.0,140000.0,140000.0,38940.0,38940.0,139226.0,139226.0,139226.0,139226.0,133383.0,133383.0,130785.0,130207.0,130415.0,140000.0,7441.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0
mean,2696241000000.0,27778350000.0,43.295021,2.838627,1.832557,59.017699,58.757815,58.657151,2033.991338,1925.791467,1973.556651,327.723274,324.866915,335.544425,59.655756,2016.571995,331.371909,4.517615,4.474996,4.662641,2.661679,2.648488,2.539888,0.278143,0.042721,102.984694,1.0,0.279567,0.077277,0.038369,0.461961,49.364792,48.288493,5.997795,5.927695,5.88712,0.615657,1146066000.0,0.940557,0.0099,0.0433,0.133536,0.145086,0.2
std,2635831000.0,1848419000.0,10.449163,0.908406,3.03598,53.454124,51.711996,51.824485,2896.411265,2636.664139,2809.655846,374.041076,370.782504,379.987863,48.145713,2307.295639,358.227632,14.647089,14.215736,14.686451,18.796533,18.378959,17.609039,0.448086,0.202229,33.815671,0.0,0.448788,0.267032,0.192087,0.498553,42.61121,42.238787,16.183326,16.000754,15.985361,0.486441,2105618000.0,0.442037,0.099005,0.203532,0.340154,0.352188,0.400001
min,2689412000000.0,25237530000.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2697417000000.0,26118900000.0,35.0,3.0,1.0,19.69,20.0,20.0,0.11,0.2,0.23,93.0,95.0,99.0,22.01,27.125,110.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,1.0,0.0,0.0,0.0,0.0,18.0,18.0,0.0,0.0,0.0,0.0,40422820.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,2697465000000.0,27414120000.0,46.0,3.0,1.0,41.64,42.0,42.0,551.79,580.28,595.75,213.0,212.0,220.0,44.2,1003.25,221.67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,1.0,0.0,0.0,0.0,0.0,28.0,28.0,0.0,0.0,0.0,1.0,350394600.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,2697543000000.0,28214150000.0,52.0,3.0,1.0,87.505,88.0,88.0,3248.34,3150.57,3098.16,430.0,424.0,438.0,88.17,3640.895,430.67,1.8,1.9,2.24,0.0,0.0,0.0,1.0,0.0,100.0,1.0,1.0,0.0,0.0,1.0,78.0,68.0,0.0,0.0,0.0,1.0,1327928000.0,1.0,0.0,0.0,0.0,0.0,0.0
max,2697653000000.0,32182200000.0,59.0,7.0,16.0,1895.24,1000.0,1392.65,23299.69,23697.19,23503.06,7258.0,9230.0,7212.0,1000.0,15777.36,6895.33,786.6,536.58,523.44,618.0,504.38,930.0,1.0,1.0,200.0,1.0,1.0,1.0,1.0,1.0,608.0,598.0,99.99,99.99,99.99,1.0,73069140000.0,2.0,1.0,1.0,1.0,1.0,1.0


In [14]:
test_y_a=pd.read_csv(os.path.join(data_dir,'result_predict_A.csv'))
test_y_b=pd.read_csv(os.path.join(data_dir,'result_predict_B.csv'))
test_y_a.shape,test_y_b.shape

((10000, 45), (50000, 45))

In [15]:
test=pd.concat([test_y_a,test_y_b]).reset_index(drop=True)
test.shape

(60000, 45)

In [16]:
train.columns

Index(['user_id', 'product_no', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8',
       'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18',
       'X19', 'X20', 'X21', 'X22', 'X23', 'X24', 'X25', 'X26', 'X27', 'X28',
       'X29', 'X30', 'X31', 'X32', 'X33', 'X34', 'X35', 'X36', 'X37', 'X38',
       'X39', 'X40', 'X41', 'X42', 'X43', 'label'],
      dtype='object')

In [17]:
# 中英文列名替换
chinese_cols=['用户标识','用户号码','性别','年龄','星级','在网时长','细分市场','当月arpu','上月arpu','上上月arpu',
'当月dou','上月dou','上上月dou','当月mou','上月mou','上上月mou',
'近三月平均arpu','近三月平均dou','近三月平均mou','当月语音超套金额','上月语音超套金额','上上月语音超套金额',
'当月流量超套金额','上月流量超套金额','上上月流量超套金额','是否本网宽带用户','是否异网宽带用户',
'宽带带宽','宽带是否激活','宽带捆绑签约标识','终端捆绑签约标识','话费签约标识','套餐签约标识','用户总套餐价值',
'用户主资费套餐','当月用户流量饱和度','上月用户流量饱和度','上上月用户流量饱和度','是否家庭用户',
'5G流量','终端类型','当月是否抵消保号用户','当月是否换机','居住地是否涵盖5g标识','工作地是否涵盖5g标识']

len(chinese_cols)

45

In [18]:
train.columns=chinese_cols+['label']
test.columns=chinese_cols
train.head(2)

Unnamed: 0,用户标识,用户号码,性别,年龄,星级,在网时长,细分市场,当月arpu,上月arpu,上上月arpu,当月dou,上月dou,上上月dou,当月mou,上月mou,上上月mou,近三月平均arpu,近三月平均dou,近三月平均mou,当月语音超套金额,上月语音超套金额,上上月语音超套金额,当月流量超套金额,上月流量超套金额,上上月流量超套金额,是否本网宽带用户,是否异网宽带用户,宽带带宽,宽带是否激活,宽带捆绑签约标识,终端捆绑签约标识,话费签约标识,套餐签约标识,用户总套餐价值,用户主资费套餐,当月用户流量饱和度,上月用户流量饱和度,上上月用户流量饱和度,是否家庭用户,5G流量,终端类型,当月是否抵消保号用户,当月是否换机,居住地是否涵盖5g标识,工作地是否涵盖5g标识,label
0,2689434779712,26231702691,女士,46.0,3.0,5,大众用户,69.3,69.0,69.0,5650.91,4233.87,3579.13,281.0,297.0,576.0,69.1,4487.97,384.67,0.0,0.0,0.0,0.0,0.0,0.0,1,0,100.0,1.0,1.0,0.0,0.0,1.0,68.0,68.0,0.0,0.0,0.0,1,,1,0,0,0,0,0
1,2697442927197,27358921188,先生,53.0,2.0,1,农村用户,19.2,13.64,12.24,0.02,0.02,0.02,250.0,198.0,370.0,15.03,0.02,272.67,13.2,7.64,6.24,0.0,0.0,0.0,0,0,,,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0,,0,0,0,0,0,0


## 3.4 数据清洗-4C原则
**Correcting, Completing, Creating, and Converting**

1. Correcting(修正)：检查数据，似乎没有任何异常或不可接受的数据输入。 此外，我们发现我们在年龄和票价方面可能存在潜在异常值。 但是，由于它们是合理的值，我们将等到完成探索性分析后再确定是否应从数据集中包括或排除。 应该注意的是，如果它们是不合理的值，例如age = 800而不是80，那么现在修复是一个安全的决定。 但是，当我们从原始值修改数据时，我们要格外小心，因为可能需要创建准确的模型。

2. Completing(填充)：年龄，机舱和出发区域中存在空值或缺少数据。缺少值可能很糟糕，因为某些算法不知道如何处理空值，并且会失败。而其他的（例如决策树）可以处理空值。因此，在开始建模之前进行修复很重要，因为我们将比较和对比多个模型。有两种常用方法，要么删除记录，要么使用合理的输入填充缺失值。不建议删除该记录，尤其是很大比例的记录，除非它确实代表不完整的记录。相反，最好估算缺失的值。定性数据的基本方法是估算使用模式。定量数据的基本方法是使用均值，中位数或均值+随机标准差估算。一种中间方法是根据特定标准使用基本方法。例如按舱位划分的平均年龄，或按票价和SES出发前往港口。有更复杂的方法，但是在部署之前，应将其与基本模型进行比较，以确定复杂性是否真正增加了价值。对于此数据集，年龄将以中位数估算，机舱属性将被删除，登船将以模式估算。随后的模型迭代可能会修改此决策，以确定它是否会提高模型的准确性。

3. Creating(创造)：特征工程是当我们使用现有特征来创建新特征以确定它们是否提供新信号来预测我们的结果时。 对于此数据集，我们将创建标题功能以确定其是否在生存中起作用。
4. Converting(转换)：最后，但同样重要的是，我们将处理格式化。 没有日期或货币格式，但有数据类型格式。 我们的分类数据作为对象导入，这使得进行数学计算变得困难。 对于此数据集，我们会将对象数据类型转换为分类虚拟变量。

In [19]:
print('训练集每列包括空值个数:\n', train.isnull().sum())
print("-"*10)

print('测试集每列包括空值个数:\n', test.isnull().sum())
print("-"*10)

train.describe(include = 'all')


训练集每列包括空值个数:
 用户标识                0
用户号码                0
性别                  0
年龄                  0
星级               6849
在网时长                0
细分市场              691
当月arpu             89
上月arpu             89
上上月arpu            89
当月dou              89
上月dou              89
上上月dou             89
当月mou              89
上月mou              89
上上月mou             89
近三月平均arpu          89
近三月平均dou           89
近三月平均mou           89
当月语音超套金额          392
上月语音超套金额          392
上上月语音超套金额         392
当月流量超套金额          392
上月流量超套金额          392
上上月流量超套金额         392
是否本网宽带用户            0
是否异网宽带用户            0
宽带带宽           101060
宽带是否激活         101060
宽带捆绑签约标识          774
终端捆绑签约标识          774
话费签约标识            774
套餐签约标识            774
用户总套餐价值          6617
用户主资费套餐          6617
当月用户流量饱和度        9215
上月用户流量饱和度        9793
上上月用户流量饱和度       9585
是否家庭用户              0
5G流量           132559
终端类型                0
当月是否抵消保号用户          0
当月是否换机              0
居住地是否涵盖5g标识         0
工作地是否涵盖5g标识       

Unnamed: 0,用户标识,用户号码,性别,年龄,星级,在网时长,细分市场,当月arpu,上月arpu,上上月arpu,当月dou,上月dou,上上月dou,当月mou,上月mou,上上月mou,近三月平均arpu,近三月平均dou,近三月平均mou,当月语音超套金额,上月语音超套金额,上上月语音超套金额,当月流量超套金额,上月流量超套金额,上上月流量超套金额,是否本网宽带用户,是否异网宽带用户,宽带带宽,宽带是否激活,宽带捆绑签约标识,终端捆绑签约标识,话费签约标识,套餐签约标识,用户总套餐价值,用户主资费套餐,当月用户流量饱和度,上月用户流量饱和度,上上月用户流量饱和度,是否家庭用户,5G流量,终端类型,当月是否抵消保号用户,当月是否换机,居住地是否涵盖5g标识,工作地是否涵盖5g标识,label
count,140000.0,140000.0,140000,140000.0,133151.0,140000.0,139309,139911.0,139911.0,139911.0,139911.0,139911.0,139911.0,139911.0,139911.0,139911.0,139911.0,139911.0,139911.0,139608.0,139608.0,139608.0,139608.0,139608.0,139608.0,140000.0,140000.0,38940.0,38940.0,139226.0,139226.0,139226.0,139226.0,133383.0,133383.0,130785.0,130207.0,130415.0,140000.0,7441.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0
unique,,,2,,,,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
top,,,先生,,,,大众用户,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
freq,,,72405,,,,70076,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
mean,2696241000000.0,27778350000.0,,43.295021,2.838627,1.832557,,59.017699,58.757815,58.657151,2033.991338,1925.791467,1973.556651,327.723274,324.866915,335.544425,59.655756,2016.571995,331.371909,4.517615,4.474996,4.662641,2.661679,2.648488,2.539888,0.278143,0.042721,102.984694,1.0,0.279567,0.077277,0.038369,0.461961,49.364792,48.288493,5.997795,5.927695,5.88712,0.615657,1146066000.0,0.940557,0.0099,0.0433,0.133536,0.145086,0.2
std,2635831000.0,1848419000.0,,10.449163,0.908406,3.03598,,53.454124,51.711996,51.824485,2896.411265,2636.664139,2809.655846,374.041076,370.782504,379.987863,48.145713,2307.295639,358.227632,14.647089,14.215736,14.686451,18.796533,18.378959,17.609039,0.448086,0.202229,33.815671,0.0,0.448788,0.267032,0.192087,0.498553,42.61121,42.238787,16.183326,16.000754,15.985361,0.486441,2105618000.0,0.442037,0.099005,0.203532,0.340154,0.352188,0.400001
min,2689412000000.0,25237530000.0,,13.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2697417000000.0,26118900000.0,,35.0,3.0,1.0,,19.69,20.0,20.0,0.11,0.2,0.23,93.0,95.0,99.0,22.01,27.125,110.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,1.0,0.0,0.0,0.0,0.0,18.0,18.0,0.0,0.0,0.0,0.0,40422820.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,2697465000000.0,27414120000.0,,46.0,3.0,1.0,,41.64,42.0,42.0,551.79,580.28,595.75,213.0,212.0,220.0,44.2,1003.25,221.67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,1.0,0.0,0.0,0.0,0.0,28.0,28.0,0.0,0.0,0.0,1.0,350394600.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,2697543000000.0,28214150000.0,,52.0,3.0,1.0,,87.505,88.0,88.0,3248.34,3150.57,3098.16,430.0,424.0,438.0,88.17,3640.895,430.67,1.8,1.9,2.24,0.0,0.0,0.0,1.0,0.0,100.0,1.0,1.0,0.0,0.0,1.0,78.0,68.0,0.0,0.0,0.0,1.0,1327928000.0,1.0,0.0,0.0,0.0,0.0,0.0


### 3.4.1 填充缺失值

*   [pandas.DataFrame](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html)
*   [pandas.DataFrame.info](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.info.html)
*   [pandas.DataFrame.describe](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.describe.html)
*   [Indexing and Selecting Data](https://pandas.pydata.org/pandas-docs/stable/indexing.html)
*   [pandas.isnull](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.isnull.html)
*   [pandas.DataFrame.sum](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sum.html)
*   [pandas.DataFrame.mode](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.mode.html)
*   [pandas.DataFrame.copy](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.copy.html)
*   [pandas.DataFrame.fillna](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.fillna.html)
*   [pandas.DataFrame.drop](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html)
*   [pandas.Series.value_counts](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.value_counts.html)
*   [pandas.DataFrame.loc](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.loc.html)


In [20]:
# 缺失值统计代码块：https://www.kesci.com/home/gist/604c3d1e89c8740015288ec0?Token=d33faafd5e463f2f
def missing_values_table(df):
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
    columns = {0 : 'Missing Values', 1 : '% of Total Values'})
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
    '% of Total Values', ascending=False).round(1)
    print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
        "There are " + str(mis_val_table_ren_columns.shape[0]) +
            " columns that have missing values.")
    return mis_val_table_ren_columns


In [21]:
# 缺失值统计
missing_values_table(train)

Your selected dataframe has 46 columns.
There are 32 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
5G流量,132559,94.7
宽带带宽,101060,72.2
宽带是否激活,101060,72.2
上月用户流量饱和度,9793,7.0
上上月用户流量饱和度,9585,6.8
当月用户流量饱和度,9215,6.6
星级,6849,4.9
用户主资费套餐,6617,4.7
用户总套餐价值,6617,4.7
套餐签约标识,774,0.6


In [22]:
no_features=['用户标识','label']
numberical_cols=[col for col in train.select_dtypes('number').columns if col not in no_features]
categorical_cols=[col for col in train.columns if col not in no_features+numberical_cols]
print(len(no_features),len(categorical_cols),len(numberical_cols))
# numberical_cols

2 2 42


由于列数和存在大量缺失值比较多，我们简单地先通过遍历对每列进行填充

In [23]:
for dataset in [train,test]:  
    # 通过中位数对数值型变量缺失值填充
    for col in  numberical_cols:
        dataset[col].fillna(dataset[col].median(), inplace = True)
    # 通过众数对类别性变量缺失值填充
    for col in categorical_cols:
        dataset[col].fillna(dataset[col].mode()[0], inplace = True)
    

In [24]:
print(train.isnull().sum())
print("-"*10)
print(test.isnull().sum())

用户标识           0
用户号码           0
性别             0
年龄             0
星级             0
在网时长           0
细分市场           0
当月arpu         0
上月arpu         0
上上月arpu        0
当月dou          0
上月dou          0
上上月dou         0
当月mou          0
上月mou          0
上上月mou         0
近三月平均arpu      0
近三月平均dou       0
近三月平均mou       0
当月语音超套金额       0
上月语音超套金额       0
上上月语音超套金额      0
当月流量超套金额       0
上月流量超套金额       0
上上月流量超套金额      0
是否本网宽带用户       0
是否异网宽带用户       0
宽带带宽           0
宽带是否激活         0
宽带捆绑签约标识       0
终端捆绑签约标识       0
话费签约标识         0
套餐签约标识         0
用户总套餐价值        0
用户主资费套餐        0
当月用户流量饱和度      0
上月用户流量饱和度      0
上上月用户流量饱和度     0
是否家庭用户         0
5G流量           0
终端类型           0
当月是否抵消保号用户     0
当月是否换机         0
居住地是否涵盖5g标识    0
工作地是否涵盖5g标识    0
label          0
dtype: int64
----------
用户标识           0
用户号码           0
性别             0
年龄             0
星级             0
在网时长           0
细分市场           0
当月arpu         0
上月arpu         0
上上月arpu        0
当月dou          0
上月dou  

### 3.4.2 类别变量编码

一种思路：对于类别型变量，训练集合测试集可能存在一些交易之外的值，比如训练集中的X5只有农村用户，测试集有大众用户，农村用户，鉴于这种情况，我们可以先将训练集和测试集进行合并，得到每个类别变量的LabelEncoder，然后进行转换

In [25]:
tmp=pd.concat([train,test],axis=0)
# 类别变量：X1和X5
label = LabelEncoder()
for col in ['性别','细分市场']:
    for dataset in [train,test]:
        label.fit(tmp[col])
        dataset[col]=label.transform(dataset[col])

In [26]:
train['性别']

0         1
1         0
2         1
3         0
4         1
         ..
139995    0
139996    0
139997    0
139998    0
139999    1
Name: 性别, Length: 140000, dtype: int64

### 3.4.3 再次检查清洗后的数据

In [27]:
print('训练集每列包括空值个数:\n', train.isnull().sum())
print("-"*10)

print('测试集每列包括空值个数:\n', test.isnull().sum())
print("-"*10)

train.describe(include = 'all')


训练集每列包括空值个数:
 用户标识           0
用户号码           0
性别             0
年龄             0
星级             0
在网时长           0
细分市场           0
当月arpu         0
上月arpu         0
上上月arpu        0
当月dou          0
上月dou          0
上上月dou         0
当月mou          0
上月mou          0
上上月mou         0
近三月平均arpu      0
近三月平均dou       0
近三月平均mou       0
当月语音超套金额       0
上月语音超套金额       0
上上月语音超套金额      0
当月流量超套金额       0
上月流量超套金额       0
上上月流量超套金额      0
是否本网宽带用户       0
是否异网宽带用户       0
宽带带宽           0
宽带是否激活         0
宽带捆绑签约标识       0
终端捆绑签约标识       0
话费签约标识         0
套餐签约标识         0
用户总套餐价值        0
用户主资费套餐        0
当月用户流量饱和度      0
上月用户流量饱和度      0
上上月用户流量饱和度     0
是否家庭用户         0
5G流量           0
终端类型           0
当月是否抵消保号用户     0
当月是否换机         0
居住地是否涵盖5g标识    0
工作地是否涵盖5g标识    0
label          0
dtype: int64
----------
测试集每列包括空值个数:
 用户标识           0
用户号码           0
性别             0
年龄             0
星级             0
在网时长           0
细分市场           0
当月arpu         0
上月arpu         0
上上月arpu      

Unnamed: 0,用户标识,用户号码,性别,年龄,星级,在网时长,细分市场,当月arpu,上月arpu,上上月arpu,当月dou,上月dou,上上月dou,当月mou,上月mou,上上月mou,近三月平均arpu,近三月平均dou,近三月平均mou,当月语音超套金额,上月语音超套金额,上上月语音超套金额,当月流量超套金额,上月流量超套金额,上上月流量超套金额,是否本网宽带用户,是否异网宽带用户,宽带带宽,宽带是否激活,宽带捆绑签约标识,终端捆绑签约标识,话费签约标识,套餐签约标识,用户总套餐价值,用户主资费套餐,当月用户流量饱和度,上月用户流量饱和度,上上月用户流量饱和度,是否家庭用户,5G流量,终端类型,当月是否抵消保号用户,当月是否换机,居住地是否涵盖5g标识,工作地是否涵盖5g标识,label
count,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0
mean,2696241000000.0,27778350000.0,0.482821,43.295021,2.846521,1.832557,1.0961,59.006652,58.747162,58.646562,2033.049082,1924.936106,1972.68076,327.650343,324.795164,335.470971,59.645931,2015.927811,331.30217,4.504965,4.462466,4.649586,2.654226,2.641073,2.532776,0.278143,0.042721,100.830171,1.0,0.278021,0.07685,0.038157,0.459407,48.355,47.329571,5.603012,5.513053,5.484063,0.615657,392684500.0,0.940557,0.0099,0.0433,0.133536,0.145086,0.2
std,2635831000.0,1848419000.0,0.499707,10.449163,0.886591,3.03598,0.974885,53.438925,51.697282,51.809711,2895.731476,2636.04409,2808.977315,373.933345,370.675545,379.878225,48.131983,2306.703535,358.124422,14.628516,14.197789,14.667944,18.770727,18.353744,17.584881,0.448086,0.202229,17.884043,0.0,0.448026,0.266354,0.191576,0.498351,41.838386,41.452687,15.712202,15.504872,15.499911,0.486441,517182700.0,0.442037,0.099005,0.203532,0.340154,0.352188,0.400001
min,2689412000000.0,25237530000.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2697417000000.0,26118900000.0,0.0,35.0,3.0,1.0,0.0,19.7,20.0,20.0,0.11,0.2,0.24,93.0,95.0,99.0,22.03,27.28,110.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,1.0,0.0,0.0,0.0,0.0,18.0,18.0,0.0,0.0,0.0,0.0,350394600.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,2697465000000.0,27414120000.0,0.0,46.0,3.0,1.0,1.0,41.64,42.0,42.0,551.79,580.28,595.75,213.0,212.0,220.0,44.2,1003.25,221.67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,1.0,0.0,0.0,0.0,0.0,28.0,28.0,0.0,0.0,0.0,1.0,350394600.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,2697543000000.0,28214150000.0,1.0,52.0,3.0,1.0,1.0,87.43,88.0,88.0,3245.425,3147.7025,3096.8175,430.0,424.0,437.0,88.13,3639.6075,430.33,1.8,1.9,2.18,0.0,0.0,0.0,1.0,0.0,100.0,1.0,1.0,0.0,0.0,1.0,78.0,68.0,0.0,0.0,0.0,1.0,350394600.0,1.0,0.0,0.0,0.0,0.0,0.0
max,2697653000000.0,32182200000.0,1.0,59.0,7.0,16.0,3.0,1895.24,1000.0,1392.65,23299.69,23697.19,23503.06,7258.0,9230.0,7212.0,1000.0,15777.36,6895.33,786.6,536.58,523.44,618.0,504.38,930.0,1.0,1.0,200.0,1.0,1.0,1.0,1.0,1.0,608.0,598.0,99.99,99.99,99.99,1.0,73069140000.0,2.0,1.0,1.0,1.0,1.0,1.0


# 第四步 探索性数据分析-EDA 
EDA即Exploratory Data Analysis，数据探索性分析，我们分下下每个变量的分布以及与标签的相关性


现在，我们的数据已清理完毕，我们将使用描述性和图形统计数据来探索数据，以描述和总结变量的特性和特点。 在这个步骤，除了特征可视化之外，我们需要最大限度地观察它们与目标变量以及彼此之间的相关性。

## 4.1 类散变量与目标变量的相关性

In [28]:
# 类散变量与目标变量的相关性

for x in train:
    if train[x].dtype != 'float64'  and x not in no_features:
        print('5G 相关性 :', x)
        print(train[[x, 'label']].groupby(x, as_index=False).mean())
        print('-'*10, '\n')

#使用 crosstabs方法: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.crosstab.html
print(pd.crosstab(train['性别'],train['label']))

5G 相关性 : 用户号码
               用户号码  label
0       25237533946      0
1       25237553147      1
2       25237565905      0
3       25237568475      0
4       25237598239      0
...             ...    ...
139995  32182026222      0
139996  32182038831      0
139997  32182113427      0
139998  32182160449      1
139999  32182199532      0

[140000 rows x 2 columns]
---------- 

5G 相关性 : 性别
   性别     label
0   0  0.231393
1   1  0.166373
---------- 

5G 相关性 : 在网时长
    在网时长     label
0      0  0.243590
1      1  0.189192
2      2  0.231437
3      3  0.288462
4      4  0.262963
5      5  0.271739
6      6  0.277574
7      7  0.278970
8      8  0.235457
9      9  0.286089
10    10  0.270358
11    11  0.297297
12    12  0.313811
13    13  0.300412
14    14  0.319101
15    15  0.328557
16    16  0.437373
---------- 

5G 相关性 : 细分市场
   细分市场     label
0     0  0.145888
1     1  0.204615
2     2  0.119964
3     3  0.313937
---------- 

5G 相关性 : 是否本网宽带用户
   是否本网宽带用户     label
0         0  0.112468
1

## 4.2 数值变量分箱和直方图分布

In [29]:
# 数值变量分布
plt.figure(figsize=[16,12])

plt.subplot(231)
plt.boxplot(x=train['在网时长'], showmeans = True, meanline = True)
plt.title('在网时长 Boxplot')
plt.ylabel('在网时长')

plt.subplot(232)
plt.boxplot(x=train['年龄'], showmeans = True, meanline = True)
plt.title('年龄 Boxplot')
plt.ylabel('年龄 (年)')


plt.subplot(233)
plt.boxplot(train['宽带带宽'], showmeans = True, meanline = True)
plt.title('宽带带宽 Boxplot')
plt.ylabel('宽带带宽 (#)')

plt.subplot(234)
plt.hist(x = [train[train['label']==1]['在网时长'], train[train['label']==0]['在网时长']], 
         stacked=True, color = ['g','r'],label = ['label','5g'])
plt.title('在网时长 Histogram by 5g user')
plt.xlabel('在网时长 ($)')
plt.ylabel('# of Passengers')
plt.legend()

plt.subplot(235)
plt.hist(x = [train[train['label']==1]['年龄'], train[train['label']==0]['年龄']], 
         stacked=True, color = ['g','r'],label = ['label','5g'])
plt.title('年龄 Histogram by label')
plt.xlabel('年龄 (Years)')
plt.ylabel('# of Passengers')
plt.legend()


plt.subplot(236)
plt.hist(x = [train[train['label']==1]['宽带带宽'], train[train['label']==0]['宽带带宽']], 
         stacked=True, color = ['g','r'],label = ['label','5g'])
plt.title('宽带带宽 Histogram by label')
plt.xlabel('宽带带宽 (#)')
plt.ylabel('# of Passengers')
plt.legend()

<matplotlib.legend.Legend at 0x7fed96f43490>

## 4.3 离散变量的统计树状图

In [30]:
# 离散变量
fig, saxis = plt.subplots(2, 3,figsize=(16,12))

sns.barplot(x = '性别', y = 'label', data=train, ax = saxis[0,0])
sns.barplot(x = '星级', y = 'label', order=[1,2,3], data=train, ax = saxis[0,1])
sns.barplot(x = '细分市场', y = 'label', order=[1,0], data=train, ax = saxis[0,2])

sns.pointplot(x = '终端类型', y = 'label',  data=train, ax = saxis[1,0])
sns.pointplot(x = '是否本网宽带用户', y = 'label',  data=train, ax = saxis[1,1])
sns.pointplot(x = '当月是否换机', y = 'label', data=train, ax = saxis[1,2])

<matplotlib.axes._subplots.AxesSubplot at 0x7fed99b364c0>

## 4.4 是否成为5g用户的变量分析

In [31]:
fig, (axis1,axis2,axis3) = plt.subplots(1,3,figsize=(14,12))

sns.boxplot(x = '细分市场', y = '在网时长', hue = 'label', data = train, ax = axis1)
axis1.set_title('细分市场 vs 在网时长 label 对比')

sns.violinplot(x = '细分市场', y = '年龄', hue = 'label', data = train, split = True, ax = axis2)
axis2.set_title('细分市场 vs 年龄 label 对比')

sns.boxplot(x = '细分市场', y ='当月语音超套金额', hue = 'label', data = train, ax = axis3)
axis3.set_title('细分市场 vs 当月语音超套金额  label 对比')


Text(0.5, 1.0, '细分市场 vs 当月语音超套金额  label 对比')

In [32]:
fig, qaxis = plt.subplots(1,3,figsize=(14,12))

sns.barplot(x = '性别', y = 'label', hue = '是否家庭用户', data=train, ax = qaxis[0])
axis1.set_title('性别 vs 是否家庭用户 label 对比')

sns.barplot(x = '性别', y = 'label', hue = '终端类型', data=train, ax  = qaxis[1])
axis1.set_title('性别 vs 终端类型 label 对比')

sns.barplot(x = '性别', y = 'label', hue = '居住地是否涵盖5g标识', data=train, ax  = qaxis[2])
axis1.set_title('性别 vs 居住地是否涵盖5g标识 label 对比')


Text(0.5, 1.0, '性别 vs 居住地是否涵盖5g标识 label 对比')

In [33]:
fig, (maxis1, maxis2) = plt.subplots(1, 2,figsize=(14,12))

#how does 细分市场 factor with 性别 & label compare
sns.pointplot(x="细分市场", y="label", hue="性别", data=train,
              palette={1: "blue", 0: "pink"},
              markers=["*", "o"], linestyles=["-", "--"], ax = maxis1)

#how does 上网市场 factor with 性别 & label compare
sns.pointplot(x="在网时长", y="label", hue="性别", data=train,
              palette={1: "blue", 0: "pink"},
              markers=["*", "o"], linestyles=["-", "--"], ax = maxis2)


<matplotlib.axes._subplots.AxesSubplot at 0x7fed96bf7490>

In [34]:
#how does 终端类型  factor with 细分市场, 性别, and label compare

e = sns.FacetGrid(train, col = '终端类型')
e.map(sns.pointplot, '细分市场', 'label', '性别', ci=95.0, palette = 'deep')
e.add_legend()


<seaborn.axisgrid.FacetGrid at 0x7fed96b185b0>

In [35]:
a = sns.FacetGrid( train, hue = 'label', aspect=4 )
a.map(sns.kdeplot, '年龄', shade= True )
a.set(xlim=(0 , train['年龄'].max()))
a.add_legend()

<seaborn.axisgrid.FacetGrid at 0x7fed99a8fd00>

In [36]:
h = sns.FacetGrid(train, row = '性别', col = '细分市场', hue = 'label')
h.map(plt.hist, '年龄', alpha = .75)
h.add_legend()


<seaborn.axisgrid.FacetGrid at 0x7fed922554c0>

In [37]:
pp = sns.pairplot(train[['性别','年龄','星级','在网时长','细分市场','label']], hue = 'label', palette = 'deep', size=1.2, diag_kind = 'kde', diag_kws=dict(shade=True), plot_kws=dict(s=10) )
pp.set(xticklabels=[])


<seaborn.axisgrid.PairGrid at 0x7fed91ebb490>

In [38]:
#相关性系数热力图
def correlation_heatmap(df):
    _ , ax = plt.subplots(figsize =(20, 20))
    colormap = sns.diverging_palette(220, 10, as_cmap = True)
    
    _ = sns.heatmap(
        df.corr(), 
        cmap = colormap,
        square=True, 
        cbar_kws={'shrink':.9 }, 
        ax=ax,
        annot=True, 
        linewidths=0.1,vmax=1.0, linecolor='white',
        annot_kws={'fontsize':10 }
    )
    
    plt.title('Pearson Correlation of Features', y=1.05, size=15)

correlation_heatmap(train)

# 第五步 构建和训练模型
在构建和训练模型之前，我们再回顾下几个概念：

**数据科学**是**数学**（即统计，线性代数等），**计算机科学**（即编程语言，计算机系统等）和**业务管理**（即通信，领域知识等）组成的**交叉学科**。 大多数数据科学家都来自三个领域之一，因此他们倾向于遵循该学科。 但是，数据科学就像一条三腿凳，一条腿比另一条腿重要。 因此，此步骤将需要数学方面的高级知识。 但是不用担心，我们现在只需要一个总体的概述，我们将在本Notebook中进行介绍。 同样，在人工智能和相关计算机技术的帮助下，许多繁重的工作我们自己都可以完成。 因此，曾经需要数学或统计学专业知识才能解决的问题，现在只需要几行代码就可以完成。 最后，我们需要一些商业头脑来思考问题。 毕竟，就像训练导盲犬一样，它是向我们学习，而不是相反。


顾名思义，机器学习（ML）正在教授机器如何思考而不是思考。尽管这个话题和大数据已经存在了数十年，但它却比以往任何时候都更加受欢迎，因为对于企业和专业人士而言，进入门槛都更低。这是好是坏。很好，因为这些算法现在可供更多人使用，可以解决现实世界中的更多问题。不好是因为较低的进入门槛意味着，更多的人将不知道他们使用的工具，并且可能得出错误的结论。因此，我专注于教您，不仅教您做什么，而且教您为什么做。以前，我曾用过一个比喻，就是要求某人递给您一把Philip螺丝刀，然后他们递给您一字螺丝刀，甚至更糟的是锤子。充其量说，它完全缺乏了解。最坏的情况是，它使完成项目变得不可能。甚至更糟的是，会实施错误的可操作情报。因此，既然我已经敲定了（没有双关语），我将向您展示该怎么做，最重要的是，为什么您要这样做。



首先，我们必须了解，机器学习的目的是解决人类问题。机器学习可分为：监督学习，无监督学习和强化学习。监督学习是通过向模型提供包含正确答案的训练数据集来训练模型的地方。在无监督学习中，您可以使用不包含正确答案的训练数据集来训练模型。强化学习是前两种方法的混合，在这种情况下，模型不会立即得到正确答案，而是在一系列事件之后才得到正确的学习。我们正在进行有监督的机器学习，因为我们正在通过向算法展示一组功能及其相应的目标来训练我们的算法。然后，我们希望从相同的数据集中为它提供一个新的子集，并且在预测准确性方面具有相似的结果。

机器学习算法有很多，但是根据目标变量和数据建模目标的不同，它们可分为四类：分类，回归，聚类或降维。我们将保留聚类和降维的另一天，并将重点放在分类和回归上。我们可以概括地说，连续目标变量需要回归算法，而离散目标变量则需要分类算法。旁注，逻辑回归虽然名称上具有回归，但实际上是一种分类算法。由于我们的问题是预测乘客是否幸存下来，因此这是一个离散的目标变量。我们将使用sklearn库中的分类算法来开始我们的分析。我们将使用交叉验证和评分指标（在后面的部分中进行讨论）来对算法的性能进行排名和比较。

In [39]:
no_features

['用户标识', 'label']

In [40]:
features=[col for    col in train.columns  if  col not in no_features]

## 5.1 模型构建

下面给大家列举了很多常见的机器学习算法，如果需要了解原理的话，可以去查找一些论文、博客等，这里我们就抛砖引玉，简单地通过scikit-learn来调用实现这些算法。如果等待时间长的话，可以减少一些算法。

In [41]:
MLA = [
    #集成方法
    ensemble.AdaBoostClassifier(),
    # ensemble.BaggingClassifier(),
    # ensemble.ExtraTreesClassifier(),
    # ensemble.GradientBoostingClassifier(),
    # ensemble.RandomForestClassifier(),

    #高斯过程
    # gaussian_process.GaussianProcessClassifier(),
    
    #非线性分类器
    linear_model.LogisticRegressionCV(),
    # linear_model.PassiveAggressiveClassifier(),
    # linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),
    
    #贝叶斯
    # naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    
    #KNN算法
    neighbors.KNeighborsClassifier(),
    
    #支持向量机-SVM
    # svm.SVC(probability=True),
    # svm.NuSVC(probability=True),
    svm.LinearSVC(),
    
    #决策树模型    
    tree.DecisionTreeClassifier(),
    # tree.ExtraTreeClassifier(),
    
    #奇异值分析
    # discriminant_analysis.LinearDiscriminantAnalysis(),
    # discriminant_analysis.QuadraticDiscriminantAnalysis(),

    
    #xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    XGBClassifier()    
    ]

## 5.2 模型训练

In [42]:

cv_split = model_selection.ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0 ) # run model 10x with 60/30 split intentionally leaving out 10%

#create table to compare MLA metrics
MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD' ,'MLA Time']
MLA_compare = pd.DataFrame(columns = MLA_columns)

#create table to compare MLA predictions
train['pred']=-1
MLA_predict = train['pred']

#index through MLA and save performance to table
row_index = 0
for alg in MLA:

    #set name and parameters
    MLA_name = alg.__class__.__name__
    print(MLA_name)
    MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
    cv_results = model_selection.cross_validate(alg, train[features], 
    train['label'], 
    cv  = cv_split,
    return_train_score=True)

    MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
    MLA_compare.loc[row_index, 'MLA Train Accuracy Mean'] = cv_results['train_score'].mean()
    MLA_compare.loc[row_index, 'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()   
    #if this is a non-bias random sample, then +/-3 standard deviations (std) from the mean, should statistically capture 99.7% of the subsets
    MLA_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results['test_score'].std()*3   #let's know the worst that can happen!
    

    #save MLA predictions - see section 6 for usage
    alg.fit(train[features], train['label'])
    MLA_predict[MLA_name] = alg.predict(train[features])
    
    row_index+=1

MLA_compare.sort_values(by = ['MLA Test Accuracy Mean'], ascending = False, inplace = True)
MLA_compare
#MLA_predict

AdaBoostClassifier
LogisticRegressionCV
SGDClassifier
Perceptron
GaussianNB
KNeighborsClassifier
LinearSVC
DecisionTreeClassifier
XGBClassifier


Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy Mean,MLA Test Accuracy Mean,MLA Test Accuracy 3*STD,MLA Time
8,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...",0.908498,0.876364,0.00388391,16.0529
0,AdaBoostClassifier,"{'algorithm': 'SAMME.R', 'base_estimator': Non...",0.86448,0.863643,0.00453973,7.64479
7,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",1.0,0.81271,0.00476721,2.50674
1,LogisticRegressionCV,"{'Cs': 10, 'class_weight': None, 'cv': None, '...",0.799501,0.799938,0.00337294,10.9933
4,GaussianNB,"{'priors': None, 'var_smoothing': 1e-09}",0.798244,0.798707,0.00260828,0.108469
5,KNeighborsClassifier,"{'algorithm': 'auto', 'leaf_size': 30, 'metric...",0.835029,0.776748,0.00333216,11.9863
6,LinearSVC,"{'C': 1.0, 'class_weight': None, 'dual': True,...",0.739802,0.740024,0.540606,20.531
3,Perceptron,"{'alpha': 0.0001, 'class_weight': None, 'early...",0.739819,0.739821,0.541144,0.278014
2,SGDClassifier,"{'alpha': 0.0001, 'average': False, 'class_wei...",0.619745,0.619498,0.825071,7.53094


In [43]:
cv_results

{'fit_time': array([16.05114555, 16.12055707, 16.22180748, 15.99050403, 15.91929531,
        15.91836023, 16.33009815, 16.03341937, 15.92039204, 16.02299118]),
 'score_time': array([0.07120895, 0.06696987, 0.03545213, 0.03017664, 0.06203675,
        0.07413411, 0.07006216, 0.07162356, 0.03145766, 0.06702137]),
 'test_score': array([0.87766667, 0.87483333, 0.87592857, 0.87347619, 0.87771429,
        0.87678571, 0.87604762, 0.87728571, 0.87747619, 0.87642857]),
 'train_score': array([0.90727381, 0.90942857, 0.90930952, 0.9089881 , 0.90994048,
        0.90796429, 0.90957143, 0.9079881 , 0.90735714, 0.90715476])}

## 5.2 模型效果

In [44]:
#柱状图  https://seaborn.pydata.org/generated/seaborn.barplot.html
sns.barplot(x='MLA Test Accuracy Mean', y = 'MLA Name', data = MLA_compare, color = 'm')

#pyplot 美化: https://matplotlib.org/api/pyplot_api.html
plt.title('Machine Learning Algorithm Accuracy Score \n')
plt.xlabel('Accuracy Score (%)')
plt.ylabel('Algorithm')


Text(0, 0.5, 'Algorithm')

In [47]:
MLA_predict

0                                                                        -1
1                                                                        -1
2                                                                        -1
3                                                                        -1
4                                                                        -1
                                                ...                        
GaussianNB                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
KNeighborsClassifier      [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
LinearSVC                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
DecisionTreeClassifier    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
XGBClassifier             [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
Name: pred, Length: 140009, dtype: object

# 第六步 模型集成

In [None]:
vote_est = [
    #Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html
    ('ada', ensemble.AdaBoostClassifier()),
    # ('bc', ensemble.BaggingClassifier()),
    # ('etc',ensemble.ExtraTreesClassifier()),
    # ('gbc', ensemble.GradientBoostingClassifier()),
    # ('rfc', ensemble.RandomForestClassifier()),

    #Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc
    # ('gpc', gaussian_process.GaussianProcessClassifier()),
    
    #GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
    ('lr', linear_model.LogisticRegressionCV()),
    
    #Navies Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html
    # ('bnb', naive_bayes.BernoulliNB()),
    ('gnb', naive_bayes.GaussianNB()),
    
    #Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html
    ('knn', neighbors.KNeighborsClassifier()),
    
    #SVM: http://scikit-learn.org/stable/modules/svm.html
    ('svc', svm.SVC(probability=True)),
    ('xgb', XGBClassifier())

]


#多数投票
vote_hard = ensemble.VotingClassifier(estimators = vote_est , voting = 'hard')
vote_hard_cv = model_selection.cross_validate(vote_hard, train[features], train['label'], cv  = cv_split)
vote_hard.fit(train[features], train['label'])

print("Hard Voting Training w/bin score mean: {:.2f}". format(vote_hard_cv['train_score'].mean()*100)) 
print("Hard Voting Test w/bin score mean: {:.2f}". format(vote_hard_cv['test_score'].mean()*100))
print("Hard Voting Test w/bin score 3*std: +/- {:.2f}". format(vote_hard_cv['test_score'].std()*100*3))
print('-'*10)


#权重投票
vote_soft = ensemble.VotingClassifier(estimators = vote_est , voting = 'soft')
vote_soft_cv = model_selection.cross_validate(vote_soft, train[features]], train['label'], cv  = cv_split)
vote_soft.fit(train[features], train['label'])

print("Soft Voting Training w/bin score mean:{:.2f}". format(vote_soft_cv['train_score'].mean()*100)) 
print("Soft Voting Test w/bin score mean: {:.2f}". format(vote_soft_cv['test_score'].mean()*100))
print("Soft Voting Test w/bin score 3*std: +/- {:.2f}". format(vote_soft_cv['test_score'].std()*100*3))
print('-'*10)


# 第七步 模型优化

- 特征工程
- 超参数优化

In [None]:
def score_distribution_kde(num_col,cat_col,title,dataset):
    fig = plt.figure(figsize=(10, 7))
    gs = fig.add_gridspec(6, 5)
    ax = fig.add_subplot(gs[:5,:])

    sns.kdeplot(x=num_col, hue=cat_col, data=dataset,
                hue_order=sorted(dataset[cat_col].unique()),
                bw_adjust=0.4,
                fill=True,ax=ax)
    
    plt.title(title,fontsize='large',fontweight='bold')

    sub_axes = [None] * 5
    for idx, group in enumerate(sorted(dataset[cat_col].unique())):
        sub_axes[idx] = fig.add_subplot(gs[5,idx])
        sns.kdeplot(x=num_col, data=dataset,
                    alpha=0.2, 
                  color= white_color,
                  linewidth=0.7, 
                  label=group, fill=True, bw_adjust=0.4,
                  zorder=5, ax=sub_axes[idx]
                  )

        sns.kdeplot(x=num_col, data=dataset[dataset[cat_col]==group],
                    alpha=0.6, 
                      color= dark_palette[idx],
                      linewidth=0.5, 
                      label=group, fill=True,bw_adjust=0.4,
                      zorder=10, ax=sub_axes[idx]
                  )
        cnt = (dataset[cat_col]==group).sum()
        sub_axes[idx].set_xticks([])
        sub_axes[idx].set_yticks([])
        sub_axes[idx].set_xlabel('')
        sub_axes[idx].set_ylabel('')

    ax.set_title(num_col.capitalize(), loc='left', fontweight='bold', fontsize=13)
    fig.tight_layout()
    plt.show()


In [None]:
# score_distribution_kde('X2','X1',"用户不同性别的年龄分布",train)

In [None]:
# X5 用户市场：普通用户
score_distribution_kde('X2','X5',"不同用户市场的年龄分布",train.dropna(subset=['X5']))

In [None]:
# sns.pairplot(train, hue='label', corner=True)