<a href="https://colab.research.google.com/github/All-Natural/python/blob/master/General%E9%80%9A%E8%AF%86.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Employee resignation analysis

1. 进行数据展示
2. 简单的EDA分析(更细节的分析在单独的EDA分析中)
3. 数据预处理建模
4. 算法对比（LR, RF, LGBM...）(这个在另附一篇ML算法的对比上)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
import os
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Load the raw "IBM HR DATA" export and plot the class balance of its
# Attrition column, so it can be compared against the 0/1 coding used in
# the "IBM HR Data (Edited)" file.
dataoriginal = pd.read_csv('IBM HR DATA.csv')
# Series.value_counts replaces the deprecated top-level pd.value_counts;
# sort_index orders the bars by class label rather than by frequency.
attrition_classes = dataoriginal["Attrition"].value_counts().sort_index()
attrition_classes.plot(kind = "bar")
plt.xticks(rotation=0)
plt.title("Attrition Show")
plt.show()

In [None]:
# Load the edited dataset; `data` is the frame the following cells analyse.
data = pd.read_csv("IBM HR Data (Edited).csv")
data.head()

In [None]:
data.shape
# Author's note from the original run: 23,423 samples in total, 28 features each
# (not re-verified here — confirm against the output above).

In [None]:
# Print each column's dtype and non-null count.
data.info()

#### Missing_value

In [None]:
def missing_value_table(df):
    """Summarise the missing values of a DataFrame, column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one missing value,
        indexed by column name, with the columns ``"Missing values"``
        (count) and ``"% of total values"`` (share of rows, 0-100).
        Rows are sorted by the percentage in descending order. Columns
        without missing values are omitted, so the result is empty when
        nothing is missing (including when ``df`` has no rows).
    """
    # Count missing entries once and derive the percentage from that count.
    missing_counts = df.isnull().sum()
    missing_percent = 100 * missing_counts / len(df)

    # Put both measures side by side, labelled via `keys`.
    table = pd.concat(
        [missing_counts, missing_percent],
        axis=1,
        keys=["Missing values", "% of total values"],
    )

    # Keep only columns that really have missing values. Filtering on the
    # count (not on "percent != 0") matters for a zero-row frame, where the
    # percentage is NaN and NaN != 0 would wrongly keep every column.
    table = table[table["Missing values"] > 0]

    return table.sort_values("% of total values", ascending=False)

In [None]:
# Show the (up to) ten columns with the largest share of missing values.
missing_value_table(data).head(10)

#### Object

In [None]:
# Count how many columns there are of each dtype.
data.dtypes.value_counts()

In [None]:
# Number of distinct values in each object-typed column.
object_columns = data.select_dtypes('object')
object_columns.apply(pd.Series.nunique, axis=0)

In [None]:
# One-hot encode the categorical columns. dtype=int keeps the dummy columns
# as 0/1 integers on every pandas version (pandas >= 2.0 would otherwise
# emit True/False), which matches the 0/1 coding that the later cells of
# this notebook describe (e.g. OverTime_Yes).
data = pd.get_dummies(data, dtype=int)
data.shape

In [None]:
# Label-encode every object column that has at most two distinct values.
# NOTE(review): the preceding cell already ran pd.get_dummies on `data`, so
# presumably no object columns remain and this loop changes nothing — confirm
# whether it was meant to run before the one-hot encoding.
le = LabelEncoder()
for col in data:
    column = data[col]
    if column.dtypes != "object":
        continue
    if len(list(column.unique())) > 2:
        continue
    data[col] = le.fit_transform(column)

### EDA

#### Attrition Show

In [None]:
# Bar chart of the Attrition class balance in the encoded data.
# Series.value_counts replaces the deprecated top-level pd.value_counts;
# sort_index orders the bars by class label rather than by frequency.
attrition_classes = data["Attrition"].value_counts().sort_index()
attrition_classes.plot(kind = "bar")
plt.xticks(rotation=0)
plt.title("Attrition Show")
plt.show()

In [None]:
# Number of rows in each Attrition class, ordered by class label
# (Series.value_counts replaces the deprecated pd.value_counts).
data["Attrition"].value_counts().sort_index()

#### 0代表工作，1代表辞职

In [None]:
# Correlation coefficient of every column with Attrition, sorted ascending.
# Coefficients lie in -1 <= r <= 1: values near +1 indicate a strong positive
# linear relationship, values near -1 a strong negative one, and values near 0
# little linear relationship.
correlations = data.corr()['Attrition'].sort_values()
correlations.head()

In [None]:
# Last six entries of the ascending sort, i.e. the largest coefficients
# (Attrition's correlation with itself is 1.0); sort_values places any NaN
# coefficients after them, at the very end.
correlations.tail(6)

从相关系数可以看到，OverTime_Yes 与 Attrition（是否辞职）的正相关最强，其次是 DistanceFromHome、NumCompaniesWorked、HourlyRate。注意：相关系数只反映线性相关的程度，并不能直接说明因果关系。

In [None]:
# Look at the distribution of 'OverTime' in the raw data, the feature the
# correlation ranking above singles out.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
OverTime_classes = dataoriginal["OverTime"].value_counts().sort_index()
OverTime_classes.plot(kind = "bar")
plt.xticks(rotation=0)
plt.title("OverTime Show")
plt.show()

In [None]:
# Number of rows per value of the OverTime_Yes dummy, ordered by value
# (Series.value_counts replaces the deprecated pd.value_counts).
data["OverTime_Yes"].value_counts().sort_index()

#### 1代表加班，0代表不加班

In [None]:
plt.figure(figsize = (10,8))
# Kernel density estimate of the OverTime_Yes dummy, drawn separately for
# each Attrition class.
sns.kdeplot(data.loc[data['Attrition']==0, 'OverTime_Yes'] , label = 'Attrition == 0')
sns.kdeplot(data.loc[data['Attrition']==1, 'OverTime_Yes'] , label = 'Attrition == 1')
plt.xlabel('OverTime_Yes')
# Newer seaborn releases do not draw a legend for `label=` on their own;
# without it the two curves cannot be told apart.
plt.legend()
plt.grid()
plt.show()

从图中可以看到，已辞职的员工中加班与不加班的比例相差不大；而仍在职的员工中，不加班的占绝大多数，即不加班的员工更倾向于留任。

In [None]:
# Restrict the data to the target plus three selected features and compute
# their pairwise correlation matrix.
ext_columns = ['Attrition','NumCompaniesWorked', 'DistanceFromHome', 'OverTime_Yes']
ext_data = data[ext_columns]
ext_data_corrs = ext_data.corr()
ext_data_corrs

In [None]:
# Heatmap of the pairwise correlations, annotated with the coefficient values.
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(ext_data_corrs, cmap=plt.cm.RdYlBu_r, annot=True, ax=ax)
plt.show()

In [None]:
plt.figure(figsize = (10,8))
for i,source in enumerate (['NumCompaniesWorked', 'DistanceFromHome', 'OverTime_Yes']):
    # One subplot per feature, stacked vertically (3 rows, 1 column).
    plt.subplot(3,1,i+1)
    # Density of the feature, drawn separately for each Attrition class.
    sns.kdeplot(data.loc[data['Attrition']==0,source],label='Attrition==0')
    sns.kdeplot(data.loc[data['Attrition']==1,source],label='Attrition==1')
    plt.title('D of %s' % source)
    # Newer seaborn releases do not draw a legend for `label=` on their own;
    # add it explicitly so the two curves can be told apart.
    plt.legend()
    plt.grid()
plt.tight_layout(h_pad=2.5)
plt.show()

还有更多的EDA分析在另一篇EDA分析中

### 数据预处理

In [None]:
# SimpleImputer was used below without being imported anywhere in the
# notebook, so this cell raised NameError on a fresh kernel.
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns.
label = data['Attrition']
train = data.drop (columns = ['Attrition'])

# 70/30 train/test split; the fixed random_state makes the split reproducible.
train,test,y_train,y_test = train_test_split(train, label, test_size = 0.3, random_state = 0)
features = list(train.columns)  # feature names, captured before the frames become arrays

imputer = SimpleImputer(strategy = 'median')  # fill missing values with the column median
std = StandardScaler()  # standardise each feature

# Impute missing values: fit on the training split only, then apply the
# same medians to both splits.
imputer.fit(train)
train = imputer.transform(train)
test = imputer.transform(test)

# Standardise: again fit on the training split only and reuse its
# statistics for the test split.
std.fit(train)
train = std.transform(train)
test = std.transform(test)

In [None]:
# (rows, columns) of the imputed and standardised test matrix.
test.shape

#### 基础模型：逻辑回归

In [None]:
from sklearn.linear_model import LogisticRegression
# In scikit-learn, C is the INVERSE of the regularization strength, so a
# value as small as 0.0001 applies a very strong penalty.
log_reg = LogisticRegression(C=0.0001)
log_reg.fit(train,y_train)

In [None]:
# Predict on the test set. Column 1 of predict_proba is the probability of the
# second class in log_reg.classes_ — presumably Attrition == 1 when the labels
# are 0/1; confirm against log_reg.classes_.
predictions = log_reg.predict_proba(test)[:,1]
predictions[:5]  # first five values; these are probabilities, not class labels

In [None]:
from sklearn.metrics import roc_auc_score
# y_test holds the true labels, predictions the predicted probabilities.
test_auc = roc_auc_score(y_test,predictions)
# ROC AUC measures how well the scores rank positives above negatives (it is
# not accuracy): 0.5 is chance level, and the closer to 1 the better.
test_auc

### 机器学习算法对比

具体的算法对比在另一篇里面