In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
raw = pd.read_csv('train.csv')

In [3]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1176 entries, 0 to 1175
Data columns (total 36 columns):
user_id                     1176 non-null int64
Age                         1176 non-null int64
Attrition                   1176 non-null object
BusinessTravel              1176 non-null object
DailyRate                   1176 non-null int64
Department                  1176 non-null object
DistanceFromHome            1176 non-null int64
Education                   1176 non-null int64
EducationField              1176 non-null object
EmployeeCount               1176 non-null int64
EmployeeNumber              1176 non-null int64
EnvironmentSatisfaction     1176 non-null int64
Gender                      1176 non-null object
HourlyRate                  1176 non-null int64
JobInvolvement              1176 non-null int64
JobLevel                    1176 non-null int64
JobRole                     1176 non-null object
JobSatisfaction             1176 non-null int64
MaritalStatus          

In [4]:
# delete IDs and attributes with consistant values
raw.drop(columns=['user_id','EmployeeNumber','EmployeeCount','Over18','StandardHours'],inplace=True)

In [5]:
# transfer data
def attrition_new(status):
    if status=='No':
        return 0
    else:
        return 1
    
raw['Attrition']=raw['Attrition'].astype(str).apply(attrition_new)

In [6]:
def BusinessTravel_new(status):
    if status=='Non-Travel':
        return 0
    elif status=='Travel_Rarely':
        return 1
    else:
        return 2
    
raw['BusinessTravel']=raw['BusinessTravel'].astype(str).apply(BusinessTravel_new)

In [7]:
def overtime_new(status):
    if status=='No':
        return 0
    else:
        return 1
    
raw['OverTime']=raw['OverTime'].astype(str).apply(overtime_new)

In [8]:
# convert categories to dummies data
convert_to_dummies=['Department','EducationField','Gender','JobRole','MaritalStatus']

In [9]:
raw = pd.get_dummies(raw, prefix=convert_to_dummies, columns=convert_to_dummies,drop_first=True)

In [10]:
x = raw.drop('Attrition', axis=1)
y = raw['Attrition']

In [11]:
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.20, random_state=33)

In [12]:
ss = preprocessing.StandardScaler()
train_ss_x = ss.fit_transform(train_x)
test_ss_x = ss.transform(test_x)

In [13]:
# Method 1:  decision tree
clf = DecisionTreeClassifier(criterion='gini')
clf.fit(train_ss_x, train_y)
predict_y = clf.predict(test_ss_x)
print('LR准确率: %0.4lf' % accuracy_score(predict_y, test_y))

LR准确率: 0.7712


In [14]:
# Feature importance
features = list(train_x.columns)
importance = sorted(zip(features, clf.feature_importances_), key=lambda x:x[1], reverse=True)
for feature, val in importance:
    print('{0:10s} | {1:.5f}'.format(feature, val))

TotalWorkingYears | 0.09715
MonthlyIncome | 0.08788
HourlyRate | 0.08605
TrainingTimesLastYear | 0.05957
DistanceFromHome | 0.05693
MaritalStatus_Single | 0.04831
EnvironmentSatisfaction | 0.04791
DailyRate  | 0.04755
MonthlyRate | 0.04328
BusinessTravel | 0.04301
Age        | 0.04128
OverTime   | 0.03487
YearsAtCompany | 0.03256
RelationshipSatisfaction | 0.03192
YearsSinceLastPromotion | 0.02980
StockOptionLevel | 0.02943
Department_Sales | 0.02941
NumCompaniesWorked | 0.02813
WorkLifeBalance | 0.01584
EducationField_Medical | 0.01548
JobInvolvement | 0.01372
JobSatisfaction | 0.01184
Gender_Male | 0.01165
MaritalStatus_Married | 0.01009
JobRole_Sales Executive | 0.00888
JobRole_Laboratory Technician | 0.00885
EducationField_Technical Degree | 0.00721
Education  | 0.00592
YearsInCurrentRole | 0.00592
Department_Research & Development | 0.00388
JobRole_Sales Representative | 0.00330
EducationField_Marketing | 0.00175
YearsWithCurrManager | 0.00038
PercentSalaryHike | 0.00025
JobLevel 

In [15]:
# Method 2: Random Forest
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=600)
rfc.fit(train_ss_x, train_y)
predict_y2 = rfc.predict(test_ss_x)

print('RF准确率: %0.4lf' % accuracy_score(predict_y2, test_y))

RF准确率: 0.8686


In [16]:
kaggle_test = pd.read_csv('test.csv')
kaggle_test.drop(columns=['EmployeeNumber','EmployeeCount','Over18','StandardHours'],inplace=True)
kaggle_test['BusinessTravel']=kaggle_test['BusinessTravel'].astype(str).apply(BusinessTravel_new)
kaggle_test['BusinessTravel']=kaggle_test['BusinessTravel'].astype(str).apply(BusinessTravel_new)
kaggle_test['OverTime']=kaggle_test['OverTime'].astype(str).apply(overtime_new)
kaggle_test = pd.get_dummies(kaggle_test, prefix=convert_to_dummies, columns=convert_to_dummies,drop_first=True)

In [17]:
kaggle_test_x = kaggle_test.drop('user_id', axis=1)
kaggle_test_ss_x = ss.transform(kaggle_test_x)

In [18]:
predict_kaggle = rfc.predict(kaggle_test_ss_x)

In [19]:
pd.DataFrame(rfc.predict_proba(kaggle_test_ss_x))[1]

0      0.100000
1      0.110000
2      0.148333
3      0.161667
4      0.711667
         ...   
289    0.093333
290    0.140000
291    0.241667
292    0.181667
293    0.218333
Name: 1, Length: 294, dtype: float64

In [20]:
result = pd.concat([kaggle_test['user_id'], pd.DataFrame(rfc.predict_proba(kaggle_test_ss_x))[1]], axis=1)

In [21]:
result.rename(columns={ 1: "Attrition"},inplace=True)

In [22]:
result

Unnamed: 0,user_id,Attrition
0,442,0.100000
1,1091,0.110000
2,981,0.148333
3,785,0.161667
4,1332,0.711667
...,...,...
289,1439,0.093333
290,481,0.140000
291,124,0.241667
292,198,0.181667


In [23]:
result.to_csv('Employee_Attrition.csv',index=False)