In [89]:
import tensorflow as tf
import numpy as np
import pandas as pd
import io
import requests
import math
from scipy import stats
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
%matplotlib inline

In [90]:
def feature_normalize(dataset):
    '''
    特征均一化
    '''
    mu = np.mean(dataset,axis=0)
    sigma = np.std(dataset,axis=0)
    return (dataset - mu)/sigma

def str_to_int(df):
    str_columns = df.select_dtypes(['object']).columns
    print(str_columns)
    for col in str_columns:
        df[col] = df[col].astype('category')

    cat_columns = df.select_dtypes(['category']).columns
    df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)
    return df

def count_space_except_nan(x):
    if isinstance(x,str):
        return x.count(" ") + 1
    else :
        return 0
    
def one_hot(df, cols):
    """
    @param df pandas DataFrame
    @param cols a list of columns to encode 
    @return a DataFrame with one-hot encoding
    """
    for each in cols:
        dummies = pd.get_dummies(df[each], prefix=each, drop_first=False)
        del df[each]
        df = pd.concat([df, dummies], axis=1)
    return df

In [91]:
df_train = pd.read_csv('/Users/qianzecheng/Downloads/contest/train.csv')
df_test = pd.read_csv('/Users/qianzecheng/Downloads/contest/test.csv')

In [92]:
print (df_train.isnull().sum())
delete_columns = ["Unnamed: 0","Name", "YearofBirth", "X", "Over18", "EmployeeCount", "StandardHours", "EmployeeNumber"]   # waiting to be modified


Unnamed: 0                   0
X                            0
Name                         0
JobID                        0
YearofBirth                  0
AttritionStatus              0
DistanceFromHome            58
JobLevel                     0
WorkLifeBalance              0
PercentSalaryHike            0
HourlyRate                   0
YearsWithCurrManager         0
MonthlyRate                  0
JobSatisfaction              0
Over18                       0
EnvironmentSatisfaction      0
YearsSinceLastPromotion      0
JobRole                      0
EducationLevel               0
YearsAtCompany               0
Gender                       0
Travel_For_Business          0
EducationField               0
MaritalStatus                0
PerformanceRating            0
MonthlyIncome                0
StandardHours                0
Age                          0
TotalWorkingYears            0
EmployeeNumber               0
LastYearTrainingTime        43
RelationshipSatisfaction     0
Employee

## Modified

In [93]:
df_train

Unnamed: 0.1,Unnamed: 0,X,Name,JobID,YearofBirth,AttritionStatus,DistanceFromHome,JobLevel,WorkLifeBalance,PercentSalaryHike,...,LastYearTrainingTime,RelationshipSatisfaction,EmployeeCount,OverTime,Department,YearsInCurrentRole,JobInvolvement,StockOptionLevel,NumCompaniesWorked,DailyRate
0,1,1,"Willis, Isaiah",1001,1975,No,1.0,2,3,15,...,5.0,4,1,No,Research & Development,0,4,1,2,170
1,2,2,"el-Amber, Furqaan",1002,1988,No,5.0,2,3,19,...,3.0,3,1,No,Sales,2,3,1,1,1003
2,3,3,"Haley, Aundrea",1003,1987,No,3.0,3,3,15,...,6.0,4,1,No,Research & Development,7,3,0,3,1018
3,4,4,"Roodani, Rakshmai",1004,1979,No,16.0,3,2,15,...,2.0,4,1,Yes,Research & Development,2,3,0,6,448
4,5,5,"Adame Saenz, Jorden",1005,1995,No,1.0,2,2,12,...,2.0,2,1,No,Research & Development,3,2,0,1,771
5,6,6,"al-Pashia, Tammaam",1006,1971,No,6.0,2,3,13,...,,4,1,No,Research & Development,2,2,0,2,277
6,7,7,"Jennings, Mylas",1007,1970,No,18.0,5,3,17,...,5.0,3,1,Yes,Research & Development,9,2,1,1,1245
7,8,8,"Maunu, Safawn",1008,1997,Yes,8.0,1,3,17,...,6.0,3,1,Yes,Research & Development,0,3,0,1,1294
8,9,9,"Elliott, Jaymie",1009,1984,Yes,25.0,1,2,19,...,3.0,1,1,Yes,Research & Development,2,3,1,1,130
9,10,10,"Morgan, Alexandra",1010,1988,Yes,1.0,1,4,11,...,2.0,1,1,Yes,Sales,2,3,0,1,1060


In [94]:
def pre_processing(df):
    df.drop(delete_columns, axis=1, inplace=True)
    # Count room nubmer
    # df_train["Cabin"] = df_train["Cabin"].apply(count_space_except_nan)
    # Replace NaN with mean value
    df["DistanceFromHome"].fillna(df["DistanceFromHome"].mean(), inplace=True)
    df["LastYearTrainingTime"].fillna(df["LastYearTrainingTime"].mean(), inplace=True)
    # EducationLevel, Embarked one-hot
    # df = one_hot(df, df.loc[:, ["EducationLevel"]].columns)
    # df = one_hot(df, df.loc[:, ["EducationField"]].columns)
    # df = one_hot(df, df.loc[:, ["Travel_For_Business"]].columns)
    # df = one_hot(df, df.loc[:, ["MaritalStatus"]].columns)
    # df = one_hot(df, df.loc[:, ["EnvironmentSatisfaction"]].columns)
    # df = one_hot(df, df.loc[:, ["JobInvolvement"]].columns)
    # df = one_hot(df, df.loc[:, ["JobSatisfaction"]].columns)
    # df = one_hot(df, df.loc[:, ["PerformanceRating"]].columns)
    # df = one_hot(df, df.loc[:, ["RelationshipSatisfaction"]].columns)
    # df = one_hot(df, df.loc[:, ["WorkLifeBalance"]].columns)
    # df = one_hot(df, df.loc[:, ["JobLevel"]].columns)
    # df = one_hot(df, df.loc[:, ["JobRole"]].columns)
    # df = one_hot(df, df.loc[:, ["StockOptionLevel"]].columns)
    
    
    
    # String to int
    df = str_to_int(df)
    # Age Normalization
    
    df["EducationLevel"] = feature_normalize(df["EducationLevel"])
    df["EducationField"] = feature_normalize(df["EducationField"])
    df["Travel_For_Business"] = feature_normalize(df["Travel_For_Business"])
    df["MaritalStatus"] = feature_normalize(df["MaritalStatus"])
    df["EnvironmentSatisfaction"] = feature_normalize(df["EnvironmentSatisfaction"])
    df["JobInvolvement"] = feature_normalize(df["JobInvolvement"])
    # df["JobSatisfaction"] = feature_normalize(df["JobSatisfaction"])
    df["RelationshipSatisfaction"] = feature_normalize(df["RelationshipSatisfaction"])
    # df["WorkLifeBalance"] = feature_normalize(df["WorkLifeBalance"])
    df["JobLevel"] = feature_normalize(df["JobLevel"])
    df["JobRole"] = feature_normalize(df["JobRole"])
    df["StockOptionLevel"] = feature_normalize(df["StockOptionLevel"])
    
    
    df["DailyRate"] = feature_normalize(df["DailyRate"])
    df["HourlyRate"] = feature_normalize(df["HourlyRate"])
    df["MonthlyRate"] = feature_normalize(df["MonthlyRate"])
    df["Age"] = feature_normalize(df["Age"])
    df["DistanceFromHome"] = feature_normalize(df["DistanceFromHome"])
    df["PercentSalaryHike"] = feature_normalize(df["PercentSalaryHike"])
    df["YearsAtCompany"] = feature_normalize(df["YearsAtCompany"])
    #df["YearsWithCurrManager"] = feature_normalize(df["YearsWithCurrManager"])
    df["YearsSinceLastPromotion"] = feature_normalize(df["YearsSinceLastPromotion"])
    df["MonthlyIncome"] = feature_normalize(df["MonthlyIncome"])
    df["TotalWorkingYears"] = feature_normalize(df["TotalWorkingYears"])
    df["YearsInCurrentRole"] = feature_normalize(df["YearsInCurrentRole"])
    df["NumCompaniesWorked"] = feature_normalize(df["NumCompaniesWorked"])
    
    # df["EmployeeNumber"] = feature_normalize(df["EmployeeNumber"]) 
    # stats.describe(df).variance
    return df

In [95]:
df_train = pre_processing(df_train)
df_test = pre_processing(df_test)

Index(['AttritionStatus', 'JobSatisfaction', 'JobRole', 'Gender',
       'Travel_For_Business', 'EducationField', 'MaritalStatus',
       'PerformanceRating', 'OverTime', 'Department'],
      dtype='object')
Index(['JobSatisfaction', 'JobRole', 'Gender', 'Travel_For_Business',
       'EducationField', 'MaritalStatus', 'PerformanceRating', 'OverTime',
       'Department'],
      dtype='object')


In [100]:
df_train["JobSatisfaction"]
#df_train["YearsWithCurrManager"]

0       0
1       2
2       3
3       3
4       2
5       2
6       2
7       0
8       1
9       1
10      3
11      0
12      2
13      2
14      0
15      3
16      1
17      1
18      2
19      3
20      1
21      3
22      0
23      0
24      3
25      3
26      1
27      2
28      3
29      2
       ..
1070    2
1071    1
1072    3
1073    3
1074    2
1075    3
1076    2
1077    0
1078    3
1079    0
1080    1
1081    2
1082    3
1083    2
1084    2
1085    3
1086    1
1087    3
1088    1
1089    3
1090    3
1091    2
1092    1
1093    2
1094    3
1095    1
1096    0
1097    3
1098    1
1099    0
Name: JobSatisfaction, Length: 1100, dtype: int8

In [42]:
count_female = sum(df_train_g)
count_male = len(df_train_g) - count_female

In [54]:
count_female_attrition = 0
count_male_attrition = 0
for i in range(len(df_train_g)):
    if(df_train_g[i] == 1 and df_train_a[i] == 1):
        count_female_attrition += 1
    if(df_train_g[i] == 0 and df_train_a[i] == 1):
        count_male_attrition += 1
        

In [60]:
print("Male total count: ", count_male)
print("Male attrition total count: ", count_male_attrition)
print("Male attrition rate: ", count_male_attrition/count_male)
print("\n")
print("Female total count: ", count_female)
print("Female attrition total count: ", count_female_attrition)
print("Female attrition rate: ", count_female_attrition/count_female)

Male total count:  427
Male attrition total count:  65
Male attrition rate:  0.1522248243559719


Female total count:  673
Female attrition total count:  114
Female attrition rate:  0.16939078751857356


In [317]:
df_train_w
l = [0,0,0,0,]
for i in range(len(df_train_p)):
    l[df_train_w[i]-1] += df_train_p[i]
    

In [318]:
l

[10, 40, 109, 17]

In [None]:
plt.hist(l, bins=40, normed=0, facecolor="blue", edgecolor="black", alpha=0.7)

In [102]:
df_train.to_csv('new.csv', index=False)

In [237]:
features = df_train.iloc[:, 2:].values

In [238]:
labels = df_train.iloc[:, 1:2].values # 1100*1

In [239]:
print(features.shape, labels.shape)

(1100, 75) (1100, 1)


In [240]:
rnd_indices = np.random.rand(len(features)) < 0.80 # margin waiting to be modified

In [241]:
df_train

Unnamed: 0,JobID,AttritionStatus,DistanceFromHome,PercentSalaryHike,HourlyRate,YearsWithCurrManager,MonthlyRate,YearsSinceLastPromotion,YearsAtCompany,Gender,...,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,StockOptionLevel_0,StockOptionLevel_1,StockOptionLevel_2,StockOptionLevel_3
0,1001,0,-1.038944e+00,-0.069534,0.560637,-0.616283,-0.688288,-0.066695,-0.828789,1,...,0,0,0,0,0,0,0,1,0,0
1,1002,0,-5.305712e-01,1.000218,-0.764651,-0.616283,0.935880,-0.673015,-0.350892,1,...,0,0,0,0,1,0,0,1,0,0
2,1003,0,-7.847575e-01,-0.069534,-1.353667,0.758234,0.715658,1.449104,-0.032294,0,...,0,0,1,0,0,0,1,0,0,0
3,1004,0,8.674534e-01,-0.069534,0.855145,-0.341380,0.796313,0.236465,-0.510191,0,...,0,1,0,0,0,0,1,0,0,0
4,1005,0,-1.038944e+00,-0.871848,-1.059159,-0.616283,-0.021172,-0.369855,-0.510191,1,...,0,0,0,0,0,0,1,0,0,0
5,1006,0,-4.034780e-01,-0.604410,1.493246,-0.616283,-0.161581,-0.066695,-0.828789,1,...,0,0,0,0,0,0,1,0,0,0
6,1007,0,1.121640e+00,0.465342,-0.421058,1.308040,-1.703692,-0.673015,3.790880,1,...,0,0,1,0,0,0,0,1,0,0
7,1008,1,-1.492918e-01,0.465342,0.609721,-1.166090,0.242393,-0.673015,-0.988088,0,...,0,0,0,0,0,0,1,0,0,0
8,1009,1,2.011292e+00,1.000218,1.444162,1.033137,0.328377,1.449104,0.445603,0,...,0,0,0,1,0,0,0,1,0,0
9,1010,1,-1.038944e+00,-1.139286,-0.617397,-0.616283,-0.834869,-0.066695,-0.669490,0,...,0,0,0,0,0,1,1,0,0,0


In [242]:
real_test_x = df_test.iloc[:, 1:].values
train_x = features[rnd_indices]
train_y = labels[rnd_indices]
test_x = features[~rnd_indices]
test_JobID = df_train[~rnd_indices]["JobID"]
test_y = labels[~rnd_indices]

feature_count = train_x.shape[1]
label_count = train_y.shape[1]
print(feature_count, label_count)

75 1


In [249]:
df_test.shape

(370, 76)

In [168]:
train_y.sum()

144

## x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.2)

In [135]:
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)

(872, 62)
(872, 1)
(228, 62)
(228, 1)


In [136]:
train_y.shape

(872, 1)

In [137]:
train_x.shape

(872, 62)

In [138]:
sm = SMOTE(random_state=2)
train_x, train_y = sm.fit_sample(train_x, train_y.ravel())

In [141]:
train_y.sum()

733

In [32]:
# inputs
training_epochs = 10000
learning_rate = 1e-8
hidden_layers = feature_count - 1
cost_history = np.empty(shape=[1],dtype=float)
test_history = np.empty(shape=[1],dtype=float)

X = tf.placeholder(tf.float32,[None,feature_count])
Y = tf.placeholder(tf.float32,[None,label_count])
is_training=tf.Variable(True,dtype=tf.bool)

In [33]:
# models

initializer = tf.contrib.layers.xavier_initializer()
h0 = tf.layers.dense(X, hidden_layers, activation=tf.nn.relu, kernel_initializer=initializer)
# h0 = tf.nn.dropout(h0, 0.95)
h1 = tf.layers.dense(h0, label_count, activation=None)

cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=Y, logits=h1)
cost = tf.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# prediction = tf.argmax(h0, 1)
# correct_prediction = tf.equal(prediction, tf.argmax(Y_one_hot, 1))
# accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

predicted = tf.nn.sigmoid(h1)
correct_pred = tf.equal(tf.round(predicted), Y)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [34]:
# session

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for step in range(training_epochs + 1):
        sess.run(optimizer, feed_dict={X: train_x, Y: train_y})
        loss, _, acc = sess.run([cost, optimizer, accuracy], feed_dict={
                                 X: train_x, Y: train_y})
        cost_history = np.append(cost_history, acc)
        if step % 500 == 0:
            print("Step: {:5}\tLoss: {:.3f}\tAcc: {:.2%}".format(
                step, loss, acc))
            
        if step % 200 == 0:
            acc, tt= sess.run([accuracy, tf.round(predicted)], feed_dict={X: test_x, Y: test_y})
            test_history = np.append(test_history, acc)
            # if step % 400 == 0:
            #    print("Testing:   Step: {:5}\tAcc: {:.2%}".format(
            #        step, acc))
            
    # Test model and check accuracy
    print('Test Accuracy:', sess.run([accuracy, tf.round(predicted)], feed_dict={X: test_x, Y: test_y}))
    
    # Save test result
    test_predict_result = sess.run(tf.cast(tf.round(predicted), tf.int32), feed_dict={X: test_x})
    evaluation = test_JobID.to_frame()
    evaluation["Attrition"] = test_predict_result
    evaluation.to_csv('result.csv', index=False)

Step:     0	Loss: 77.159	Acc: 84.26%
Step:   500	Loss: 77.079	Acc: 84.26%
Step:  1000	Loss: 76.998	Acc: 84.26%
Step:  1500	Loss: 76.918	Acc: 84.26%
Step:  2000	Loss: 76.837	Acc: 84.26%
Step:  2500	Loss: 76.757	Acc: 84.26%
Step:  3000	Loss: 76.676	Acc: 84.26%
Step:  3500	Loss: 76.596	Acc: 84.26%
Step:  4000	Loss: 76.515	Acc: 84.26%
Step:  4500	Loss: 76.435	Acc: 84.26%
Step:  5000	Loss: 76.354	Acc: 84.26%
Step:  5500	Loss: 76.273	Acc: 84.26%
Step:  6000	Loss: 76.193	Acc: 84.26%
Step:  6500	Loss: 76.112	Acc: 84.26%
Step:  7000	Loss: 76.032	Acc: 84.26%
Step:  7500	Loss: 75.951	Acc: 84.26%
Step:  8000	Loss: 75.871	Acc: 84.26%
Step:  8500	Loss: 75.790	Acc: 84.26%
Step:  9000	Loss: 75.709	Acc: 84.26%
Step:  9500	Loss: 75.628	Acc: 84.26%
Step: 10000	Loss: 75.548	Acc: 84.26%
Test Accuracy: [0.8156682, array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
 

In [35]:
cost_history = list(cost_history[1:])
test_history = list(test_history[1:])
f_train = open("train.txt", "w")
f_test = open("test.txt", "w")

for num in cost_history:
    f_train.write(str(num))
    f_train.write('\n')
    
for num in test_history:
    f_test.write(str(num))
    f_test.write('\n')

In [52]:
from imblearn.over_sampling import RandomOverSampler

ModuleNotFoundError: No module named 'imblearn'

In [115]:
a = np.array([1,2,3])

In [120]:
a = a.reshape(a.shape[0], 1)

In [121]:
a.shape

(3, 1)