In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the HR dataset into a DataFrame (`df`) and preview its first rows.
csv_path = 'HR_comma_sep.csv'
df = pd.read_csv(csv_path)
df.head(5)

In [None]:
# Separate the target: pop() removes the `left` column from `df` in place
# and returns it, leaving only the feature columns in `df`.
y = df.pop('left')

In [None]:
# Report the number of rows and columns, then list the feature names.
row_count, column_count = df.shape
print("data size: ", row_count)
print("column size: ", column_count)
print(df.columns)

In [138]:
# Inspect each column's dtype and non-null count.
# Note the columns reported as `object`: they hold non-numeric values and
# need to be encoded before they can be used as model inputs.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 9 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
promotion_last_5years    14999 non-null int64
sales                    14999 non-null object
salary                   14999 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 1.0+ MB


In [139]:
# Summary statistics are only meaningful for the numeric columns.
# Transpose the result so that each feature is shown as a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
satisfaction_level,14999.0,0.612834,0.248631,0.09,0.44,0.64,0.82,1.0
last_evaluation,14999.0,0.716102,0.171169,0.36,0.56,0.72,0.87,1.0
number_project,14999.0,3.803054,1.232592,2.0,3.0,4.0,5.0,7.0
average_montly_hours,14999.0,201.050337,49.943099,96.0,156.0,200.0,245.0,310.0
time_spend_company,14999.0,3.498233,1.460136,2.0,3.0,3.0,4.0,10.0
Work_accident,14999.0,0.14461,0.351719,0.0,0.0,0.0,0.0,1.0
promotion_last_5years,14999.0,0.021268,0.144281,0.0,0.0,0.0,0.0,1.0


In [140]:
# Check every column for missing values (NaN); any that exist would have to
# be handled before modelling.
df.isna().any()

satisfaction_level       False
last_evaluation          False
number_project           False
average_montly_hours     False
time_spend_company       False
Work_accident            False
promotion_last_5years    False
sales                    False
salary                   False
dtype: bool

In [141]:
# The two object-typed columns hold non-numeric labels; list their distinct values.
sales_values = df['sales'].unique()
salary_values = df['salary'].unique()
print("sales: ", sales_values)
print("salary: ", salary_values)

sales:  ['sales' 'accounting' 'hr' 'technical' 'support' 'management' 'IT'
 'product_mng' 'marketing' 'RandD']
salary:  ['low' 'medium' 'high']


---

In [142]:
# First, pre-process the two non-numeric columns:
#   * salary is ordinal (low < medium < high), so map it to the integers 1..3;
#   * sales has no natural order, so one-hot encode it.
# drop_first=True drops the first dummy level; that level is then represented
# by all zeros in the remaining dummy columns.
salary_order = {'low': 1, 'medium': 2, 'high': 3}

# Assign the mapped column back instead of calling replace(..., inplace=True)
# on `df.salary`: the chained in-place form acts on an intermediate Series, is
# deprecated in pandas 2.x, and does not update `df` under Copy-on-Write.
# The dtype guard keeps this cell safe to re-run, and the int64 cast raises if
# a salary label is missing from `salary_order` (map() would yield NaN).
if not pd.api.types.is_numeric_dtype(df['salary']):
    df['salary'] = df['salary'].map(salary_order).astype('int64')

# Pin the dummy dtype so the indicator columns are 0/1 integers on every
# pandas version (newer releases default to bool).
df = pd.get_dummies(df, drop_first=True, dtype='uint8')

In [143]:
# Preview the encoded frame: `salary` should now be an integer rank and
# `sales` should be expanded into `sales_*` dummy columns.
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,sales_RandD,sales_accounting,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical
0,0.38,0.53,2,157,3,0,0,1,0,0,0,0,0,0,1,0,0
1,0.8,0.86,5,262,6,0,0,2,0,0,0,0,0,0,1,0,0
2,0.11,0.88,7,272,4,0,0,2,0,0,0,0,0,0,1,0,0
3,0.72,0.87,5,223,5,0,0,1,0,0,0,0,0,0,1,0,0
4,0.37,0.52,2,159,3,0,0,1,0,0,0,0,0,0,1,0,0


In [144]:
# Re-check the dtypes after encoding: no `object` columns should remain.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 17 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
promotion_last_5years    14999 non-null int64
salary                   14999 non-null int64
sales_RandD              14999 non-null uint8
sales_accounting         14999 non-null uint8
sales_hr                 14999 non-null uint8
sales_management         14999 non-null uint8
sales_marketing          14999 non-null uint8
sales_product_mng        14999 non-null uint8
sales_sales              14999 non-null uint8
sales_support            14999 non-null uint8
sales_technical          14999 non-null uint8
dtypes: float64(2), int64(6), uint8(9)
memory usage: 1.0 MB


In [127]:
# iloc selects by integer position: the first five rows and first eight columns.
# NOTE(review): the saved output of this cell still shows a `left` column and has
# an out-of-order execution count, so it appears to come from an earlier run;
# re-run the notebook top to bottom to refresh it.
df.iloc[:5, :8]

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years
0,0.38,0.53,2,157,3,0,1,0
1,0.8,0.86,5,262,6,0,1,0
2,0.11,0.88,7,272,4,0,1,0
3,0.72,0.87,5,223,5,0,1,0
4,0.37,0.52,2,159,3,0,1,0


---

In [115]:
# Second, rescale every column with mean normalization:
#     x' = (x - mean(x)) / (max(x) - min(x))
# so that each feature is centred on 0 and spans a range of width 1.
# This is applied to all columns of `df`, including the encoded ones.
features = df.astype('float64')  # uniform float arithmetic for int/uint8/bool columns
value_range = features.max() - features.min()

# A constant column has max == min; dividing by 0 would turn it into NaN.
# Use a divisor of 1 for such columns so they become all zeros instead.
value_range = value_range.where(value_range != 0, 1.0)

df = (features - features.mean()) / value_range

In [116]:
# Preview the normalized features.
# NOTE(review): the saved output of this cell lists `left`, `salary_low` and
# `salary_medium` columns, which the cells above do not produce, so it appears
# to come from an earlier run; re-run the notebook to refresh it.
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_RandD,sales_accounting,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_low,salary_medium
0,-0.255861,-0.290784,-0.360611,-0.205843,-0.062279,-0.14461,0.761917,-0.021268,-0.05247,-0.051137,-0.04927,-0.042003,-0.057204,-0.060137,0.723982,-0.14861,-0.181345,0.512234,-0.429762
1,0.205677,0.224841,0.239389,0.284812,0.312721,-0.14461,0.761917,-0.021268,-0.05247,-0.051137,-0.04927,-0.042003,-0.057204,-0.060137,0.723982,-0.14861,-0.181345,-0.487766,0.570238
2,-0.552564,0.256091,0.639389,0.33154,0.062721,-0.14461,0.761917,-0.021268,-0.05247,-0.051137,-0.04927,-0.042003,-0.057204,-0.060137,0.723982,-0.14861,-0.181345,-0.487766,0.570238
3,0.117765,0.240466,0.239389,0.102569,0.187721,-0.14461,0.761917,-0.021268,-0.05247,-0.051137,-0.04927,-0.042003,-0.057204,-0.060137,0.723982,-0.14861,-0.181345,0.512234,-0.429762
4,-0.26685,-0.306409,-0.360611,-0.196497,-0.062279,-0.14461,0.761917,-0.021268,-0.05247,-0.051137,-0.04927,-0.042003,-0.057204,-0.060137,0.723982,-0.14861,-0.181345,0.512234,-0.429762


---

In [148]:
# Compute the pairwise correlation between all columns and show the first rows
# of the resulting matrix.
correlation_matrix = df.corr()
correlation_matrix.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,sales_RandD,sales_accounting,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical
satisfaction_level,1.0,0.105021,-0.14297,-0.020048,-0.100866,0.058697,0.025605,0.050022,0.006615,-0.028649,-0.012841,0.007172,0.005715,0.006919,0.004007,0.009185,-0.009345
last_evaluation,0.105021,1.0,0.349333,0.339742,0.131591,-0.007104,-0.008684,-0.013002,-0.005471,0.002193,-0.009645,0.009662,-0.000311,-0.001989,-0.023031,0.017104,0.013742
number_project,-0.14297,0.349333,1.0,0.417211,0.196786,-0.004741,-0.006064,-0.001803,0.009703,0.004189,-0.027356,0.009728,-0.023064,0.000829,-0.013388,0.000303,0.028596
average_montly_hours,-0.020048,0.339742,0.417211,1.0,0.127755,-0.010143,-0.003544,-0.002242,-0.001177,0.000524,-0.010783,0.000834,-0.00821,-0.005494,-0.001718,-0.002444,0.013638
time_spend_company,-0.100866,0.131591,0.196786,0.127755,1.0,0.00212,0.067433,0.048715,-0.021116,0.003909,-0.022194,0.115436,0.012096,-0.003919,0.01515,-0.030111,-0.027991


In [None]:
# After the pre-processing, we can start training our model.
# Try a linear model first!
