## Importing libraries and dataset

In [1]:
import pandas as pd
import numpy as np

# Read the HR dataset from a CSV file located in the current working directory.
raw_data = pd.read_csv('HR_Employee.csv')

# Preview the first rows to sanity-check the load.
raw_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales,salary,left
0,0.38,0.53,2,157,3,0,0,sales,low,1
1,0.8,0.86,5,262,6,0,0,sales,medium,1
2,0.11,0.88,7,272,4,0,0,sales,medium,1
3,0.72,0.87,5,223,5,0,0,sales,low,1
4,0.37,0.52,2,159,3,0,0,sales,low,1


## Copy the dataframe so as not to edit the original

In [2]:
# Work on a copy so that raw_data keeps the values exactly as loaded.
df = raw_data.copy()

## Checking info on dataframe

In [3]:
# Summarise column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
promotion_last_5years    14999 non-null int64
sales                    14999 non-null object
salary                   14999 non-null object
left                     14999 non-null int64
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


## Removing unwanted values and encoding categorical data

In [4]:
# List the distinct values in 'sales' before filtering and encoding.
df['sales'].unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

In [5]:
# Drop every row whose 'sales' value is 'RandD'.
# NOTE(review): the reason for excluding 'RandD' is not recorded in this notebook -
# confirm that removing these rows is intentional.
# .copy() detaches the filtered result from the frame it was sliced from, so
# later column assignments on `df` do not trigger pandas' SettingWithCopyWarning.
df = df[df['sales'] != 'RandD'].copy()

In [6]:
# List the distinct salary labels that need to be encoded.
df['salary'].unique()

array(['low', 'medium', 'high'], dtype=object)

In [7]:
# Ordinal encoding for salary: 'low' -> 1, 'medium' -> 2, 'high' -> 3.
SALARY_LEVELS = {'low': 1, 'medium': 2, 'high': 3}

# Look each label up, letting already-encoded values pass through unchanged,
# so re-running this cell does not turn the whole column into NaN
# (Series.map with a plain dict maps every unmatched value to NaN).
salary_encoded = df['salary'].map(lambda level: SALARY_LEVELS.get(level, level))

# Fail loudly on anything that is neither a known label nor a known code,
# instead of silently carrying bad values into the exported file.
unexpected = set(salary_encoded.unique()) - set(SALARY_LEVELS.values())
assert not unexpected, 'unexpected salary values: {}'.format(unexpected)

# assign() returns a new frame rather than writing into `df` in place, which
# also avoids SettingWithCopyWarning when `df` is a filtered slice.
df = df.assign(salary=salary_encoded)

## Create dummy variables from column 'sales', dropping the first dummy column

In [8]:
# One-hot encode the 'sales' column. drop_first=True omits the first category
# (in sorted order), which then acts as the implicit baseline.
# .astype(int) pins the dummies to 0/1 integers: the default dtype returned by
# get_dummies depends on the pandas version (booleans in recent releases),
# and booleans would be written to CSV as True/False.
sales_columns = pd.get_dummies(df['sales'], drop_first=True).astype(int)

# Drop the original text column inside the expression instead of rebinding `df`,
# so re-running this cell does not fail with KeyError: 'sales'.
df_with_dummies = pd.concat([df.drop(columns=['sales']), sales_columns], axis=1)

In [9]:
# Preview the frame with the dummy columns appended on the right.
df_with_dummies.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,left,accounting,hr,management,marketing,product_mng,sales,support,technical
0,0.38,0.53,2,157,3,0,0,1,1,0,0,0,0,0,1,0,0
1,0.8,0.86,5,262,6,0,0,2,1,0,0,0,0,0,1,0,0
2,0.11,0.88,7,272,4,0,0,2,1,0,0,0,0,0,1,0,0
3,0.72,0.87,5,223,5,0,0,1,1,0,0,0,0,0,1,0,0
4,0.37,0.52,2,159,3,0,0,1,1,0,0,0,0,0,1,0,0


## Reorder columns so that the target (column 'left') is the rightmost column

In [10]:
# Show the current column order; 'left' currently sits before the dummy columns.
df_with_dummies.columns.values

array(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'salary', 'left', 'accounting', 'hr',
       'management', 'marketing', 'product_mng', 'sales', 'support',
       'technical'], dtype=object)

In [11]:
# Build the new column order from the frame itself rather than from a
# hand-typed list: every column keeps its relative position and the target
# is moved to the end. A hard-coded list would raise KeyError or silently
# drop columns if the set of encoded categories ever changed.
TARGET_COLUMN = 'left'
feature_columns = [name for name in df_with_dummies.columns if name != TARGET_COLUMN]
column_names_reordered = feature_columns + [TARGET_COLUMN]

# Selecting with a list of names returns a new frame in exactly that order.
df_reordered = df_with_dummies[column_names_reordered]

In [12]:
# Confirm that 'left' is now the last column.
df_reordered.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,accounting,hr,management,marketing,product_mng,sales,support,technical,left
0,0.38,0.53,2,157,3,0,0,1,0,0,0,0,0,1,0,0,1
1,0.8,0.86,5,262,6,0,0,2,0,0,0,0,0,1,0,0,1
2,0.11,0.88,7,272,4,0,0,2,0,0,0,0,0,1,0,0,1
3,0.72,0.87,5,223,5,0,0,1,0,0,0,0,0,1,0,0,1
4,0.37,0.52,2,159,3,0,0,1,0,0,0,0,0,1,0,0,1


## Exporting cleaned dataframe as csv file

In [13]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
df_reordered.to_csv('HR_employee_cleaned.csv',index=False)