In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data=pd.read_csv("HR_comma_sep.csv.txt")

In [3]:
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   sales                  14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [5]:
data.isnull().sum()

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
left                     0
promotion_last_5years    0
sales                    0
salary                   0
dtype: int64

##### No null values present

In [6]:
data.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'sales', 'salary'],
      dtype='object')

In [7]:
data.number_project.value_counts()

4    4365
3    4055
5    2761
2    2388
6    1174
7     256
Name: number_project, dtype: int64

In [8]:
data.time_spend_company.value_counts()

3     6443
2     3244
4     2557
5     1473
6      718
10     214
7      188
8      162
Name: time_spend_company, dtype: int64

## Handling numerical values 
Normalising using MinMaxScaler

In [10]:
from sklearn.preprocessing import MinMaxScaler
df1=data.copy()
scaler=MinMaxScaler()
df1[['satisfaction_level', 'last_evaluation','number_project','average_montly_hours', 'time_spend_company']]=scaler.fit_transform(df1[['satisfaction_level', 'last_evaluation','number_project','average_montly_hours', 'time_spend_company']])

In [12]:
df1

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.318681,0.265625,0.0,0.285047,0.125,0,1,0,sales,low
1,0.780220,0.781250,0.6,0.775701,0.500,0,1,0,sales,medium
2,0.021978,0.812500,1.0,0.822430,0.250,0,1,0,sales,medium
3,0.692308,0.796875,0.6,0.593458,0.375,0,1,0,sales,low
4,0.307692,0.250000,0.0,0.294393,0.125,0,1,0,sales,low
...,...,...,...,...,...,...,...,...,...,...
14994,0.340659,0.328125,0.0,0.257009,0.125,0,1,0,support,low
14995,0.307692,0.187500,0.0,0.299065,0.125,0,1,0,support,low
14996,0.307692,0.265625,0.0,0.219626,0.125,0,1,0,support,low
14997,0.021978,0.937500,0.8,0.859813,0.250,0,1,0,support,low


In [13]:
df1.sales.unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

## Handling categorical values( nominal data)- "sales"
Using pandas get_dummies fn

In [31]:
dummies=pd.get_dummies(df1.sales)

In [32]:
dummies

Unnamed: 0,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
14994,0,0,0,0,0,0,0,0,1,0
14995,0,0,0,0,0,0,0,0,1,0
14996,0,0,0,0,0,0,0,0,1,0
14997,0,0,0,0,0,0,0,0,1,0


#### Drop "technical" from dummies due to get dummies trap

In [42]:
df2=pd.concat([df1.drop("sales",axis=1),dummies.drop("technical",axis=1)],axis=1)

In [43]:
df2.salary.unique()

array(['low', 'medium', 'high'], dtype=object)

In [44]:
df2["salary"].sort_values()

10309      high
4665       high
11099      high
8875       high
8878       high
          ...  
3528     medium
3529     medium
3530     medium
3532     medium
7499     medium
Name: salary, Length: 14999, dtype: object

### Encoding "salary" (ordinal data)

In [45]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
df3=df2.copy()
df3["salary"]=encoder.fit_transform(df3["salary"].sort_values())

In [37]:
df3

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
0,0.318681,0.265625,0.0,0.285047,0.125,0,1,0,0,0,0,0,0,0,0,0,1,0,0
1,0.780220,0.781250,0.6,0.775701,0.500,0,1,0,0,0,0,0,0,0,0,0,1,0,0
2,0.021978,0.812500,1.0,0.822430,0.250,0,1,0,0,0,0,0,0,0,0,0,1,0,0
3,0.692308,0.796875,0.6,0.593458,0.375,0,1,0,0,0,0,0,0,0,0,0,1,0,0
4,0.307692,0.250000,0.0,0.294393,0.125,0,1,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14994,0.340659,0.328125,0.0,0.257009,0.125,0,1,0,2,0,0,0,0,0,0,0,0,1,0
14995,0.307692,0.187500,0.0,0.299065,0.125,0,1,0,2,0,0,0,0,0,0,0,0,1,0
14996,0.307692,0.265625,0.0,0.219626,0.125,0,1,0,2,0,0,0,0,0,0,0,0,1,0
14997,0.021978,0.937500,0.8,0.859813,0.250,0,1,0,2,0,0,0,0,0,0,0,0,1,0


In [46]:
df3.salary.unique()

array([0, 1, 2])

In [48]:
df4

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support
0,0.318681,0.265625,0.0,0.285047,0.125,0,1,0,0,0,0,0,0,0,0,0,1,0
1,0.780220,0.781250,0.6,0.775701,0.500,0,1,0,0,0,0,0,0,0,0,0,1,0
2,0.021978,0.812500,1.0,0.822430,0.250,0,1,0,0,0,0,0,0,0,0,0,1,0
3,0.692308,0.796875,0.6,0.593458,0.375,0,1,0,0,0,0,0,0,0,0,0,1,0
4,0.307692,0.250000,0.0,0.294393,0.125,0,1,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14994,0.340659,0.328125,0.0,0.257009,0.125,0,1,0,2,0,0,0,0,0,0,0,0,1
14995,0.307692,0.187500,0.0,0.299065,0.125,0,1,0,2,0,0,0,0,0,0,0,0,1
14996,0.307692,0.265625,0.0,0.219626,0.125,0,1,0,2,0,0,0,0,0,0,0,0,1
14997,0.021978,0.937500,0.8,0.859813,0.250,0,1,0,2,0,0,0,0,0,0,0,0,1


## Split df as x and y

In [26]:
x=df4.drop("left",axis=1)
y=df4.left

In [27]:
x

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales,salary
0,0.318681,0.265625,0.0,0.285047,0.125,0,0,7,0
1,0.780220,0.781250,0.6,0.775701,0.500,0,0,7,0
2,0.021978,0.812500,1.0,0.822430,0.250,0,0,7,0
3,0.692308,0.796875,0.6,0.593458,0.375,0,0,7,0
4,0.307692,0.250000,0.0,0.294393,0.125,0,0,7,0
...,...,...,...,...,...,...,...,...,...
14994,0.340659,0.328125,0.0,0.257009,0.125,0,0,8,2
14995,0.307692,0.187500,0.0,0.299065,0.125,0,0,8,2
14996,0.307692,0.265625,0.0,0.219626,0.125,0,0,8,2
14997,0.021978,0.937500,0.8,0.859813,0.250,0,0,8,2


## Split x and y for train and test using train_test_split 

In [59]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.2,random_state=42)

## Using logistic regression for prediction

In [60]:
from sklearn.linear_model import LogisticRegression
reg=LogisticRegression()
reg.fit(x_train,y_train)
reg.score(x_test,y_test)


0.7963333333333333

In [53]:
from sklearn.model_selection import cross_val_score
cross_val_score(reg,x,y,cv=10)

array([0.88066667, 0.856     , 0.86466667, 0.80466667, 0.79133333,
       0.76933333, 0.76533333, 0.76066667, 0.72      , 0.71247498])

In [54]:
pred=reg.predict(x_test)