### HR Data Analysis

### ARUNPRASATH P

Objective: To predict whether a valuable employee is going to resign or not

The dataset contains 10 variables, including the attrition variable (`left`)

Attribute Description 

Attribute Name : Definition 

Satisfaction Level : Employee Satisfaction (can be interpreted as a %) 

Last evaluation : Employee Evaluation (can be interpreted as a %) 

Projects : Number of Projects (per year) 

Average monthly hours : Average monthly hours 

Time spent at company : Time spent at company 

Accident : Whether they have had a work accident 

Promotion Last 5 yrs : Whether they have had a promotion in the last 5 years 

Positions : Type of Job Position 

Salary : Salary level (low, medium, high)

Left : Whether the employee has left (0= remains employed, 1= left) 



In [66]:
#import pyspark libraries
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession
#get the active SparkSession, or start one if none exists yet
spark = SparkSession.builder.getOrCreate()
#get a handle to the SparkContext (an existing context is reused rather than a second one created)
sc=SparkContext.getOrCreate()

In [67]:
#load the HR csv: the first line supplies the column names and Spark infers each column's type
data = spark.read.csv(
    'HR_comma_sep.csv',
    header=True,
    inferSchema=True,
)

In [68]:
#check the Python type of the loaded object (expected: a Spark DataFrame)
type(data)

pyspark.sql.dataframe.DataFrame

In [10]:
#first 20 rows in dataset
data.show(20)

+------------------+---------------+--------------+--------------------+------------------+-------------+----+---------------------+-----+------+
|satisfaction_level|last_evaluation|number_project|average_montly_hours|time_spend_company|Work_accident|left|promotion_last_5years|sales|salary|
+------------------+---------------+--------------+--------------------+------------------+-------------+----+---------------------+-----+------+
|              0.38|           0.53|             2|                 157|                 3|            0|   1|                    0|sales|   low|
|               0.8|           0.86|             5|                 262|                 6|            0|   1|                    0|sales|medium|
|              0.11|           0.88|             7|                 272|                 4|            0|   1|                    0|sales|medium|
|              0.72|           0.87|             5|                 223|                 5|            0|   1|              

In [5]:
#print each column's name, inferred data type and nullability
data.printSchema()

root
 |-- satisfaction_level: double (nullable = true)
 |-- last_evaluation: double (nullable = true)
 |-- number_project: integer (nullable = true)
 |-- average_montly_hours: integer (nullable = true)
 |-- time_spend_company: integer (nullable = true)
 |-- Work_accident: integer (nullable = true)
 |-- left: integer (nullable = true)
 |-- promotion_last_5years: integer (nullable = true)
 |-- sales: string (nullable = true)
 |-- salary: string (nullable = true)



In [12]:
#list the column names in the dataset
data.columns

['satisfaction_level',
 'last_evaluation',
 'number_project',
 'average_montly_hours',
 'time_spend_company',
 'Work_accident',
 'left',
 'promotion_last_5years',
 'sales',
 'salary']

In [14]:
#number of rows in the dataset
data.count()

14999

In [16]:
#summary statistics (count, mean, stddev, ...) for every column, printing at most 5 summary rows
#string columns are included too, but their mean is reported as null
data.describe().show(5)

+-------+-------------------+-------------------+------------------+--------------------+------------------+-------------------+-------------------+---------------------+---------+------+
|summary| satisfaction_level|    last_evaluation|    number_project|average_montly_hours|time_spend_company|      Work_accident|               left|promotion_last_5years|    sales|salary|
+-------+-------------------+-------------------+------------------+--------------------+------------------+-------------------+-------------------+---------------------+---------+------+
|  count|              14999|              14999|             14999|               14999|             14999|              14999|              14999|                14999|    14999| 14999|
|   mean| 0.6128335222348166| 0.7161017401159978|  3.80305353690246|   201.0503366891126| 3.498233215547703| 0.1446096406427095| 0.2380825388359224| 0.021268084538969265|     null|  null|
| stddev|0.24863065106114257|0.17116911062327556|1.232592355

In [20]:
#list the distinct values held in the sales column
data.select("sales").distinct().show()

+-----------+
|      sales|
+-----------+
|      sales|
| accounting|
|         hr|
|  technical|
|    support|
| management|
|         IT|
|product_mng|
|  marketing|
|      RandD|
+-----------+



In [23]:
#preview the first 15 rows of the satisfaction_level, sales and salary columns
data.select(
    "satisfaction_level",
    "sales",
    "salary",
).show(15)

+------------------+-----+------+
|satisfaction_level|sales|salary|
+------------------+-----+------+
|              0.38|sales|   low|
|               0.8|sales|medium|
|              0.11|sales|medium|
|              0.72|sales|   low|
|              0.37|sales|   low|
|              0.41|sales|   low|
|               0.1|sales|   low|
|              0.92|sales|   low|
|              0.89|sales|   low|
|              0.42|sales|   low|
|              0.45|sales|   low|
|              0.11|sales|   low|
|              0.84|sales|   low|
|              0.41|sales|   low|
|              0.36|sales|   low|
+------------------+-----+------+
only showing top 15 rows



In [29]:
#rename the 'sales' column to 'department' (its values are department names such as hr, IT, marketing)
#note: this rebinds `data`, so every later cell sees the renamed column
data = data.withColumnRenamed('sales','department')

In [30]:
#view column names to confirm 'department' now appears in place of 'sales'
data.columns

['satisfaction_level',
 'last_evaluation',
 'number_project',
 'average_montly_hours',
 'time_spend_company',
 'Work_accident',
 'left',
 'promotion_last_5years',
 'department',
 'salary']

In [27]:
#to convert string columns to numeric indices (featurization)
#the indexers are only configured here; they are fitted later as stages of the pipeline
#NOTE(review): 'department' is a nominal category, but its index is fed to the model as a single
#numeric feature, which implies an ordering between departments - consider one-hot encoding; TODO confirm
import pyspark.ml.feature as ft
transformer_dept = ft.StringIndexer(inputCol='department', outputCol='department_en')
transformer_salary = ft.StringIndexer(inputCol='salary', outputCol='salary_en')

In [36]:
#combine the numeric columns and the two indexed columns into one 'features' vector
#('left' is deliberately absent: it is the label the model predicts)
fc = ft.VectorAssembler(
    inputCols=[
        'satisfaction_level',
        'last_evaluation',
        'number_project',
        'average_montly_hours',
        'time_spend_company',
        'Work_accident',
        'promotion_last_5years',
        'department_en',
        'salary_en',
    ],
    outputCol='features',
)

In [33]:
#import pyspark ml libraries
import pyspark.ml.classification as cl

In [34]:
#to create an estimator: logistic regression predicting 'left',
#limited to 10 iterations with a regularization parameter of 0.01
#featuresCol is not set, so the library default is used (expected to match the assembler's 'features' output)
logisticreg = cl.LogisticRegression(maxIter=10, regParam=0.01, labelCol='left')

In [35]:
#import pipeline libraries
from pyspark.ml import Pipeline

In [38]:
#chain the stages in execution order: index department, index salary,
#assemble the feature vector, then train the logistic regression
pipeline = Pipeline(
    stages=[transformer_dept, transformer_salary, fc, logisticreg],
)

In [39]:
#to split data into train (~70%) and test (~30%) data; a fixed seed is passed so the split can be repeated
train_data,test_data = data.randomSplit([0.7,0.3],seed=100)


In [43]:
#preview the training split
train_data.show()

+------------------+---------------+--------------+--------------------+------------------+-------------+----+---------------------+-----------+------+
|satisfaction_level|last_evaluation|number_project|average_montly_hours|time_spend_company|Work_accident|left|promotion_last_5years| department|salary|
+------------------+---------------+--------------+--------------------+------------------+-------------+----+---------------------+-----------+------+
|              0.09|           0.62|             6|                 294|                 4|            0|   1|                    0| accounting|   low|
|              0.09|           0.62|             6|                 294|                 4|            0|   1|                    0| accounting|   low|
|              0.09|           0.62|             6|                 294|                 4|            0|   1|                    0| accounting|   low|
|              0.09|           0.77|             5|                 275|                

In [42]:
#preview the test split
test_data.show()

+------------------+---------------+--------------+--------------------+------------------+-------------+----+---------------------+-----------+------+
|satisfaction_level|last_evaluation|number_project|average_montly_hours|time_spend_company|Work_accident|left|promotion_last_5years| department|salary|
+------------------+---------------+--------------+--------------------+------------------+-------------+----+---------------------+-----------+------+
|              0.09|           0.77|             5|                 275|                 4|            0|   1|                    0|product_mng|medium|
|              0.09|           0.77|             6|                 290|                 4|            0|   1|                    0|  technical|medium|
|              0.09|           0.77|             6|                 290|                 4|            0|   1|                    0|  technical|medium|
|              0.09|           0.78|             6|                 244|                

In [44]:
#fit all pipeline stages (both indexers, the assembler and the classifier) on train_data only
model = pipeline.fit(train_data)

In [45]:
#apply the fitted pipeline to test_data; this appends the indexed, feature and prediction columns
test_eval= model.transform(test_data)

In [47]:
#columns after scoring: the original ten plus those added by the pipeline stages
test_eval.columns

['satisfaction_level',
 'last_evaluation',
 'number_project',
 'average_montly_hours',
 'time_spend_company',
 'Work_accident',
 'left',
 'promotion_last_5years',
 'department',
 'salary',
 'department_en',
 'salary_en',
 'features',
 'rawPrediction',
 'probability',
 'prediction']

In [48]:
#inspect the model outputs next to the assembled feature vector
test_eval.select('features','rawPrediction','probability','prediction').show()

+--------------------+--------------------+--------------------+----------+
|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+----------+
|[0.09,0.77,5.0,27...|[-0.5642488331120...|[0.36256493283259...|       1.0|
|[0.09,0.77,6.0,29...|[-0.3707105211441...|[0.40836934577251...|       1.0|
|[0.09,0.77,6.0,29...|[-0.3707105211441...|[0.40836934577251...|       1.0|
|[0.09,0.78,6.0,24...|[-0.7996797558882...|[0.31009402636043...|       1.0|
|[0.09,0.78,6.0,25...|[-0.8540535805701...|[0.29858321842602...|       1.0|
|[0.09,0.78,6.0,26...|[-0.9040579133600...|[0.28821730929698...|       1.0|
|[0.09,0.78,7.0,29...|[-0.6454855407091...|[0.34400758028789...|       1.0|
|[0.09,0.79,6.0,29...|[-1.2937691804654...|[0.21521552121688...|       1.0|
|[0.09,0.8,6.0,247...|[-0.2116437008529...|[0.44728569760350...|       1.0|
|[0.09,0.8,6.0,301...|[-0.6574596752154...|[0.34131049092922...|       1.0|
|[0.09,0.8,6

In [64]:
#build the evaluator that scores the predictions against the 'left' label
#the metric is area under the ROC curve (the evaluator's default, spelled out here for clarity);
#the 'probability' vector is supplied as the score column
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(
    labelCol='left',
    rawPredictionCol='probability',
    metricName='areaUnderROC',
)

In [65]:
#Area under the ROC curve on the test set (the evaluator's default metric) - this is not classification accuracy
evaluator.evaluate(test_eval)

0.8218704304578666