In [2]:
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import *
from pyspark.sql.functions import *
#spark = sqlContext

In [3]:
hr_df = spark.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('F:/JupyterML/ML_Practice/datasets/HR_attrition.csv')

In [4]:
hr_df.show(2)

+---+---------+-----------------+---------+--------------------+----------------+---------+--------------+-------------+--------------+-----------------------+------+----------+--------------+--------+------------------+---------------+-------------+-------------+-----------+------------------+------+--------+-----------------+-----------------+------------------------+-------------+----------------+-----------------+---------------------+---------------+--------------+------------------+-----------------------+--------------------+
|Age|Attrition|   BusinessTravel|DailyRate|          Department|DistanceFromHome|Education|EducationField|EmployeeCount|EmployeeNumber|EnvironmentSatisfaction|Gender|HourlyRate|JobInvolvement|JobLevel|           JobRole|JobSatisfaction|MaritalStatus|MonthlyIncome|MonthlyRate|NumCompaniesWorked|Over18|OverTime|PercentSalaryHike|PerformanceRating|RelationshipSatisfaction|StandardHours|StockOptionLevel|TotalWorkingYears|TrainingTimesLastYear|WorkLifeBalance|Ye

In [5]:
hr_df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Attrition: string (nullable = true)
 |-- BusinessTravel: string (nullable = true)
 |-- DailyRate: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- DistanceFromHome: integer (nullable = true)
 |-- Education: integer (nullable = true)
 |-- EducationField: string (nullable = true)
 |-- EmployeeCount: integer (nullable = true)
 |-- EmployeeNumber: integer (nullable = true)
 |-- EnvironmentSatisfaction: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- HourlyRate: integer (nullable = true)
 |-- JobInvolvement: integer (nullable = true)
 |-- JobLevel: integer (nullable = true)
 |-- JobRole: string (nullable = true)
 |-- JobSatisfaction: integer (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- MonthlyIncome: integer (nullable = true)
 |-- MonthlyRate: integer (nullable = true)
 |-- NumCompaniesWorked: integer (nullable = true)
 |-- Over18: string (nullable = true)
 |-- OverTime: string 

In [6]:
hr_df.select("Attrition").show(5)

+---------+
|Attrition|
+---------+
|      Yes|
|       No|
|      Yes|
|       No|
|       No|
+---------+
only showing top 5 rows



In [7]:
hr_df =hr_df.withColumn("Attrition_flag",when(col('Attrition')== 'No',0).otherwise(1))

In [8]:
hr_df.select("Attrition_flag").show(12)

+--------------+
|Attrition_flag|
+--------------+
|             1|
|             0|
|             1|
|             0|
|             0|
|             0|
|             0|
|             0|
|             0|
|             0|
|             0|
|             0|
+--------------+
only showing top 12 rows



In [9]:
drop_list = ['Attrition']

hr_df = hr_df.select([column for column in hr_df.columns if column not in drop_list])

In [10]:
hr_df.dtypes

[('Age', 'int'),
 ('BusinessTravel', 'string'),
 ('DailyRate', 'int'),
 ('Department', 'string'),
 ('DistanceFromHome', 'int'),
 ('Education', 'int'),
 ('EducationField', 'string'),
 ('EmployeeCount', 'int'),
 ('EmployeeNumber', 'int'),
 ('EnvironmentSatisfaction', 'int'),
 ('Gender', 'string'),
 ('HourlyRate', 'int'),
 ('JobInvolvement', 'int'),
 ('JobLevel', 'int'),
 ('JobRole', 'string'),
 ('JobSatisfaction', 'int'),
 ('MaritalStatus', 'string'),
 ('MonthlyIncome', 'int'),
 ('MonthlyRate', 'int'),
 ('NumCompaniesWorked', 'int'),
 ('Over18', 'string'),
 ('OverTime', 'string'),
 ('PercentSalaryHike', 'int'),
 ('PerformanceRating', 'int'),
 ('RelationshipSatisfaction', 'int'),
 ('StandardHours', 'int'),
 ('StockOptionLevel', 'int'),
 ('TotalWorkingYears', 'int'),
 ('TrainingTimesLastYear', 'int'),
 ('WorkLifeBalance', 'int'),
 ('YearsAtCompany', 'int'),
 ('YearsInCurrentRole', 'int'),
 ('YearsSinceLastPromotion', 'int'),
 ('YearsWithCurrManager', 'int'),
 ('Attrition_flag', 'int')]

In [11]:
(traindf, testdf) = hr_df.randomSplit([0.7,0.3],seed =42)

In [12]:
print(traindf.count())
print(testdf.count())

1020
450


In [13]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer,VectorIndexer,OneHotEncoder, VectorAssembler, IndexToString
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import *

In [14]:
cat_features = [t[0] for t in hr_df.dtypes if t[1] =='string']
num_features = [t[0] for t in hr_df.dtypes if t[1] =='int']

In [15]:
cat_features

['BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'Over18',
 'OverTime']

In [16]:
num_features

['Age',
 'DailyRate',
 'DistanceFromHome',
 'Education',
 'EmployeeCount',
 'EmployeeNumber',
 'EnvironmentSatisfaction',
 'HourlyRate',
 'JobInvolvement',
 'JobLevel',
 'JobSatisfaction',
 'MonthlyIncome',
 'MonthlyRate',
 'NumCompaniesWorked',
 'PercentSalaryHike',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StandardHours',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'WorkLifeBalance',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager',
 'Attrition_flag']

In [17]:
hr_df.select(cat_features).show(4)

+-----------------+--------------------+--------------+------+--------------------+-------------+------+--------+
|   BusinessTravel|          Department|EducationField|Gender|             JobRole|MaritalStatus|Over18|OverTime|
+-----------------+--------------------+--------------+------+--------------------+-------------+------+--------+
|    Travel_Rarely|               Sales| Life Sciences|Female|     Sales Executive|       Single|     Y|     Yes|
|Travel_Frequently|Research & Develo...| Life Sciences|  Male|  Research Scientist|      Married|     Y|      No|
|    Travel_Rarely|Research & Develo...|         Other|  Male|Laboratory Techni...|       Single|     Y|     Yes|
|Travel_Frequently|Research & Develo...| Life Sciences|Female|  Research Scientist|      Married|     Y|     Yes|
+-----------------+--------------------+--------------+------+--------------------+-------------+------+--------+
only showing top 4 rows



In [18]:
hr_df.select("JobRole").distinct().show()

+--------------------+
|             JobRole|
+--------------------+
|     Sales Executive|
|Manufacturing Dir...|
|Laboratory Techni...|
|Sales Representative|
|Healthcare Repres...|
|  Research Scientist|
|             Manager|
|   Research Director|
|     Human Resources|
+--------------------+



In [19]:
hr_df.select("BusinessTravel").distinct().show()

+-----------------+
|   BusinessTravel|
+-----------------+
|Travel_Frequently|
|       Non-Travel|
|    Travel_Rarely|
+-----------------+



In [20]:
hr_df.select("Department").distinct().show()

+--------------------+
|          Department|
+--------------------+
|               Sales|
|Research & Develo...|
|     Human Resources|
+--------------------+



In [21]:
cat_features

['BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'Over18',
 'OverTime']

In [22]:
btIndexer = StringIndexer(inputCol='BusinessTravel', outputCol='indexedBT')
DepartmentIndexer = StringIndexer(inputCol='Department', outputCol='indexedDepartment')
EducationFieldIndexer = StringIndexer(inputCol='EducationField', outputCol='indexedEducationField')
GenderIndexer = StringIndexer(inputCol='Gender', outputCol='indexedGender')
JobRoleIndexer = StringIndexer(inputCol='JobRole', outputCol='indexedJobRole')
MaritalStatusIndexer = StringIndexer(inputCol='MaritalStatus', outputCol='indexedMaritalStatus')
Over18Indexer = StringIndexer(inputCol='Over18', outputCol='indexedOver18')
OverTimeIndexer = StringIndexer(inputCol='OverTime', outputCol='indexedOverTime')                                 

In [23]:
###ONEHOT ENCODER
btEncoder = OneHotEncoder(dropLast=True, inputCol="indexedBT", outputCol="BTvec")
deptEncoder = OneHotEncoder(dropLast=True, inputCol="indexedDepartment", outputCol="Departmentvec")
jobroleEcoder = OneHotEncoder(dropLast=True, inputCol="indexedJobRole", outputCol="JobRolevec")
eduEcoder = OneHotEncoder(dropLast=True, inputCol='indexedEducationField', outputCol="eduvec")


In [24]:
input_col = ['Age','DailyRate','DistanceFromHome','Education','EmployeeCount','EmployeeNumber','EnvironmentSatisfaction','HourlyRate','JobInvolvement','JobLevel','JobSatisfaction','MonthlyIncome','MonthlyRate','NumCompaniesWorked','PercentSalaryHike','PerformanceRating','RelationshipSatisfaction','StandardHours','StockOptionLevel','TotalWorkingYears','TrainingTimesLastYear','WorkLifeBalance','YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager','Attrition_flag','indexedMaritalStatus','indexedOver18','indexedOverTime','indexedGender','BTvec','Departmentvec','JobRolevec','eduvec']

In [25]:
assembler = VectorAssembler(inputCols=input_col,outputCol="features")

In [28]:
pipeline_features=[btIndexer,DepartmentIndexer,EducationFieldIndexer,GenderIndexer,JobRoleIndexer,MaritalStatusIndexer,Over18Indexer,OverTimeIndexer,btEncoder,deptEncoder,jobroleEcoder,eduEcoder,assembler,rf]

In [29]:
rf = RandomForestClassifier(labelCol="Attrition_flag", featuresCol="features")

In [30]:
pipeline = Pipeline(stages=pipeline_features)

In [31]:
model = pipeline.fit(traindf)

In [32]:
predictions = model.transform(testdf)

In [33]:
predictions.columns

['Age',
 'BusinessTravel',
 'DailyRate',
 'Department',
 'DistanceFromHome',
 'Education',
 'EducationField',
 'EmployeeCount',
 'EmployeeNumber',
 'EnvironmentSatisfaction',
 'Gender',
 'HourlyRate',
 'JobInvolvement',
 'JobLevel',
 'JobRole',
 'JobSatisfaction',
 'MaritalStatus',
 'MonthlyIncome',
 'MonthlyRate',
 'NumCompaniesWorked',
 'Over18',
 'OverTime',
 'PercentSalaryHike',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StandardHours',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'WorkLifeBalance',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager',
 'Attrition_flag',
 'indexedBT',
 'indexedDepartment',
 'indexedEducationField',
 'indexedGender',
 'indexedJobRole',
 'indexedMaritalStatus',
 'indexedOver18',
 'indexedOverTime',
 'BTvec',
 'Departmentvec',
 'JobRolevec',
 'eduvec',
 'features',
 'rawPrediction',
 'probability',
 'prediction']

In [34]:
predictions.select('Attrition_flag','rawPrediction','probability','prediction').show(5,False)

+--------------+---------------------------------------+----------------------------------------+----------+
|Attrition_flag|rawPrediction                          |probability                             |prediction|
+--------------+---------------------------------------+----------------------------------------+----------+
|0             |[13.472627891397648,6.527372108602352] |[0.6736313945698824,0.3263686054301176] |0.0       |
|0             |[16.282813076582833,3.717186923417167] |[0.8141406538291417,0.18585934617085836]|0.0       |
|0             |[13.472627891397648,6.527372108602352] |[0.6736313945698824,0.3263686054301176] |0.0       |
|1             |[3.1115573267133536,16.888442673286647]|[0.15557786633566767,0.8444221336643324]|1.0       |
|0             |[14.159779543744547,5.840220456255453] |[0.7079889771872273,0.2920110228127727] |0.0       |
+--------------+---------------------------------------+----------------------------------------+----------+
only showing top 5 

In [35]:
pred = predictions.select(col("Attrition_flag").cast("Float"),col("prediction"))

In [36]:
pred.show(3)

+--------------+----------+
|Attrition_flag|prediction|
+--------------+----------+
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
+--------------+----------+
only showing top 3 rows



In [37]:
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='Attrition_flag', metricName='accuracy')

In [38]:
accuracy = evaluator.evaluate(pred)

In [39]:
print("Accuracy = %g" % accuracy)

Accuracy = 0.968889


In [40]:
evaluatorf1 = MulticlassClassificationEvaluator(labelCol="Attrition_flag", predictionCol="prediction", metricName="f1")
f1 = evaluatorf1.evaluate(pred)
print("f1 = %g" % f1)

f1 = 0.967413


In [41]:
evaluatorwp = MulticlassClassificationEvaluator(labelCol="Attrition_flag", predictionCol="prediction", metricName="weightedPrecision")
wp = evaluatorwp.evaluate(pred)
print("wp = %g" % wp)

wp = 0.969992


In [42]:
evaluatorwr = MulticlassClassificationEvaluator(labelCol="Attrition_flag", predictionCol="prediction", metricName="weightedrecall")
wr = evaluatorwp.evaluate(pred)
print("wr = %g" % wr)

wr = 0.969992
