In [None]:
# Create (or reuse) the SparkSession, registered under the application name 'employee'.
# Every Spark SQL / DataFrame call in this notebook goes through this object.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('employee')
    .getOrCreate()
)

In [None]:
# Load HR_comma_sep.csv into the DataFrame df: the first row supplies the
# column names and Spark infers each column's type from the data.
df = spark.read.csv(
    'HR_comma_sep.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Print df's schema so the column names and the inferred column types can be checked.
df.printSchema()

In [None]:
#Importing the module StringIndexer from subpackage ml.feature
from pyspark.ml.feature import StringIndexer

In [None]:
# Index the 'salary' column into a numeric column 'sal_label' so it can be used
# as a model feature, then preview the first 10 rows of the indexed DataFrame.
indexer = StringIndexer().setInputCol('salary').setOutputCol('sal_label')
indexed = (
    indexer
    .fit(df)
    .transform(df)
)
indexed.head(10)

In [None]:
#Importing the modules Vectors & VectorAssembler from subpackage ml.linalg & ml.feature
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [None]:
# List the column names of the indexed DataFrame; the feature columns handed to
# VectorAssembler in a later cell come from this DataFrame.
indexed.columns

In [None]:
# Pack the predictor columns into a single vector column named 'features'.
# 'left' (the column being predicted) and 'sales' are deliberately not included.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'satisfaction_level',
        'last_evaluation',
        'number_project',
        'average_montly_hours',
        'time_spend_company',
        'Work_accident',
        'promotion_last_5years',
        'sal_label',
    ],
)

In [None]:
# Run the assembler over the indexed DataFrame; df1 carries the added 'features' column.
df1 = assembler.transform(dataset=indexed)

In [None]:
# Indexer that turns the target column 'left' into a numeric 'label' column,
# the column name the evaluators later in this notebook use as labelCol.
labelIndexer = StringIndexer(inputCol="left", outputCol="label")

In [None]:
# Fit the label indexer on df1 and apply it, giving df2 with the added 'label' column.
df2 = (
    labelIndexer
    .fit(df1)
    .transform(df1)
)

In [None]:
# Split the data into roughly 70% training rows and 30% test rows.
# The seed is fixed so that rerunning the notebook on the same data yields the same
# split, and therefore comparable metrics (5043 is the same seed value the
# classifier in this notebook is given).
trainingData, testData = df2.randomSplit([0.7, 0.3], seed=5043)

In [None]:
#Importing required methods like RandomForestClassifer , Evaluator etc
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator


In [None]:
# Random forest configured with Gini impurity, a maximum tree depth of 20,
# 5 trees, the "auto" feature-subset strategy and a fixed random seed (5043).
classifier = RandomForestClassifier(
    impurity="gini",
    maxDepth=20,
    numTrees=5,
    featureSubsetStrategy="auto",
    seed=5043,
)

In [None]:
# Train the random forest on the training split.
model = classifier.fit(dataset=trainingData)

In [None]:
# Apply the trained model to the held-out test split to obtain its predictions.
predictions = model.transform(dataset=testData)

In [None]:
# Preview 5 test rows: one feature (satisfaction_level), the actual label
# and the predicted label side by side.
predictions.select(
    "satisfaction_level",
    "label",
    "prediction",
).show(5)

In [None]:
# Score the test predictions with MulticlassClassificationEvaluator using the
# 'accuracy' metric, comparing the 'label' column against the 'prediction' column.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(dataset=predictions)

In [None]:
# Display the test-set accuracy computed in the previous cell
# (about 98% in the author's recorded run).
accuracy

In [None]:
# Evaluate the same predictions with BinaryClassificationEvaluator.
# The evaluator is pointed at the model's 'rawPrediction' score column rather than
# the hard 0/1 'prediction' column: a ranking metric computed from already
# thresholded labels has a single operating point and does not reflect the
# model's scores.
# metricName is spelled out so it is clear that the reported number is the area
# under the ROC curve, not accuracy.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)

In [None]:
# Display the BinaryClassificationEvaluator score for the test predictions.
# NOTE: no accuracy metric is requested on this evaluator; per the PySpark docs its
# default metric is areaUnderROC, so read this value as AUC rather than accuracy
# (about 0.98 in the author's recorded run).
evaluator.evaluate(predictions)

# Comparing the models & predicting which technique works better.
I have used two classification models, i.e. 'Logistic Regression' and 'Random Forest Classifier'.
For Logistic Regression the accuracy is 64%.
For the Random Forest Classifier the accuracy is 98%.

Hence, comparing the obtained accuracies, I can say that the Random Forest Classifier technique works better.