# Logistic Regression Model

In [None]:
# SparkSession is the entry point to Spark SQL and the first object to create
# in a Spark SQL application. Build (or reuse) a session whose application
# name is 'employee'.
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName('employee')
    .getOrCreate()
)

In [None]:
# Load HR_comma_sep.csv into the DataFrame df, treating the first row as the
# header and letting Spark infer each column's type.
df = spark.read.csv(
    'HR_comma_sep.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Print the schema (column names and types) of the DataFrame df.
df.printSchema()

In [None]:
#Importing the module StringIndexer from subpackage ml.feature
from pyspark.ml.feature import StringIndexer

In [None]:
# Logistic regression needs numeric inputs, so map the string column 'salary'
# to a numeric index column 'sal_label' with StringIndexer. Fit the indexer on
# df, apply it to df, then look at the first 10 rows of the indexed result.
indexer = StringIndexer(outputCol='sal_label', inputCol='salary')
indexed = (
    indexer
    .fit(df)
    .transform(df)
)
indexed.head(10)

In [None]:
#Importing the modules Vectors & VectorAssembler from subpackage ml.linalg & ml.feature
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [None]:
# List the column names of the indexed DataFrame.
indexed.columns

In [None]:
# Combine the predictor columns into a single vector column 'features' using
# VectorAssembler. The target column 'left' is excluded, as is 'sales', which
# is not considered useful for this prediction. The string column 'salary' is
# represented by its indexed form 'sal_label'.
feature_columns = [
    'satisfaction_level',
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
    'Work_accident',
    'promotion_last_5years',
    'sal_label',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [None]:
# Apply the assembler to the indexed DataFrame; the resulting DataFrame,
# output, gains the assembled 'features' vector column.
output = assembler.transform(indexed)

In [None]:
# Keep only the two columns the model needs: the assembled 'features' vector
# and the target column 'left'.
final_data = output.select('features', 'left')

In [None]:
# Split the data into roughly 70% training rows and 30% test rows.
# randomSplit assigns rows at random, so the proportions are approximate.
# A fixed seed is passed so that re-running the notebook on the same data
# yields the same split and therefore comparable evaluation metrics.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [None]:
#Importing the LogisticRegression module from the classification subpackage
from pyspark.ml.classification import LogisticRegression

In [None]:
# Create a LogisticRegression estimator that uses 'left' as the label column
# and the 'features' vector column (the estimator's default) as input.
# Employ_left is the unfitted estimator, not a DataFrame.
Employ_left = LogisticRegression(featuresCol='features', labelCol='left')

In [None]:
# Fit the logistic regression estimator on the training data to obtain the
# trained model.
fitted_left_model = Employ_left.fit(train_data)

In [None]:
# Get the summary object attached to the fitted model. The model was fitted
# on train_data, so this summary describes results on the training set.
train_summary = fitted_left_model.summary

In [None]:
# Show descriptive statistics (the output of DataFrame.describe) for the
# training-set predictions, rather than the individual predicted rows.
train_predictions = train_summary.predictions
train_predictions.describe().show()

In [None]:
#Importing BinaryClassificationEvaluator for evaluating the model
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Evaluate the already-fitted model on the held-out test data (the model is
# not refitted here). The returned summary, pred_and_labels, carries the
# test-set predictions used below to measure model quality.
pred_and_labels = fitted_left_model.evaluate(test_data)

In [None]:
# Show the test-set predictions: the predicted label and probability
# alongside the actual value of 'left'.
pred_and_labels.predictions.show()

In [None]:
# Build the evaluator that scores predictions against the 'left' label.
# Area-under-curve metrics need a continuous score per row, so the evaluator
# reads the model's 'rawPrediction' column. Feeding it the hard 0/1
# 'prediction' column would collapse the ROC curve to a single threshold and
# understate the AUC.
left_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='left')

In [None]:
# Compute the evaluator's default metric, areaUnderROC, on the test-set
# predictions.
auc = left_eval.evaluate(pred_and_labels.predictions)

In [None]:
# Display the area under the ROC curve. Note that this is AUC, not
# classification accuracy. An earlier run recorded 0.6471; the value depends
# on the random train/test split and on how the evaluator is configured.
auc