In [2]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=61a85c4b8f45162bcb024cd3e136a7669b45adbb3123941d952bf2a3925a2f13
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [3]:
import pyspark
from pyspark.sql import SparkSession

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
spark=SparkSession.builder.appName("classification").getOrCreate()

In [6]:
hrdata=spark.read.csv("/content/drive/MyDrive/train_hranalytics.csv",inferSchema=True,header=True)

In [18]:
hrdata.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- department: string (nullable = true)
 |-- region: string (nullable = true)
 |-- education: string (nullable = false)
 |-- gender: string (nullable = true)
 |-- recruitment_channel: string (nullable = true)
 |-- no_of_trainings: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- previous_year_rating: integer (nullable = true)
 |-- length_of_service: integer (nullable = true)
 |-- KPIs_met >80%: integer (nullable = true)
 |-- awards_won?: integer (nullable = true)
 |-- avg_training_score: integer (nullable = true)
 |-- is_promoted: integer (nullable = true)



In [8]:
hrdata.groupBy('education').count().show()

+----------------+-----+
|       education|count|
+----------------+-----+
|            null| 2409|
| Below Secondary|  805|
|Master's & above|14925|
|      Bachelor's|36669|
+----------------+-----+



In [9]:
hrdata=hrdata.na.fill(value="Bachelor's",subset=['education'])

In [12]:
hrdata.groupBy('previous_year_rating').count().show()

+--------------------+-----+
|previous_year_rating|count|
+--------------------+-----+
|                   1| 6223|
|                   3|22742|
|                   5|11741|
|                   4| 9877|
|                   2| 4225|
+--------------------+-----+



In [11]:
hrdata=hrdata.na.fill(value=3,subset=['previous_year_rating'])

In [15]:
hrdata.columns

['employee_id',
 'department',
 'region',
 'education',
 'gender',
 'recruitment_channel',
 'no_of_trainings',
 'age',
 'previous_year_rating',
 'length_of_service',
 'KPIs_met >80%',
 'awards_won?',
 'avg_training_score',
 'is_promoted']

In [16]:
from pyspark.ml.feature import StringIndexer

In [17]:
from pyspark.ml import Pipeline

In [19]:
indexer=[StringIndexer(inputCol=col,outputCol=col+"index").fit(hrdata)
for col in list(set(hrdata.columns)-set(['no_of_trainings','age','length_of_service','avg_training_score','employee_id']))]

In [20]:
pipeline=Pipeline(stages=indexer)

In [21]:
hrdata=pipeline.fit(hrdata).transform(hrdata)

In [22]:
hrdata.columns

['employee_id',
 'department',
 'region',
 'education',
 'gender',
 'recruitment_channel',
 'no_of_trainings',
 'age',
 'previous_year_rating',
 'length_of_service',
 'KPIs_met >80%',
 'awards_won?',
 'avg_training_score',
 'is_promoted',
 'previous_year_ratingindex',
 'educationindex',
 'genderindex',
 'recruitment_channelindex',
 'awards_won?index',
 'is_promotedindex',
 'regionindex',
 'departmentindex',
 'KPIs_met >80%index']

In [23]:
columnstodrop=['employee_id','department','region','education','gender','recruitment_channel',
               'previous_year_rating','KPIs_met >80%','awards_won?','is_promoted',]

In [24]:
hrdata=hrdata.drop(*columnstodrop)

In [25]:
from pyspark.ml.feature import RFormula

In [26]:
formula=RFormula(formula="is_promotedindex~.",featuresCol='features',labelCol='label')

In [27]:
hrdata=formula.fit(hrdata).transform(hrdata)

In [28]:
hrdata.select('features','label').show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.0,35.0,8.0,49....|  0.0|
|[1.0,30.0,4.0,60....|  0.0|
|(12,[0,1,2,3,7,9]...|  0.0|
|(12,[0,1,2,3,4,9]...|  0.0|
|(12,[0,1,2,3,9,10...|  0.0|
+--------------------+-----+
only showing top 5 rows



In [43]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator,BinaryClassificationEvaluator

In [36]:
logit=LogisticRegression()

In [37]:
logitmodel=logit.fit(hrdata)

In [38]:
logitmodel.summary.accuracy

0.9209057071960298

In [39]:
logitmodel.summary.areaUnderROC

0.8074795905654398

In [40]:
accuracy=MulticlassClassificationEvaluator(metricName="accuracy")

In [41]:
hrdata.columns

['no_of_trainings',
 'age',
 'length_of_service',
 'avg_training_score',
 'previous_year_ratingindex',
 'educationindex',
 'genderindex',
 'recruitment_channelindex',
 'awards_won?index',
 'is_promotedindex',
 'regionindex',
 'departmentindex',
 'KPIs_met >80%index',
 'features',
 'label']

In [44]:
auc=BinaryClassificationEvaluator()

In [45]:
from pyspark.ml.classification import DecisionTreeClassifier

In [46]:
tree=DecisionTreeClassifier(maxBins=35)

In [47]:
treemodel=tree.fit(hrdata)

In [48]:
treepredict=treemodel.transform(hrdata)

In [49]:
accuracy.evaluate(treepredict)

0.9275835644431469

In [50]:
auc.evaluate(treepredict)

0.5802576543176963

In [51]:
from pyspark.ml.classification import RandomForestClassifier

In [52]:
RF=RandomForestClassifier(maxBins=35)

In [53]:
RFmodel=RF.fit(hrdata)

In [54]:
RFpredict=RFmodel.transform(hrdata)

In [55]:
accuracy.evaluate(RFpredict)

0.9263428696540651

In [56]:
auc.evaluate(RFpredict)

0.8321735473151608

In [57]:
from pyspark.ml.classification import GBTClassifier

In [58]:
gbm=GBTClassifier(maxBins=35)

In [59]:
gbmmodel=gbm.fit(hrdata)

In [60]:
gbmpredict=gbmmodel.transform(hrdata)

In [61]:
accuracy.evaluate(gbmpredict)

0.9417420814479638

In [62]:
auc.evaluate(gbmpredict)

0.9160726764545135