In [None]:
!pip install pyspark

In [None]:
import pyspark
from pyspark.sql import SparkSession

In [None]:
# Start (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName('healthcare')
    .getOrCreate()
)

In [None]:
from google.colab import drive
# Mount Google Drive (Colab only) so the CSV under /content/drive can be read.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the appointments CSV from Drive: the first row is the header and
# Spark infers each column's type.
csv_path = '/content/drive/MyDrive/All files/Datasets/KaggleV2-May-2016.csv'
appointmentdf = spark.read.options(header=True, inferSchema=True).csv(csv_path)

In [None]:
# Summary statistics for every column of the raw frame; the string columns
# (Gender, Neighbourhood) report NULL for the mean.
appointmentdf.describe().show(5)

+-------+--------------------+-----------------+------+------------------+-------------+-------------------+-------------------+-------------------+--------------------+--------------------+-------------------+-------+
|summary|           PatientId|    AppointmentID|Gender|               Age|Neighbourhood|        Scholarship|       Hipertension|           Diabetes|          Alcoholism|             Handcap|       SMS_received|No-show|
+-------+--------------------+-----------------+------+------------------+-------------+-------------------+-------------------+-------------------+--------------------+--------------------+-------------------+-------+
|  count|              110527|           110527|110527|            110527|       110527|             110527|             110527|             110527|              110527|              110527|             110527| 110527|
|   mean|1.474962657103946...|5675305.123426855|  NULL| 37.08887421173107|         NULL|0.09826558216544373| 0.1972459218109

In [None]:
# Column names of the raw appointments frame.
appointmentdf.columns

['PatientId',
 'AppointmentID',
 'Gender',
 'ScheduledDay',
 'AppointmentDay',
 'Age',
 'Neighbourhood',
 'Scholarship',
 'Hipertension',
 'Diabetes',
 'Alcoholism',
 'Handcap',
 'SMS_received',
 'No-show']

In [None]:
# (column, type) pairs as inferred by the CSV reader.
appointmentdf.dtypes

[('PatientId', 'double'),
 ('AppointmentID', 'int'),
 ('Gender', 'string'),
 ('ScheduledDay', 'timestamp'),
 ('AppointmentDay', 'timestamp'),
 ('Age', 'int'),
 ('Neighbourhood', 'string'),
 ('Scholarship', 'int'),
 ('Hipertension', 'int'),
 ('Diabetes', 'int'),
 ('Alcoholism', 'int'),
 ('Handcap', 'int'),
 ('SMS_received', 'int'),
 ('No-show', 'string')]

In [None]:
# Row count per value of the target column 'No-show' (class balance).
appointmentdf.groupBy('No-show').count().show()

+-------+-----+
|No-show|count|
+-------+-----+
|     No|88208|
|    Yes|22319|
+-------+-----+



In [None]:
# Summary statistics restricted to the Age column.
# NOTE(review): the recorded minimum is -1, which looks like an invalid age — confirm and clean upstream.
appointmentdf.describe('Age').show()

+-------+------------------+
|summary|               Age|
+-------+------------------+
|  count|            110527|
|   mean| 37.08887421173107|
| stddev|23.110204963682584|
|    min|                -1|
|    max|               115|
+-------+------------------+



In [None]:
from pyspark.sql.functions import skewness,kurtosis

In [None]:
# Shape of the Age distribution: skewness and kurtosis over the whole frame.
appointmentdf.agg(
    skewness('Age'),
    kurtosis('Age'),
).show()

+-------------------+------------------+
|      skewness(Age)|     kurtosis(Age)|
+-------------------+------------------+
|0.12165636682687768|-0.952278601592428|
+-------------------+------------------+



In [None]:
# Average age within each value of the target column.
age_by_target = appointmentdf.groupBy('No-show').avg('Age')
age_by_target.show()

+-------+------------------+
|No-show|          avg(Age)|
+-------+------------------+
|     No|37.790064393252315|
|    Yes| 34.31766656212196|
+-------+------------------+



In [None]:
# Contingency table of the target against whether an SMS was received.
appointmentdf.stat.crosstab('No-show', 'SMS_received').show()

+--------------------+-----+-----+
|No-show_SMS_received|    0|    1|
+--------------------+-----+-----+
|                  No|62510|25698|
|                 Yes|12535| 9784|
+--------------------+-----+-----+



In [None]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

In [None]:
def _indexer(source_col, target_col):
    """Return a StringIndexer that encodes ``source_col`` into ``target_col``."""
    return StringIndexer(inputCol=source_col, outputCol=target_col)


# One indexer per categorical column; the last one encodes the target.
# NOTE(review): the indexed Neighbourhood value is later fed to linear models
# as a plain number — consider one-hot encoding it.
gender_dummy = _indexer('Gender', 'genderdummy')
neighbor_dummy = _indexer('Neighbourhood', 'neighbordummy')
scholar_dummy = _indexer('Scholarship', 'scholardummy')
bp_dummy = _indexer('Hipertension', 'bpdummy')
sugar_dummy = _indexer('Diabetes', 'sugardummy')
alcohol_dummy = _indexer('Alcoholism', 'alcoholdummy')
handicap_dummy = _indexer('Handcap', 'handicapdummy')
sms_dummy = _indexer('SMS_received', 'smsdummy')
noshow_dummy = _indexer('No-show', 'noshowdummy')

In [None]:
# Chain all indexers so they are fitted and applied in a single pass.
pipeline = Pipeline(
    stages=[
        gender_dummy,
        neighbor_dummy,
        scholar_dummy,
        bp_dummy,
        sugar_dummy,
        alcohol_dummy,
        handicap_dummy,
        sms_dummy,
        noshow_dummy,
    ]
)

In [None]:
# Fit the indexers on the raw frame, then append the indexed columns to it.
# Despite its name, the result is a DataFrame, not an RDD.
indexer_model = pipeline.fit(appointmentdf)
appointmentRDD = indexer_model.transform(appointmentdf)

In [None]:
# Summary statistics again, now including the newly added *dummy columns.
appointmentRDD.describe().show(5)

+-------+--------------------+-----------------+------+------------------+-------------+-------------------+-------------------+-------------------+--------------------+--------------------+-------------------+-------+-------------------+------------------+-------------------+-------------------+-------------------+--------------------+--------------------+-------------------+-------------------+
|summary|           PatientId|    AppointmentID|Gender|               Age|Neighbourhood|        Scholarship|       Hipertension|           Diabetes|          Alcoholism|             Handcap|       SMS_received|No-show|        genderdummy|     neighbordummy|       scholardummy|            bpdummy|         sugardummy|        alcoholdummy|       handicapdummy|           smsdummy|        noshowdummy|
+-------+--------------------+-----------------+------+------------------+-------------+-------------------+-------------------+-------------------+--------------------+--------------------+----------

In [None]:
# Column names after indexing: the originals followed by the *dummy columns.
appointmentRDD.columns

['PatientId',
 'AppointmentID',
 'Gender',
 'ScheduledDay',
 'AppointmentDay',
 'Age',
 'Neighbourhood',
 'Scholarship',
 'Hipertension',
 'Diabetes',
 'Alcoholism',
 'Handcap',
 'SMS_received',
 'No-show',
 'genderdummy',
 'neighbordummy',
 'scholardummy',
 'bpdummy',
 'sugardummy',
 'alcoholdummy',
 'handicapdummy',
 'smsdummy',
 'noshowdummy']

In [None]:
# Columns to discard before modelling: identifiers, timestamps and the raw
# versions of every column that now has an indexed counterpart.
colsdrop = [
    'PatientId',
    'AppointmentID',
    'Gender',
    'ScheduledDay',
    'AppointmentDay',
    'Neighbourhood',
    'Scholarship',
    'Hipertension',
    'Diabetes',
    'Alcoholism',
    'Handcap',
    'SMS_received',
    'No-show',
]

In [None]:
# Keep only Age and the indexed columns by dropping everything listed in colsdrop.
appointmentRDDDF=appointmentRDD.drop(*colsdrop)

In [None]:
# Preview the first rows of the modelling frame.
appointmentRDDDF.show(6)

+---+-----------+-------------+------------+-------+----------+------------+-------------+--------+-----------+
|Age|genderdummy|neighbordummy|scholardummy|bpdummy|sugardummy|alcoholdummy|handicapdummy|smsdummy|noshowdummy|
+---+-----------+-------------+------------+-------+----------+------------+-------------+--------+-----------+
| 62|        0.0|          3.0|         0.0|    1.0|       0.0|         0.0|          0.0|     0.0|        0.0|
| 56|        1.0|          3.0|         0.0|    0.0|       0.0|         0.0|          0.0|     0.0|        0.0|
| 62|        0.0|         49.0|         0.0|    0.0|       0.0|         0.0|          0.0|     0.0|        0.0|
|  8|        0.0|         75.0|         0.0|    0.0|       0.0|         0.0|          0.0|     0.0|        0.0|
| 56|        0.0|          3.0|         0.0|    1.0|       1.0|         0.0|          0.0|     0.0|        0.0|
| 76|        0.0|         43.0|         0.0|    1.0|       0.0|         0.0|          0.0|     0.0|     

In [None]:
from pyspark.ml.feature import RFormula

In [None]:
# R-style formula: predict noshowdummy from every other column, writing the
# assembled vector to 'features' and the target to 'label'.
formula = RFormula(
    formula='noshowdummy~.',
    featuresCol='features',
    labelCol='label',
)

In [None]:
# Build the features/label columns on top of the modelling frame.
# NOTE(review): this rebinds appointmentRDD, which previously held the indexed
# raw frame; re-running earlier cells after this one will see a different object.
formula_model = formula.fit(appointmentRDDDF)
appointmentRDD = formula_model.transform(appointmentRDDDF)

In [None]:
# Inspect the assembled feature vectors (sparse, 9 entries each) next to their labels.
appointmentRDD.select('features','label').show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(9,[0,2,4],[62.0,...|  0.0|
|(9,[0,1,2],[56.0,...|  0.0|
|(9,[0,2],[62.0,49...|  0.0|
|(9,[0,2],[8.0,75.0])|  0.0|
|(9,[0,2,4,5],[56....|  0.0|
+--------------------+-----+
only showing top 5 rows



In [None]:
# The source modelling frame is unchanged by the RFormula step (no features/label columns here).
appointmentRDDDF.show(5)

+---+-----------+-------------+------------+-------+----------+------------+-------------+--------+-----------+
|Age|genderdummy|neighbordummy|scholardummy|bpdummy|sugardummy|alcoholdummy|handicapdummy|smsdummy|noshowdummy|
+---+-----------+-------------+------------+-------+----------+------------+-------------+--------+-----------+
| 62|        0.0|          3.0|         0.0|    1.0|       0.0|         0.0|          0.0|     0.0|        0.0|
| 56|        1.0|          3.0|         0.0|    0.0|       0.0|         0.0|          0.0|     0.0|        0.0|
| 62|        0.0|         49.0|         0.0|    0.0|       0.0|         0.0|          0.0|     0.0|        0.0|
|  8|        0.0|         75.0|         0.0|    0.0|       0.0|         0.0|          0.0|     0.0|        0.0|
| 56|        0.0|          3.0|         0.0|    1.0|       1.0|         0.0|          0.0|     0.0|        0.0|
+---+-----------+-------------+------------+-------+----------+------------+-------------+--------+-----

In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator

In [None]:
# Logistic regression estimator reading the RFormula output columns.
logit = LogisticRegression(
    featuresCol='features',
    labelCol='label',
)

In [None]:
# Fit on the full dataset.
# NOTE(review): there is no train/test split, so every metric below is a training-set score.
logitmodel=logit.fit(appointmentRDD)

In [None]:
# Accuracy from the model's training summary.
logitmodel.summary.accuracy

0.798067440534892

In [None]:
# ROC-AUC from the model's training summary.
logitmodel.summary.areaUnderROC

0.5964373847872473

In [None]:
# Score the same rows the model was trained on.
logitpredict=logitmodel.transform(appointmentRDD)

In [None]:
# Preview the prediction columns (rawPrediction, probability, prediction) appended by the model.
logitpredict.show()

+---+-----------+-------------+------------+-------+----------+------------+-------------+--------+-----------+--------------------+-----+--------------------+--------------------+----------+
|Age|genderdummy|neighbordummy|scholardummy|bpdummy|sugardummy|alcoholdummy|handicapdummy|smsdummy|noshowdummy|            features|label|       rawPrediction|         probability|prediction|
+---+-----------+-------------+------------+-------+----------+------------+-------------+--------+-----------+--------------------+-----+--------------------+--------------------+----------+
| 62|        0.0|          3.0|         0.0|    1.0|       0.0|         0.0|          0.0|     0.0|        0.0|(9,[0,2,4],[62.0,...|  0.0|[1.84923515524927...|[0.86403727644512...|       0.0|
| 56|        1.0|          3.0|         0.0|    0.0|       0.0|         0.0|          0.0|     0.0|        0.0|(9,[0,1,2],[56.0,...|  0.0|[1.76093057588088...|[0.85332616995024...|       0.0|
| 62|        0.0|         49.0|         

In [None]:
# Shared evaluators reused for every model below; the column names are the
# library defaults, spelled out so the dependency on them is visible.
accuracy = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
auc = BinaryClassificationEvaluator(
    labelCol='label',
    rawPredictionCol='rawPrediction',
    metricName='areaUnderROC',
)

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

In [None]:
# Decision tree on the default 'features'/'label' columns.
# maxBins must be at least the number of categories of the largest categorical
# feature — presumably the indexed Neighbourhood column; TODO confirm it has 81 values.
# A fixed seed keeps the fitted tree identical between notebook runs.
tree = DecisionTreeClassifier(maxBins=81, seed=42)

In [None]:
# Fit on the full dataset (NOTE(review): no held-out split).
treemodel=tree.fit(appointmentRDD)

In [None]:
# Score the same rows the tree was trained on.
treepredict=treemodel.transform(appointmentRDD)

In [None]:
# Training-set accuracy of the decision tree.
accuracy.evaluate(treepredict)

0.798067440534892

In [None]:
# Training-set ROC-AUC of the decision tree.
# NOTE(review): the recorded value of 0.5 is chance level — verify the tree produces any useful ranking.
auc.evaluate(treepredict)

0.5

In [None]:
from pyspark.ml.classification import RandomForestClassifier

In [None]:
# Random forest on the default 'features'/'label' columns.
# maxBins must cover the largest categorical feature — presumably the indexed
# Neighbourhood column; TODO confirm it has 81 values.
# The forest is randomised, so a fixed seed is needed for the reported metrics
# to be reproducible between notebook runs.
rf = RandomForestClassifier(maxBins=81, seed=42)

In [None]:
# Fit on the full dataset (NOTE(review): no held-out split).
rfmodel=rf.fit(appointmentRDD)

In [None]:
# Score the same rows the forest was trained on.
rfpredict=rfmodel.transform(appointmentRDD)

In [None]:
# Training-set accuracy of the random forest.
accuracy.evaluate(rfpredict)

0.798067440534892

In [None]:
# Training-set ROC-AUC of the random forest.
auc.evaluate(rfpredict)

0.6048981627477866

In [None]:
from pyspark.ml.classification import GBTClassifier

In [None]:
# Gradient-boosted trees on the default 'features'/'label' columns.
# maxBins must cover the largest categorical feature — presumably the indexed
# Neighbourhood column; TODO confirm it has 81 values.
# A fixed seed makes the reported metrics reproducible between notebook runs.
gbm = GBTClassifier(maxBins=81, seed=42)

In [None]:
# Fit on the full dataset (NOTE(review): no held-out split).
gbmmodel=gbm.fit(appointmentRDD)

In [None]:
# Score the same rows the boosted model was trained on.
gbmpredict=gbmmodel.transform(appointmentRDD)

In [None]:
# Training-set accuracy of the gradient-boosted model.
accuracy.evaluate(gbmpredict)

0.7986193418802645

In [None]:
# Training-set ROC-AUC of the gradient-boosted model.
auc.evaluate(gbmpredict)

0.6442166014137939

In [None]:
from pyspark.ml.classification import LinearSVC

In [None]:
# Linear support-vector classifier; the column names are the library defaults,
# written out to match the other estimators.
svc = LinearSVC(
    featuresCol='features',
    labelCol='label',
)

In [None]:
# Fit on the full dataset (NOTE(review): no held-out split).
svcmodel=svc.fit(appointmentRDD)

In [None]:
# Score the same rows the SVM was trained on.
svcpredict=svcmodel.transform(appointmentRDD)

In [None]:
# Training-set accuracy of the linear SVM.
accuracy.evaluate(svcpredict)

0.798067440534892

In [None]:
# Training-set ROC-AUC of the linear SVM.
auc.evaluate(svcpredict)

0.5407340830926192

# Summary

| Model Name | Accuracy | ROC-AUC |
|--|--|--|
| Logistic Regression | 0.798067440534892 | 0.5964373847872473 |
| Decision Tree | 0.798067440534892 | 0.5 |
| Random Forest | 0.798067440534892 | 0.6048981627477866 |
| **Gradient Boosting Machine** | **0.7986193418802645** | **0.6442166014137939** |
| Support Vector Machine | 0.798067440534892 | 0.5407340830926192 |

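Judging by accuracy and ROC-AUC, Gradient Boosting is the best model, because it gives the highest accuracy and the highest ROC-AUC score. Two caveats apply. First, every metric above was computed on the same data the models were trained on, so a held-out test set is needed to confirm this ranking. Second, an accuracy of about 0.798 equals the share of "No" appointments (88,208 of 110,527), i.e. what a model that always predicts "No" would achieve, so ROC-AUC is the more informative metric here.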