# Prediction of potential risk of coronary heart disease

In [134]:
from pyspark import Row

from pyspark.sql import SparkSession
from pyspark.sql.functions import corr

from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier,\
                                      RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

spark = SparkSession.builder.appName("heart_disease").getOrCreate()

df = spark.read.csv(path="data_cardiovascular_risk.csv", inferSchema=True, header=True)

df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- education: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- is_smoking: string (nullable = true)
 |-- cigsPerDay: double (nullable = true)
 |-- BPMeds: double (nullable = true)
 |-- prevalentStroke: integer (nullable = true)
 |-- prevalentHyp: integer (nullable = true)
 |-- diabetes: integer (nullable = true)
 |-- totChol: double (nullable = true)
 |-- sysBP: double (nullable = true)
 |-- diaBP: double (nullable = true)
 |-- BMI: double (nullable = true)
 |-- heartRate: double (nullable = true)
 |-- glucose: double (nullable = true)
 |-- TenYearCHD: integer (nullable = true)



### Count rows before drop null rows

In [135]:
print(df.count())

3390


In [136]:
droped = df.dropna()

### Count rows after drop null rows

In [137]:
print(droped.count())

2927


### Replace categorical columns with numeric columns

In [138]:
indexer = StringIndexer(inputCols=['sex', 'is_smoking'], outputCols=['sex_cat', 'is_smoking_cat'])
indexed = indexer.fit(droped).transform(droped)
indexed.select(['sex', 'is_smoking', 'sex_cat', 'is_smoking_cat' ]).show(10)

+---+----------+-------+--------------+
|sex|is_smoking|sex_cat|is_smoking_cat|
+---+----------+-------+--------------+
|  M|        NO|    1.0|           0.0|
|  F|       YES|    0.0|           1.0|
|  M|       YES|    1.0|           1.0|
|  F|       YES|    0.0|           1.0|
|  F|        NO|    0.0|           0.0|
|  M|        NO|    1.0|           0.0|
|  M|       YES|    1.0|           1.0|
|  F|        NO|    0.0|           0.0|
|  F|        NO|    0.0|           0.0|
|  M|       YES|    1.0|           1.0|
+---+----------+-------+--------------+
only showing top 10 rows



### Create features Vector

In [139]:
assembler = VectorAssembler(inputCols=['age', 'education', 'cigsPerDay', 'BPMeds', 'prevalentStroke',
                             'prevalentHyp', 'diabetes', 'totChol', 'sysBP', 'diaBP', 'BMI',
                             'heartRate', 'glucose', 'sex_cat', 'is_smoking_cat'],
                            outputCol="features")
output = assembler.transform(indexed)
final_data = output.select(["features", "TenYearCHD"])
final_data.show(10)

+--------------------+----------+
|            features|TenYearCHD|
+--------------------+----------+
|[36.0,4.0,0.0,0.0...|         0|
|[46.0,1.0,10.0,0....|         0|
|[50.0,1.0,20.0,0....|         1|
|[64.0,1.0,30.0,0....|         0|
|[61.0,3.0,0.0,0.0...|         1|
|[61.0,1.0,0.0,0.0...|         0|
|[36.0,4.0,35.0,0....|         0|
|(15,[0,1,7,8,9,10...|         0|
|(15,[0,1,7,8,9,10...|         0|
|[44.0,1.0,40.0,0....|         0|
+--------------------+----------+
only showing top 10 rows



### Divide the data to train and test data and check distribution

In [140]:
train_data, test_data = final_data.randomSplit([0.7, 0.3])

train_data.describe().show()

test_data.describe().show()

+-------+-------------------+
|summary|         TenYearCHD|
+-------+-------------------+
|  count|               2061|
|   mean|0.14798641436196022|
| stddev|  0.355172693958569|
|    min|                  0|
|    max|                  1|
+-------+-------------------+

+-------+------------------+
|summary|        TenYearCHD|
+-------+------------------+
|  count|               866|
|   mean|0.1605080831408776|
| stddev|0.3672887327830585|
|    min|                 0|
|    max|                 1|
+-------+------------------+



### Train data and evaluete test data

In [141]:
disease_ls = LogisticRegression(featuresCol='features', labelCol="TenYearCHD")
disease_dt = DecisionTreeClassifier(featuresCol='features', labelCol="TenYearCHD")
disease_rf = RandomForestClassifier(featuresCol='features', labelCol="TenYearCHD")
disease_gb = GBTClassifier(featuresCol='features', labelCol="TenYearCHD")

trained_disease_model_ls = disease_ls.fit(train_data)
trained_disease_model_dt = disease_dt.fit(train_data)
trained_disease_model_rf = disease_rf.fit(train_data)
trained_disease_model_gb = disease_gb.fit(train_data)

prediction_results_ls = trained_disease_model_ls.transform(test_data)
prediction_results_dt = trained_disease_model_dt.transform(test_data)
prediction_results_rf = trained_disease_model_rf.transform(test_data)
prediction_results_gb = trained_disease_model_gb.transform(test_data)

### Check accuracy (tp+tn)/total

In [155]:
eval_accuracy = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='TenYearCHD',
                                                  metricName="accuracy")

print('Logistic regression:', eval_accuracy.evaluate(prediction_results_ls))

print('Decision tree:', eval_accuracy.evaluate(prediction_results_dt))

print('Random forest:', eval_accuracy.evaluate(prediction_results_rf))

print('Gradient boosting:', eval_accuracy.evaluate(prediction_results_gb))

Logistic regression: 0.8498845265588915
Decision tree: 0.815242494226328
Random forest: 0.8418013856812933
Gradient boosting: 0.8163972286374134


### The most correlation column with TenYearCHD

In [40]:
indexed.select(corr(col1='TenYearCHD', col2='age')).show()

+---------------------+
|corr(TenYearCHD, age)|
+---------------------+
|  0.23442278429915242|
+---------------------+



### Test with real data

In [156]:
real_data = [Row(age=29, education=4.0,cigsPerDay=0.0, BPMeds=0.0, prevalentStroke=0,
                prevalentHyp=1, diabetes=0, totChol=250.0, sysBP=130.0, diaBP=80.0,
                BMI=26.2, heartRate=72.0, glucose=87, sex_cat=1.0, is_smoking_cat=0.0),
             Row(age=64, education=4.0,cigsPerDay=10.0, BPMeds=1.0, prevalentStroke=1,
                prevalentHyp=1, diabetes=1, totChol=300.0, sysBP=146.0, diaBP=90.0,
                BMI=37.0, heartRate=100.0, glucose=190, sex_cat=1.0, is_smoking_cat=1.0),
             ]
real_df = spark.createDataFrame(real_data)
transform_real_data = assembler.transform(real_df)
real_data = transform_real_data.select(["features"])


### Logistic regression:

In [161]:
real_prediction_ls = trained_disease_model_ls.transform(real_data)
real_prediction_ls.show()

+--------------------+--------------------+--------------------+----------+
|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+----------+
|[29.0,4.0,0.0,0.0...|[3.21608907941424...|[0.96143526876676...|       0.0|
|[64.0,4.0,10.0,1....|[-0.9842560892256...|[0.27204809929942...|       1.0|
+--------------------+--------------------+--------------------+----------+



### Decision tree:

In [160]:
real_prediction_dt = trained_disease_model_dt.transform(real_data)
real_prediction_dt.show()

+--------------------+-------------+--------------------+----------+
|            features|rawPrediction|         probability|prediction|
+--------------------+-------------+--------------------+----------+
|[29.0,4.0,0.0,0.0...|[1136.0,95.0]|[0.92282696994313...|       0.0|
|[64.0,4.0,10.0,1....|  [36.0,39.0]|         [0.48,0.52]|       1.0|
+--------------------+-------------+--------------------+----------+



### Random forest:

In [162]:
real_prediction_rf = trained_disease_model_rf.transform(real_data)
real_prediction_rf.show()

+--------------------+--------------------+--------------------+----------+
|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+----------+
|[29.0,4.0,0.0,0.0...|[17.9889403633654...|[0.89944701816827...|       0.0|
|[64.0,4.0,10.0,1....|[8.29358174462036...|[0.41467908723101...|       1.0|
+--------------------+--------------------+--------------------+----------+



### Gradient boosting:

In [163]:
real_prediction_gb = trained_disease_model_gb.transform(real_data)
real_prediction_gb.show()

+--------------------+--------------------+--------------------+----------+
|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+----------+
|[29.0,4.0,0.0,0.0...|[1.25570623212982...|[0.92493801351916...|       0.0|
|[64.0,4.0,10.0,1....|[-0.0575222751893...|[0.47127054221076...|       1.0|
+--------------------+--------------------+--------------------+----------+



In [42]:
spark.stop()