In [1]:
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LinearSVC
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import expr
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from helpers.helper_functions import translate_to_file_string
from pyspark.sql import DataFrameReader
from pyspark.sql import SparkSession
from pyspark.ml.feature import IndexToString, Normalizer, StringIndexer, VectorAssembler, VectorIndexer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from helpers.helper_functions import translate_to_file_string
from sklearn.metrics import roc_curve, auc
import seaborn as sns
import pandas as pd
import os
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
# for pretty printing
def printDf(sprkDF): 
    newdf = sprkDF.toPandas()
    from IPython.display import display, HTML
    return HTML(newdf.to_html())
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)
    
# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

In [2]:
inputFile = translate_to_file_string("../data/heart_val.csv")

In [3]:
spark = (SparkSession
       .builder
       .appName("HeartDiseaseAnalRf")
       .getOrCreate())

In [4]:
# load data file.
# create a DataFrame using an ifered Schema 
df = spark.read.option("header", "true") \
       .option("inferSchema", "true") \
       .option("delimiter", ";") \
       .csv(inputFile)
print(df.printSchema())

#Pandas df for visualization
dfp = df.toPandas()

root
 |-- age: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- cp: integer (nullable = true)
 |-- trestbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalach: integer (nullable = true)
 |-- exang: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- slope: integer (nullable = true)
 |-- ca: integer (nullable = true)
 |-- thal: integer (nullable = true)
 |-- target: string (nullable = true)

None


In [5]:
#transform labels
labelIndexer = StringIndexer().setInputCol("target").setOutputCol("label").fit(df)
sexIndexer = StringIndexer().setInputCol("sex").setOutputCol("sex_num").fit(df)

In [6]:
#feature cols- die spalten die interessieren, auch gerne entfernen für tests
featureCols = df.columns.copy()
featureCols.remove("target")
featureCols.remove("sex")
featureCols = featureCols + ["sex_num"]

In [7]:
#vektor assembler
assembler =  VectorAssembler(outputCol="features", inputCols=featureCols)

In [8]:
#Build feauture Indexer 
featureIndexer = VectorIndexer(inputCol="features",outputCol="indexedFeatures", maxCategories=6)

In [9]:
#Convert Indexed labels back to original labels
predConverter = IndexToString(inputCol="prediction",outputCol="predictedLabel",labels=labelIndexer.labels)

In [10]:
rf = RandomForestClassifier(labelCol="label", featuresCol="features",impurity="gini", \
                 minInstancesPerNode=10, featureSubsetStrategy='sqrt', subsamplingRate=0.95, seed= 12345)

In [11]:
# build a network para grip
paramGrid = (ParamGridBuilder()
             #.addGrid(rf.maxDepth, [2, 5, 10, 20, 30])
               .addGrid(rf.maxDepth, [2, 5, 10])
             #.addGrid(rf.maxBins, [10, 20, 40, 80, 100])
               .addGrid(rf.maxBins, [5, 10, 20])
             #.addGrid(rf.numTrees, [5, 20, 50, 100, 500])
               .addGrid(rf.numTrees, [5, 20, 50])
             .build())

In [12]:
#split data for etsting

splits = df.randomSplit([0.8, 0.2 ], 1234)
train = splits[0]
test = splits[1]

In [13]:
#Pipelining of all steps 
pipeline = Pipeline(stages= [labelIndexer,sexIndexer,  assembler, featureIndexer, rf , predConverter])

In [14]:
#build evaluator 


In [15]:
evaluator =  BinaryClassificationEvaluator(labelCol="label",rawPredictionCol="rawPrediction", metricName="areaUnderROC")
#evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")

In [16]:
#Cross validator
cvRf = CrossValidator(estimator=pipeline, evaluator=evaluator,estimatorParamMaps=paramGrid,numFolds=5, parallelism=4)

In [17]:
#train modle
rfModel = cvRf.fit(train)

In [18]:
#Find out the best model
rfBestModel = rfModel.bestModel.stages[4] # the stage at index 1 in the pipeline is the SVMModel
print("Best Params: \n", rfBestModel.explainParams())
print("Param Map: \n", rfBestModel.extractParamMap())
#print(cvSVMModel.getEstimatorParamMaps()[np.argmax(cvSVMModel.avgMetrics)])

Best Params: 
 bootstrap: Whether bootstrap samples are used when building trees. (default: True)
cacheNodeIds: If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval. (default: False)
checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/

In [19]:
#test model
predictions = rfModel.transform(test)
predictions.show()

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+-----+-------+--------------------+--------------------+--------------------+--------------------+----------+--------------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|label|sex_num|            features|     indexedFeatures|       rawPrediction|         probability|prediction|predictedLabel|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+-----+-------+--------------------+--------------------+--------------------+--------------------+----------+--------------+
| 34|  f|  1|     118| 210|  0|      1|    192|    0|    0.7|    2|  0|   2|     y|  0.0|    1.0|[34.0,1.0,118.0,2...|[34.0,1.0,118.0,2...|[45.0658827788700...|[0.90131765557740...|       0.0|             y|
| 35|  m|  0|     120| 198|  0|      1|    130|    1|    1.6|    1|  0|   3|     n|  1.0|    0.0|[35.0,0.0,120.0,1...|[35.0,0.0,120.0,1...|[11.0812917554709...|[0.22162

In [20]:
accuracy = evaluator.evaluate(predictions)
print("Test Error = " ,(1.0 - accuracy))

Test Error =  0.064784053156146


In [21]:
spark.stop()