# Random Forest Classifier for Diabetes Data

**Buckley Dowdle, Latifa Hasan, Luke Moles, Jae Yoon Sung**

### Setup

In [14]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, OneHotEncoder
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, DecisionTreeClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

In [8]:
spark = SparkSession.builder \
    .master('local[*]') \
    .appName('projectModeling') \
    .getOrCreate()

In [9]:
#build evaluators
f1_evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='f1')
accuracy_evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')

In [10]:
#read in data
data = spark.read.parquet("data.parquet")

feats = [
       'Gender',
       'Age',
       'Race_vector',
       'Fam_hist',
       'Smoke_Cigs',
       'BMI',
        'HighBP' 
       ]

data.count()

27297

In [11]:
#check data
data.take(5)

[Row(ParticipantID=94054.0, label=1.0, Gender=0.0, Age=50.0, Race=1.0, Fam_hist=0.0, Smoke_Cigs=3.0, BMI=30.4, HighBP=1.0),
 Row(ParticipantID=94285.0, label=1.0, Gender=0.0, Age=43.0, Race=2.0, Fam_hist=0.0, Smoke_Cigs=1.0, BMI=43.7, HighBP=1.0),
 Row(ParticipantID=94285.0, label=1.0, Gender=0.0, Age=43.0, Race=2.0, Fam_hist=0.0, Smoke_Cigs=1.0, BMI=43.7, HighBP=1.0),
 Row(ParticipantID=94285.0, label=1.0, Gender=0.0, Age=43.0, Race=2.0, Fam_hist=0.0, Smoke_Cigs=1.0, BMI=43.7, HighBP=1.0),
 Row(ParticipantID=94285.0, label=1.0, Gender=0.0, Age=43.0, Race=2.0, Fam_hist=0.0, Smoke_Cigs=1.0, BMI=43.7, HighBP=1.0)]

In [12]:
# test-train split
(trainingData, testData) = data.randomSplit([0.7, 0.3])

In [15]:
#one hot encode Race
encoder = OneHotEncoder(inputCol="Race", outputCol="Race_vector")

#make features into vector
assembler = VectorAssembler(inputCols=feats, outputCol='features')

#logistic regression model
lr = LogisticRegression(maxIter=10)

#random forest model
rf = RandomForestClassifier()

#model3
dt = DecisionTreeClassifier() 

#create pipelines
lr_pipeline = Pipeline(stages=[encoder, assembler, lr])

rf_pipeline = Pipeline(stages=[encoder, assembler, rf])

dt_pipeline = Pipeline(stages=[encoder, assembler, dt])




### Logistic Regression

In [21]:
# Set up the parameter grid
lr_paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

# Treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
lr_crossval = CrossValidator(estimator=lr_pipeline,
                          estimatorParamMaps=lr_paramGrid,
                          evaluator=MulticlassClassificationEvaluator(metricName='f1'),
                          numFolds=10,
                          seed=314,
                          parallelism=7)
#fit lr_model
lr_model = lr_crossval.fit(trainingData)

#make predictions using lr
lr_preds = lr_model.transform(testData)

In [22]:
#test performance
lr_f1 = f1_evaluator.evaluate(lr_preds)
lr_acc = accuracy_evaluator.evaluate(lr_preds)

print('logistic Regression F1 Score: {}'.format(lr_f1))
print('logistic Regression Accuracy: {}'.format(lr_acc))

logistic Regression F1 Score: 0.770537335790141
logistic Regression Accuracy: 0.8082307506637703


### Random Forest

In [10]:
# Set up the parameter grid
rf_paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 20, 30]) \
    .addGrid(rf.maxDepth, [2, 5, 8]) \
    .build()

# Treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
rf_crossval = CrossValidator(estimator=rf_pipeline,
                          estimatorParamMaps=rf_paramGrid,
                          evaluator=MulticlassClassificationEvaluator(metricName='f1'),
                          numFolds=10,
                          seed=314,
                          parallelism=7)
#fit lr_model
rf_model = rf_crossval.fit(trainingData)

#make predictions using lr
rf_preds = rf_model.transform(testData)

In [11]:
#test performance
rf_f1 = f1_evaluator.evaluate(rf_preds)
rf_acc = accuracy_evaluator.evaluate(rf_preds)

print('Random Forest F1 Score: {}'.format(rf_f1))
print('Random Forest Accuracy: {}'.format(rf_acc))

Random Forest F1 Score: 0.8023428156174007
Random Forest Accuracy: 0.8369737644905431


In [28]:
# examine scores from crossvalidation
rf_model.avgMetrics

[0.6947228812779438,
 0.7017929902574277,
 0.8071870132536587,
 0.6947228812779438,
 0.7016941614805232,
 0.800392037112011,
 0.6947228812779438,
 0.7042204336624242,
 0.8040674966094667]

In [35]:
# optimal number of trees
## https://stackoverflow.com/questions/36697304/how-to-extract-model-hyper-parameters-from-spark-ml-in-pyspark
rf_model.bestModel.stages[-1]._java_obj.parent().getNumTrees()

10

In [36]:
# optimal depth
rf_model.bestModel.stages[-1]._java_obj.parent().getMaxDepth()

8

The RF classifier did reasonably well. In the future we will streamline the data pipeline process, evaluate more metrics, and vary RF parameters. We will also collect data from other years. This should be simple to do, and it will be useful since we had to drop so many duplicates and nulls.

### Decision Tree Classifier

In [35]:
# Set up the parameter grid
dt_paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [2,5,10,20,30,50]) \
    .build()

# Treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
dt_crossval = CrossValidator(estimator=dt_pipeline,
                          estimatorParamMaps=dt_paramGrid,
                          evaluator=MulticlassClassificationEvaluator(metricName='f1'),
                          numFolds=10,
                          seed=314,
                          parallelism=7)
#fit dt_model
dt_model = dt_crossval.fit(trainingData)

#make predictions using dt
dt_preds = dt_model.transform(testData)

In [36]:
#test performance
dt_f1 = f1_evaluator.evaluate(dt_preds)
dt_acc = accuracy_evaluator.evaluate(dt_preds)

print('Decision Tree F1 Score: {}'.format(dt_f1))
print('Decision Tree Accuracy: {}'.format(dt_acc))

Decision Tree F1 Score: 0.7853006349520989
Decision Tree Accuracy: 0.7990237949969493


In [37]:
# examine scores from crossvalidation
dt_model.avgMetrics

[0.7505151069113927,
 0.7505151069113927,
 0.7505151069113927,
 0.7505151069113927,
 0.7505151069113927,
 0.7505151069113927]

In [38]:
dt_model.bestModel.stages[-1].getMaxDepth()

5