# Random Forest Classifier for Diabetes Data

**Buckley Dowdle, Latifa Hasan, Luke Moles, Jae Yoon Sung**

### Setup

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, OneHotEncoder
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, DecisionTreeClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

In [2]:
spark = SparkSession.builder \
    .master('local[*]') \
    .appName('projectModeling') \
    .getOrCreate()

In [3]:
#build evaluators
f1_evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='f1')
accuracy_evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')

In [4]:
#read in data
data = spark.read.parquet("data.parquet")

feats = [
       'Gender',
       'Age',
       'Race_vector',
       'Fam_hist',
       'Smoke_Cigs',
       'BMI',
        'HighBP' 
       ]

data.count()

27297

In [5]:
#check data
data.take(5)

[Row(ParticipantID=94054.0, label=1.0, Gender=0.0, Age=50.0, Race=1.0, Fam_hist=0.0, Smoke_Cigs=3.0, BMI=30.4, HighBP=1.0),
 Row(ParticipantID=94285.0, label=1.0, Gender=0.0, Age=43.0, Race=2.0, Fam_hist=0.0, Smoke_Cigs=1.0, BMI=43.7, HighBP=1.0),
 Row(ParticipantID=94285.0, label=1.0, Gender=0.0, Age=43.0, Race=2.0, Fam_hist=0.0, Smoke_Cigs=1.0, BMI=43.7, HighBP=1.0),
 Row(ParticipantID=94285.0, label=1.0, Gender=0.0, Age=43.0, Race=2.0, Fam_hist=0.0, Smoke_Cigs=1.0, BMI=43.7, HighBP=1.0),
 Row(ParticipantID=94285.0, label=1.0, Gender=0.0, Age=43.0, Race=2.0, Fam_hist=0.0, Smoke_Cigs=1.0, BMI=43.7, HighBP=1.0)]

In [6]:
# test-train split
(trainingData, testData) = data.randomSplit([0.7, 0.3],seed=43)

In [7]:
#one hot encode Race
encoder = OneHotEncoder(inputCol="Race", outputCol="Race_vector")

#make features into vector
assembler = VectorAssembler(inputCols=feats, outputCol='features')

#logistic regression model
lr = LogisticRegression(maxIter=10)

#random forest model
rf = RandomForestClassifier()

#model3
dt = DecisionTreeClassifier() 

#create pipelines
lr_pipeline = Pipeline(stages=[encoder, assembler, lr])

rf_pipeline = Pipeline(stages=[encoder, assembler, rf])

dt_pipeline = Pipeline(stages=[encoder, assembler, dt])




### Logistic Regression

In [8]:
# Set up the parameter grid
lr_paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

# Treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
lr_crossval = CrossValidator(estimator=lr_pipeline,
                          estimatorParamMaps=lr_paramGrid,
                          evaluator=MulticlassClassificationEvaluator(metricName='f1'),
                          numFolds=10,
                          seed=314,
                          parallelism=7)
#fit lr_model
lr_model = lr_crossval.fit(trainingData)

#make predictions using lr
lr_preds = lr_model.transform(testData)

In [9]:
#test performance
lr_f1 = f1_evaluator.evaluate(lr_preds)
lr_acc = accuracy_evaluator.evaluate(lr_preds)

print('logistic Regression F1 Score: {}'.format(lr_f1))
print('logistic Regression Accuracy: {}'.format(lr_acc))

logistic Regression F1 Score: 0.7656557447426172
logistic Regression Accuracy: 0.806073070408904


In [22]:
y_true = lr_preds.select(['label']).collect()
y_pred = lr_preds.select(['prediction']).collect()
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred,zero_division=0))

              precision    recall  f1-score   support

         1.0       0.58      0.26      0.36      1501
         2.0       0.83      0.96      0.89      6519
         3.0       0.00      0.00      0.00       246

    accuracy                           0.81      8266
   macro avg       0.47      0.41      0.41      8266
weighted avg       0.76      0.81      0.77      8266



### Random Forest

In [15]:
# Set up the parameter grid
rf_paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [30,50]) \
    .addGrid(rf.maxDepth, [30]) \
    .build()

# Treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
rf_crossval = CrossValidator(estimator=rf_pipeline,
                          estimatorParamMaps=rf_paramGrid,
                          evaluator=MulticlassClassificationEvaluator(metricName='f1'),
                          numFolds=10,
                          seed=314,
                          parallelism=23)
#fit lr_model
rf_model = rf_crossval.fit(trainingData)

#make predictions using lr
rf_preds = rf_model.transform(testData)

In [16]:
#test performance
rf_f1 = f1_evaluator.evaluate(rf_preds)
rf_acc = accuracy_evaluator.evaluate(rf_preds)

print('Random Forest F1 Score: {}'.format(rf_f1))
print('Random Forest Accuracy: {}'.format(rf_acc))

Random Forest F1 Score: 0.9327214542675151
Random Forest Accuracy: 0.9371386394390454


In [17]:
# examine scores from crossvalidation
rf_model.avgMetrics

[0.922674873244816, 0.9256226772759784]

In [18]:
# optimal number of trees
## https://stackoverflow.com/questions/36697304/how-to-extract-model-hyper-parameters-from-spark-ml-in-pyspark
rf_model.bestModel.stages[-1]._java_obj.parent().getNumTrees()

50

In [19]:
# optimal depth
rf_model.bestModel.stages[-1]._java_obj.parent().getMaxDepth()

30

In [20]:
y_true = rf_preds.select(['label']).collect()
y_pred = rf_preds.select(['prediction']).collect()
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         1.0       0.96      0.73      0.83      1465
         2.0       0.93      1.00      0.96      6418
         3.0       0.99      0.56      0.72       246

    accuracy                           0.94      8129
   macro avg       0.96      0.76      0.84      8129
weighted avg       0.94      0.94      0.93      8129



The RF classifier did reasonably well. In the future we will streamline the data pipeline process, evaluate more metrics, and vary RF parameters. We will also collect data from other years. This should be simple to do, and it will be useful since we had to drop so many duplicates and nulls.

### Decision Tree Classifier

In [27]:
# Set up the parameter grid
dt_paramGrid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [2,10,30]) \
    .build()

# Treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
dt_crossval = CrossValidator(estimator=dt_pipeline,
                          estimatorParamMaps=dt_paramGrid,
                          evaluator=MulticlassClassificationEvaluator(metricName='f1'),
                          numFolds=10,
                          seed=314,
                          parallelism=7)
#fit dt_model
dt_model = dt_crossval.fit(trainingData)

#make predictions using dt
dt_preds = dt_model.transform(testData)

In [28]:
#test performance
dt_f1 = f1_evaluator.evaluate(dt_preds)
dt_acc = accuracy_evaluator.evaluate(dt_preds)

print('Decision Tree F1 Score: {}'.format(dt_f1))
print('Decision Tree Accuracy: {}'.format(dt_acc))

Decision Tree F1 Score: 0.982259502301589
Decision Tree Accuracy: 0.9823372852649407


In [30]:
# examine scores from crossvalidation
dt_model.avgMetrics

[0.689705836666267, 0.8695014276841593, 0.9806770755705196]

In [31]:
dt_model.bestModel.stages[-1].getMaxDepth()

30

In [32]:
y_true = dt_preds.select(['label']).collect()
y_pred = dt_preds.select(['prediction']).collect()
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         1.0       0.96      0.95      0.95      1501
         2.0       0.99      0.99      0.99      6519
         3.0       1.00      0.93      0.96       246

    accuracy                           0.98      8266
   macro avg       0.98      0.96      0.97      8266
weighted avg       0.98      0.98      0.98      8266

