# Random Forest Classifier for Diabetes Data

**Buckley Dowdle, Latifa Hasan, Luke Moles, Jae Sung**

In [26]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
# Start (or reuse) a local Spark session for this notebook.
spark = (
    SparkSession.builder
    .master('local')
    .appName('projectModeling')
    .getOrCreate()
)

In [3]:
# Read the raw CSV; the first row holds column names and Spark infers the
# column types from the data.
df = (
    spark.read
    .option('header', True)
    .csv('2017_data.csv', inferSchema=True)
)

In [4]:
# Print the inferred schema to see which columns were read as strings
# rather than numeric types.
df.printSchema()

root
 |-- ParticipantID: integer (nullable = true)
 |-- Gender: integer (nullable = true)
 |-- Age: double (nullable = true)
 |-- Race: integer (nullable = true)
 |-- Education_Level: string (nullable = true)
 |-- Household_income: string (nullable = true)
 |-- Pulse: string (nullable = true)
 |-- SysBP: string (nullable = true)
 |-- DiasBP: string (nullable = true)
 |-- Energy: string (nullable = true)
 |-- Protein: string (nullable = true)
 |-- Carbohydrates: string (nullable = true)
 |-- Total_sugar: string (nullable = true)
 |-- Fiber: string (nullable = true)
 |-- Total_fat: string (nullable = true)
 |-- Sat_fat: string (nullable = true)
 |-- Monounsat_fat: string (nullable = true)
 |-- Polyunsat_fat: string (nullable = true)
 |-- cholesterol: string (nullable = true)
 |-- Alcohol: string (nullable = true)
 |-- Weight(kg): string (nullable = true)
 |-- BMI: string (nullable = true)
 |-- Waist_Circum: string (nullable = true)
 |-- Insulin: string (nullable = true)
 |-- Glucose: str

In [5]:
# Count the rows in the raw DataFrame.
df.count()

114297

In [6]:
# List (column, dtype) pairs. Many columns were inferred as strings and need
# to be cast to ints or floats before modeling.
df.dtypes

[('ParticipantID', 'int'),
 ('Gender', 'int'),
 ('Age', 'double'),
 ('Race', 'int'),
 ('Education_Level', 'string'),
 ('Household_income', 'string'),
 ('Pulse', 'string'),
 ('SysBP', 'string'),
 ('DiasBP', 'string'),
 ('Energy', 'string'),
 ('Protein', 'string'),
 ('Carbohydrates', 'string'),
 ('Total_sugar', 'string'),
 ('Fiber', 'string'),
 ('Total_fat', 'string'),
 ('Sat_fat', 'string'),
 ('Monounsat_fat', 'string'),
 ('Polyunsat_fat', 'string'),
 ('cholesterol', 'string'),
 ('Alcohol', 'string'),
 ('Weight(kg)', 'string'),
 ('BMI', 'string'),
 ('Waist_Circum', 'string'),
 ('Insulin', 'string'),
 ('Glucose', 'string'),
 ('Avg_Drinks', 'string'),
 ('4-5_Drinks', 'string'),
 ('8+Drinks', 'string'),
 ('12+Drinks', 'string'),
 ('4-5DrinksDaily', 'string'),
 ('HighBP', 'string'),
 ('HighChol', 'string'),
 ('VigWork', 'string'),
 ('ModWork', 'string'),
 ('Walk_bike', 'string'),
 ('VigActivity', 'string'),
 ('ModActivity', 'string'),
 ('100Cigs', 'string'),
 ('Smoke_Cigs', 'string'),
 ('Sm

In [32]:
# Columns kept for modeling: the Diagnosis label, the participant id, and the
# candidate predictors.
cols = [
    'Diagnosis',
    'ParticipantID',
    'Gender',
    'Age',
    'Race',
    'Fam_hist',
    'Smoke_Cigs',
    'Glucose',
    'BMI',
    'SysBP',
    'DiasBP',
    'Avg_Drinks',
]
data = df.select(*cols)

# String-typed measurements that have to become floats.
str_to_float = ['Glucose', 'BMI', 'SysBP', 'DiasBP', 'Avg_Drinks']

# Cast each one in place (withColumn on an existing name replaces the column).
for field in str_to_float:
    data = data.withColumn(field, data[field].cast('float'))

# Shift gender from 1/2 coding to 0/1.
data = data.withColumn('Gender', data['Gender'] - 1)
# The diagnosis label becomes an integer.
data = data.withColumn('Diagnosis', data['Diagnosis'].cast('int'))

# Categorical columns to one-hot encode.
cats = ['Fam_hist',
        'Smoke_Cigs',
        'Race']

# One-hot encode each categorical column in two steps:
#   1. StringIndexer maps the raw values to numeric indices in `<col>_id`.
#   2. OneHotEncoder turns those indices into a sparse vector in `<col>_vec`.
# Will streamline much of this into a pipeline later.
for col in cats:
    index_col = col + '_id'
    indexed = StringIndexer(inputCol=col,
                            outputCol=index_col).fit(data).transform(data)

    encoder = OneHotEncoder(inputCol=index_col,
                            outputCol=col + '_vec')
    # On Spark >= 3.0 OneHotEncoder is an Estimator and must be fitted before
    # it can transform; on Spark 2.x it is a plain Transformer with no fit().
    # Support both so the notebook runs on either version.
    if hasattr(encoder, 'fit'):
        data = encoder.fit(indexed).transform(indexed)
    else:
        data = encoder.transform(indexed)

# Final column layout. Order matters to later cells: index 0 is the id,
# index 1 is the label, and everything from index 2 onward is a feature.
cols = [
    'ParticipantID',
    'Diagnosis',
    'Gender',
    'Age',
    'Race_vec',
    'Fam_hist_vec',
    'Smoke_Cigs_vec',
    'Glucose',
    'BMI',
    'SysBP',
    'DiasBP',
    'Avg_Drinks',
]
data = data.select(*cols)

In [33]:
# Re-check the column dtypes after the casts and one-hot encoding.
data.dtypes

[('ParticipantID', 'int'),
 ('Diagnosis', 'int'),
 ('Gender', 'int'),
 ('Age', 'double'),
 ('Race_vec', 'vector'),
 ('Fam_hist_vec', 'vector'),
 ('Smoke_Cigs_vec', 'vector'),
 ('Glucose', 'float'),
 ('BMI', 'float'),
 ('SysBP', 'float'),
 ('DiasBP', 'float'),
 ('Avg_Drinks', 'float')]

In [34]:
# Preview 10 rows that remain after dropping rows with nulls and duplicate
# rows. The result is not assigned, so `data` itself is unchanged here.
data.dropna().dropDuplicates().show(10)

+-------------+---------+------+----+-------------+-------------+--------------+-------+----+-----+------+----------+
|ParticipantID|Diagnosis|Gender| Age|     Race_vec| Fam_hist_vec|Smoke_Cigs_vec|Glucose| BMI|SysBP|DiasBP|Avg_Drinks|
+-------------+---------+------+----+-------------+-------------+--------------+-------+----+-----+------+----------+
|        95771|        2|     1|63.0|(5,[1],[1.0])|(2,[1],[1.0])| (3,[1],[1.0])|   7.94|38.6|138.0|  88.0|       2.0|
|        96852|        2|     0|35.0|(5,[2],[1.0])|(2,[0],[1.0])| (3,[1],[1.0])|   5.77|37.5|120.0|  64.0|       1.0|
|        97450|        2|     1|27.0|(5,[1],[1.0])|(2,[0],[1.0])| (3,[0],[1.0])|   5.44|43.7| 98.0|  58.0|       2.0|
|        98530|        1|     0|60.0|(5,[3],[1.0])|(2,[0],[1.0])| (3,[0],[1.0])|   6.66|33.0|124.0|  68.0|       3.0|
|        99989|        2|     1|21.0|(5,[2],[1.0])|(2,[0],[1.0])| (3,[0],[1.0])|   6.22|32.3|102.0|  54.0|       6.0|
|       101355|        2|     0|46.0|(5,[1],[1.0])|(2,[0

In [35]:
# Count the rows that remain after dropping nulls and duplicate rows
# (again without modifying `data`).
data.dropna().dropDuplicates().count()

1526

In [36]:
# Remove rows containing nulls and duplicate rows, then drop the
# ParticipantID column (cols[0]) so only the label and features remain.
# dropna() is required here: DataFrame.drop() with no arguments removes
# nothing, so the nulls would otherwise stay in the data.
data = data.dropna().dropDuplicates().select(cols[1:])

In [38]:
# Combine every feature column (cols[2:], i.e. everything after the id and
# the label) into a single 'features' vector column.
assembler = VectorAssembler(inputCols=cols[2:], outputCol='features')
# Rows with invalid (e.g. null) feature values are skipped rather than raising.
assembler.setHandleInvalid('skip')
assembled = assembler.transform(data).select('Diagnosis', 'features')

In [39]:
# Preview the first 5 assembled (label, feature-vector) rows.
assembled.show(5)

+---------+--------------------+
|Diagnosis|            features|
+---------+--------------------+
|        2|(17,[0,1,3,8,10,1...|
|        2|(17,[1,4,7,10,12,...|
|        2|(17,[0,1,3,7,9,12...|
|        1|(17,[1,5,7,9,12,1...|
|        2|(17,[0,1,4,7,9,12...|
+---------+--------------------+
only showing top 5 rows



In [40]:
# 70/30 train-test split. A fixed seed makes the random sampling repeatable,
# so the metrics reported below do not change on every re-run.
(trainingData, testData) = assembled.randomSplit([0.7, 0.3], seed=42)

In [49]:
# Build a random forest classifier with 10 trees that predicts 'Diagnosis'
# from the assembled 'features' vector. The seed fixes the forest's random
# sampling so training is repeatable.
rf = RandomForestClassifier(labelCol='Diagnosis',
                            featuresCol='features',
                            numTrees=10,
                            seed=42)
model = rf.fit(trainingData)

In [52]:
# Apply the fitted model to the held-out test rows. transform() returns the
# test data with rawPrediction, probability, and prediction columns appended.
preds = model.transform(testData)

In [53]:
# Preview the first 5 test rows alongside the model's predictions.
preds.show(5)

+---------+--------------------+--------------------+--------------------+----------+
|Diagnosis|            features|       rawPrediction|         probability|prediction|
+---------+--------------------+--------------------+--------------------+----------+
|        2|(17,[0,1,3,7,9,12...|[0.0,0.2202305134...|[0.0,0.0220230513...|       2.0|
|        2|(17,[1,3,7,9,12,1...|[0.0,0.5219364556...|[0.0,0.0521936455...|       2.0|
|        2|(17,[1,3,7,10,12,...|[0.0,0.7666002659...|[0.0,0.0766600265...|       2.0|
|        2|(17,[0,1,2,7,9,12...|[0.0,0.4354191187...|[0.0,0.0435419118...|       2.0|
|        2|(17,[1,4,7,9,12,1...|[0.0,0.3941184561...|[0.0,0.0394118456...|       2.0|
+---------+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [60]:
# Evaluate accuracy (fraction of test rows whose prediction equals the label)
# with Spark ML's built-in evaluator instead of a hand-rolled RDD map/sum,
# which needed two separate Spark jobs and an explicit division by the count.
evaluator = MulticlassClassificationEvaluator(labelCol='Diagnosis',
                                              predictionCol='prediction',
                                              metricName='accuracy')
evaluator.evaluate(preds)

0.8978723404255319

The RF classifier did reasonably well, reaching roughly 90% accuracy on the held-out test set. In the future we will streamline the data pipeline process, evaluate more metrics (accuracy alone can be misleading if the diagnosis classes are imbalanced), and vary the RF parameters. We will also collect data from other years. This should be simple to do, and it will be useful since we had to drop so many duplicates and nulls.

In [61]:
# Shell command: convert every .ipynb file in the current working directory
# to PDF with nbconvert.
!jupyter nbconvert --to pdf `pwd`/*.ipynb

[NbConvertApp] Converting notebook /sfs/qumulo/qhome/lmm8fb/ds5559/proj/InitialModeling.ipynb to pdf
[NbConvertApp] Writing 42843 bytes to ./notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', './notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', './notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 48252 bytes to /sfs/qumulo/qhome/lmm8fb/ds5559/proj/InitialModeling.pdf
[NbConvertApp] Converting notebook /sfs/qumulo/qhome/lmm8fb/ds5559/proj/eda.ipynb to pdf
[NbConvertApp] Writing 45588 bytes to ./notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', './notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', './notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 36899 bytes to /sfs/qumulo/qhome/lmm8fb/ds5559/proj/eda.pdf
