In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('tree').getOrCreate()

In [18]:
from pyspark.ml import Pipeline

In [3]:
from pyspark.ml.classification import (RandomForestClassifier,
                                       GBTClassifier,
                                       DecisionTreeClassifier)

In [5]:
# Get Data
data = spark.read.csv('College.csv',
                      header=True,inferSchema=True)

In [6]:
data.printSchema() # Predict if school is private or public

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [11]:
data.head(1) # Notice label is a string!

[Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)]

In [7]:
from pyspark.ml.feature import VectorAssembler

In [8]:
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [9]:
# Assembler to prepare data for ML
assembler = VectorAssembler(inputCols=['Apps','Accept','Enroll',
                                       'Top10perc','Top25perc',
                                       'F_Undergrad',
                                       'P_Undergrad','Outstate',
                                       'Room_Board','Books',
                                       'Personal','PhD',
                                       'Terminal','S_F_Ratio',
                                       'perc_alumni','Expend',
                                       'Grad_Rate'],
                            outputCol='features')

In [10]:
output = assembler.transform(data)

In [12]:
# Transform label into a category
from pyspark.ml.feature import StringIndexer

In [13]:
indexer = StringIndexer(inputCol='Private',
                        outputCol='PrivateIndex')

In [14]:
outputFixed = indexer.fit(output).transform(output)

In [15]:
outputFixed.printSchema() # features + PrivateIndex

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)



In [16]:
# Now we have data ready for ML
finalData = outputFixed.select('features','PrivateIndex')

In [17]:
train_data,test_data = finalData.randomSplit([0.7,0.3])

In [19]:
# Initiate Models
dtc = DecisionTreeClassifier(featuresCol='features',
                             labelCol='PrivateIndex')
rfc = RandomForestClassifier(featuresCol='features',
                             labelCol='PrivateIndex')
gbt = GBTClassifier(featuresCol='features',
                    labelCol='PrivateIndex')

In [20]:
# Fit the models
dtcModel = dtc.fit(train_data)
rfcModel = rfc.fit(train_data)
gbtModel = gbt.fit(train_data)

In [21]:
# Get results on test data
dtc_preds = dtcModel.transform(test_data)
rfc_preds = rfcModel.transform(test_data)
gbt_preds = gbtModel.transform(test_data)

In [24]:
# EVALUATE YOUR MODELS
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [25]:
# Initiate evaluator
binary_eval = BinaryClassificationEvaluator(labelCol='PrivateIndex')

In [26]:
print('DTC ACCURACY')
binary_eval.evaluate(dtc_preds)

DTC ACCURACY


0.9361265260821309

In [27]:
print('RFC ACCURACY')
binary_eval.evaluate(rfc_preds)

RFC ACCURACY


0.9886792452830189

In [28]:
print('GBT ACCURACY')
binary_eval.evaluate(gbt_preds)

GBT ACCURACY


0.9673140954495006

In [30]:
# Test with multi class. You will notice it drops!
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [31]:
acc_eval = MulticlassClassificationEvaluator(labelCol='PrivateIndex',
                                             metricName='accuracy')

In [33]:
print('DTC ACCURACY')
acc_eval.evaluate(dtc_preds)

DTC ACCURACY


0.9013452914798207

In [34]:
print('RFC ACCURACY')
acc_eval.evaluate(rfc_preds)

RFC ACCURACY


0.9506726457399103

In [35]:
print('GBT ACCURACY')
acc_eval.evaluate(gbt_preds)

GBT ACCURACY


0.905829596412556