In [1]:
from pyspark.sql import SparkSession

In [2]:
# Get (or reuse) the SparkSession for this notebook, registered under the app name 'tree'.
spark = (
    SparkSession.builder
    .appName('tree')
    .getOrCreate()
)

In [3]:
# Read the College CSV: the first row supplies column names and column
# types are inferred from the file contents.
data = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('/FileStore/tables/College.csv')
)

In [4]:
# Inspect the inferred column names and types before building features.
data.printSchema()

In [5]:
from pyspark.ml.feature import VectorAssembler

In [6]:
# List the column names; the feature columns passed to the VectorAssembler
# below are chosen from this list.
data.columns

In [7]:
# Pack the selected predictor columns into a single vector column named
# 'features', which is what the classifiers below read.
assembler = VectorAssembler(
    inputCols=[
        'Apps',
        'Accept',
        'Enroll',
        'Top10perc',
        'Top25perc',
        'F_Undergrad',
        'P_Undergrad',
        'Outstate',
        'Room_Board',
        'Books',
        'Personal',
        'PhD',
        'Terminal',
        'S_F_Ratio',
        'perc_alumni',
        'Expend',
        'Grad_Rate',
    ],
    outputCol='features',
)

In [8]:
# Apply the assembler: `output` is `data` plus the assembled 'features' vector column.
output = assembler.transform(data)

In [9]:
from pyspark.ml.feature import StringIndexer

In [10]:
# Encode the 'Private' column as a numeric label column 'PrivateIndex'.
# NOTE(review): which category maps to which index is decided when the
# indexer is fitted -- check the fitted model's labels before interpreting 0/1.
indexer = StringIndexer(inputCol='Private', outputCol='PrivateIndex')

In [11]:
# Fit the StringIndexer on the assembled frame and apply it to that same
# frame, producing the 'PrivateIndex' label column alongside 'features'.
output_final = indexer.fit(output).transform(output)

In [12]:
# Confirm that the 'features' and 'PrivateIndex' columns are now present.
output_final.printSchema()

In [13]:
# Keep only the two columns the models need: the feature vector and the label.
final_data = output_final.select('features','PrivateIndex')

In [14]:
# 70/30 train/test split. A fixed seed keeps the partition (and therefore
# every metric computed below) the same across Restart-and-Run-All runs;
# without it each run trains and scores on different rows.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [15]:
# Decision tree, random forest, and gradient-boosted tree classifiers
from pyspark.ml.classification import (DecisionTreeClassifier,GBTClassifier,RandomForestClassifier)

In [16]:
from pyspark.ml import Pipeline

In [17]:
# Three tree-based classifiers, all predicting 'PrivateIndex' from 'features'.
# An explicit seed pins the randomness used while building the trees (the
# random forest in particular samples rows and features at random), so the
# fitted models and their scores are reproducible from run to run.
dtc = DecisionTreeClassifier(labelCol='PrivateIndex', featuresCol='features', seed=42)
rfc = RandomForestClassifier(labelCol='PrivateIndex', featuresCol='features', seed=42)
gbt = GBTClassifier(labelCol='PrivateIndex', featuresCol='features', seed=42)

In [18]:
# Train each estimator on the training split, in the order
# decision tree -> random forest -> gradient-boosted trees.
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train_data) for estimator in (dtc, rfc, gbt)
)

In [19]:
# Score the held-out split with every fitted model; each result is
# test_data plus that model's prediction columns.
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test_data) for fitted in (dtc_model, rfc_model, gbt_model)
)

In [20]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [21]:
# Binary-classification evaluator that compares against the 'PrivateIndex' label.
# No metricName or rawPredictionCol is passed, so the evaluator's defaults apply
# (areaUnderROC computed from the 'rawPrediction' column, per the PySpark docs).
my_eval = BinaryClassificationEvaluator(labelCol = 'PrivateIndex')

In [22]:
# Decision tree: print a label, then evaluate its test predictions with my_eval.
# The metric value is shown as the cell's output.
print('DTC')
my_eval.evaluate(dtc_preds)



In [23]:
# Random forest: print a label, then evaluate its test predictions with my_eval.
# The metric value is shown as the cell's output.
print('RFC')
my_eval.evaluate(rfc_preds)

In [24]:
# Evaluator for the gradient-boosted-tree predictions.
# Ranking metrics such as AUC need a continuous score. Feeding the hard 0/1
# 'prediction' column collapses the ROC curve to a single point and gives a
# number that is not comparable with the DTC/RFC scores. Use 'rawPrediction'
# whenever the GBT output has it, and fall back to 'prediction' only for
# Spark releases whose GBTClassifier does not emit that column.
gbt_score_col = 'rawPrediction' if 'rawPrediction' in gbt_preds.columns else 'prediction'
my_eval2 = BinaryClassificationEvaluator(labelCol='PrivateIndex',
                                         rawPredictionCol=gbt_score_col)

In [25]:
# Gradient-boosted trees: print a label, then evaluate the test predictions
# with the GBT-specific evaluator my_eval2. The metric value is shown as the
# cell's output.
print('GBT')
my_eval2.evaluate(gbt_preds)

In [26]:
#accuracy
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [27]:
# Accuracy evaluator: compares the predicted label against 'PrivateIndex'.
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy', labelCol='PrivateIndex')

In [28]:
# Accuracy of the random forest on the held-out test predictions.
rfc_acc = acc_eval.evaluate(rfc_preds)

In [29]:
# Display the random-forest accuracy as the cell's output.
rfc_acc