In [1]:
from pyspark.sql import SparkSession
# Create (or reuse an existing) Spark session named 'mytree' for this notebook.
spark = (
    SparkSession.builder
    .appName('mytree')
    .getOrCreate()
)

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import (RandomForestClassifier,
                                      GBTClassifier,
                                      DecisionTreeClassifier)

In [3]:
# Read the sample dataset, which is stored in libsvm format.
data = spark.read.load('/FileStore/tables/sample_libsvm_data.txt', format='libsvm')

In [4]:
# Preview the DataFrame that was just loaded.
data.show()

In [5]:
# Split 70/30 into training and test sets.
# A fixed seed keeps the split (and every metric computed below) reproducible
# across "Restart & Run All"; without it each run evaluates on different rows.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [6]:
# Instantiate the three tree-based classifiers with their default settings.
dtc, rfc, gbt = (
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GBTClassifier(),
)

In [7]:
# Fit each classifier on the training split, in the order DTC, RFC, GBT.
dtc_model, rfc_model, gbt_model = [
    classifier.fit(train_data) for classifier in (dtc, rfc, gbt)
]

In [8]:
# Score the held-out test split with every fitted model.
dtc_preds, rfc_preds, gbt_preds = [
    fitted.transform(test_data) for fitted in (dtc_model, rfc_model, gbt_model)
]

In [9]:
# Inspect the columns that the GBT model's transform produced for the test rows.
gbt_preds.show()

In [10]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [11]:
# Accuracy evaluator; the column names are spelled out but match the library defaults.
acc_eval = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='accuracy',
)

In [12]:
# Accuracy of the decision tree on the test predictions.
dtc_accuracy = acc_eval.evaluate(dtc_preds)
print('DTC Accuracy:', dtc_accuracy)

In [13]:
# Accuracy of the random forest on the test predictions.
rfc_accuracy = acc_eval.evaluate(rfc_preds)
print('RFC Accuracy:', rfc_accuracy)

In [14]:
# Accuracy of the gradient-boosted trees on the test predictions.
gbt_accuracy = acc_eval.evaluate(gbt_preds)
print('GBT Accuracy:', gbt_accuracy)

In [15]:
# Display the feature importances stored on the fitted random forest model.
rfc_model.featureImportances

In [16]:
# Let's work through a more practical example: the College dataset.
# NOTE: this rebinds `data`, replacing the libsvm DataFrame loaded earlier.
data = spark.read.csv(
    '/FileStore/tables/College.csv',
    header=True,
    inferSchema=True,
)

In [17]:
# List the column names of the College DataFrame.
data.columns

In [18]:
# Look at the first row to sanity-check the parsed values.
data.head(1)

In [19]:
from pyspark.ml.feature import VectorAssembler

In [20]:
# Input columns that get packed into the single 'features' vector column.
feature_cols = [
    'Apps',
    'Accept',
    'Enroll',
    'Top10perc',
    'Top25perc',
    'F_Undergrad',
    'P_Undergrad',
    'Outstate',
    'Room_Board',
    'Books',
    'Personal',
    'PhD',
    'Terminal',
    'S_F_Ratio',
    'perc_alumni',
    'Expend',
    'Grad_Rate',
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

In [21]:
# Apply the assembler so each row gains the combined 'features' column.
output = assembler.transform(data)

In [22]:
from pyspark.ml.feature import StringIndexer

In [23]:
# Index the string column 'Private' into a new 'PrivateIndex' column.
indexer = StringIndexer(
    inputCol='Private',
    outputCol='PrivateIndex',
)

In [24]:
# Fit the indexer on the assembled data, then apply it to that same data.
indexer_model = indexer.fit(output)
output_indexed = indexer_model.transform(output)

In [25]:
# Print the schema to confirm the 'features' and 'PrivateIndex' columns were added.
output_indexed.printSchema()

In [26]:
# Keep only the feature vector and the indexed label for modelling.
model_cols = ['features', 'PrivateIndex']
final_data = output_indexed.select(*model_cols)

In [27]:
# Split the College data 70/30 into training and test sets (rebinds the earlier names).
# A fixed seed keeps the split, and therefore the reported scores, reproducible
# between runs; an unseeded split changes the test rows on every execution.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [28]:
from pyspark.ml.regression import RandomForestRegressor

In [29]:
from pyspark.ml import Pipeline

In [30]:
# All three classifiers read the same label and feature columns.
column_params = {'labelCol': 'PrivateIndex', 'featuresCol': 'features'}
dtc = DecisionTreeClassifier(**column_params)
rfc = RandomForestClassifier(**column_params)
GBT = GBTClassifier(**column_params)

In [31]:
# Fit each College classifier on the training split, in the order DTC, RFC, GBT.
dtc_model, rfc_model, gbt_model = [
    classifier.fit(train_data) for classifier in (dtc, rfc, GBT)
]

In [32]:
# Generate test-set predictions from every fitted College model.
dtc_preds, rfc_preds, gbt_preds = [
    fitted.transform(test_data) for fitted in (dtc_model, rfc_model, gbt_model)
]

In [33]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [34]:
# Binary evaluator on the indexed label. The metric is written out explicitly;
# 'areaUnderROC' is the library default, so the result is unchanged.
my_binary_eval = BinaryClassificationEvaluator(
    labelCol='PrivateIndex',
    metricName='areaUnderROC',
)

In [35]:
print('DTC')
# my_binary_eval is built without a metricName, so it reports area under the
# ROC curve (the evaluator's default) -- not accuracy. Label the value accordingly.
print('My binary classification AUC (areaUnderROC):', my_binary_eval.evaluate(dtc_preds))

In [36]:
print('RFC')
# my_binary_eval is built without a metricName, so it reports area under the
# ROC curve (the evaluator's default) -- not accuracy. Label the value accordingly.
print('My binary classification AUC (areaUnderROC):', my_binary_eval.evaluate(rfc_preds))

In [37]:
print('GBT')
# my_binary_eval is built without a metricName, so it reports area under the
# ROC curve (the evaluator's default) -- not accuracy. Label the value accordingly.
print('My binary classification AUC (areaUnderROC):', my_binary_eval.evaluate(gbt_preds))