# Decision Trees

## Basics

In [0]:
from pyspark.sql import SparkSession
# Obtain the SparkSession for this section; getOrCreate() hands back an
# already-running session when there is one, otherwise it starts a new one
# using the application name configured on the builder.
spark = (
    SparkSession.builder
    .appName('mytree')
    .getOrCreate()
)

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import (RandomForestClassifier,GBTClassifier,
                              DecisionTreeClassifier)

In [0]:
# Read the sample dataset with the 'libsvm' reader. The resulting DataFrame
# has a `label` column and a `features` vector column (see the output below).
data = (
    spark.read
    .format('libsvm')
    .load('/FileStore/tables/sample_libsvm_data.txt')
)

# Preview the first rows of the loaded data.
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [0]:
# Split the rows into a ~70% training set and a ~30% held-out test set.
# randomSplit samples randomly, so an explicit seed is passed to keep the
# partition -- and therefore every metric computed from it below -- the same
# when the notebook is re-run on the same input.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Three tree-based classifiers to compare on the same split. They rely on the
# default `label` / `features` column names, which match the loaded data.
# A fixed seed is supplied so that the randomized parts of training (e.g. the
# sampling done by the random forest) give the same models on every re-run.
dtc = DecisionTreeClassifier(seed=42)
rfc = RandomForestClassifier(numTrees=100, seed=42)  # 100 trees in the forest
gbt = GBTClassifier(seed=42)

In [0]:
# Fit every estimator on the training split, in the order DTC, RFC, GBT,
# and bind the resulting fitted models to their respective names.
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train_data) for estimator in (dtc, rfc, gbt)
)

In [0]:
# Score the held-out test split with each fitted model (DTC, RFC, GBT order);
# each result is the test DataFrame with the model's output columns appended.
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test_data)
    for fitted in (dtc_model, rfc_model, gbt_model)
)

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Evaluator that reports plain classification accuracy. The label and
# prediction column names are left at the evaluator's defaults.
acc_eval = MulticlassClassificationEvaluator(
    metricName='accuracy',
)

In [0]:
# Print a heading followed by the test-set accuracy for each model,
# in the order decision tree, random forest, gradient-boosted trees.
for heading, predictions in (
    ('DTC acc', dtc_preds),
    ('RFC acc', rfc_preds),
    ('GBT acc', gbt_preds),
):
    print(heading)
    print(acc_eval.evaluate(predictions))

DTC acc
0.8823529411764706
RFC acc
1.0
GBT acc
0.8823529411764706


In [0]:
# Display the per-feature importance scores of the fitted random forest.
# The recorded output is a SparseVector of size 692 mapping feature index
# to importance; indices that are not listed have no stored value.
rfc_model.featureImportances

Out[16]: SparseVector(692, {100: 0.0008, 128: 0.0013, 149: 0.0006, 178: 0.0017, 179: 0.0014, 189: 0.0008, 207: 0.0067, 208: 0.0059, 209: 0.0006, 210: 0.0005, 216: 0.0006, 233: 0.0008, 235: 0.0017, 236: 0.0015, 237: 0.0007, 243: 0.0022, 244: 0.0078, 262: 0.0093, 263: 0.0003, 271: 0.0012, 272: 0.0111, 273: 0.0085, 286: 0.0006, 290: 0.0068, 292: 0.0016, 293: 0.0004, 295: 0.0007, 296: 0.0007, 301: 0.0163, 319: 0.0015, 322: 0.0006, 323: 0.0094, 324: 0.0015, 327: 0.0037, 329: 0.0082, 351: 0.0202, 352: 0.0019, 356: 0.0152, 357: 0.0093, 373: 0.0033, 375: 0.0026, 377: 0.0083, 378: 0.0087, 379: 0.0448, 380: 0.0013, 381: 0.0009, 382: 0.0007, 384: 0.0079, 385: 0.0108, 386: 0.008, 399: 0.014, 400: 0.0264, 401: 0.0077, 402: 0.0056, 405: 0.0092, 407: 0.0254, 408: 0.0007, 410: 0.0006, 412: 0.0008, 413: 0.0077, 414: 0.0068, 426: 0.0072, 427: 0.0082, 428: 0.0008, 429: 0.0168, 430: 0.0007, 433: 0.0131, 434: 0.0621, 435: 0.0164, 436: 0.0007, 439: 0.0032, 440: 0.0092, 441: 0.009, 454: 0.0092, 455: 0.0091, 

## Code along

In [0]:
from pyspark.sql import SparkSession
# Obtain the SparkSession for the code-along section. getOrCreate() returns
# the existing session if one is already running in this kernel; otherwise a
# new session is started with the application name set on the builder.
spark = (
    SparkSession.builder
    .appName('tree')
    .getOrCreate()
)

In [0]:
# Load the college dataset from CSV: the first line is used as the header and
# column types are inferred from the values instead of defaulting to strings.
data = spark.read.csv(
    '/FileStore/tables/College.csv',
    header=True,
    inferSchema=True,
)

# Show the inferred column names and types.
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [0]:
from pyspark.ml.feature import VectorAssembler
# List the column names; the numeric ones are used as model features below.
data.columns

Out[4]: ['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [0]:
# Combine all numeric columns into a single `features` vector column.
# `School` (a name) and `Private` (the prediction target) are left out.
assembler = VectorAssembler(
    inputCols=[
        'Apps',
        'Accept',
        'Enroll',
        'Top10perc',
        'Top25perc',
        'F_Undergrad',
        'P_Undergrad',
        'Outstate',
        'Room_Board',
        'Books',
        'Personal',
        'PhD',
        'Terminal',
        'S_F_Ratio',
        'perc_alumni',
        'Expend',
        'Grad_Rate',
    ],
    outputCol='features',
)

In [0]:
# Apply the assembler: every row keeps its original columns and gains the
# assembled `features` vector column.
output = assembler.transform(data)
# Look at the first row to confirm the `features` column was added.
output.head()

Out[8]: Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60, features=DenseVector([1660.0, 1232.0, 721.0, 23.0, 52.0, 2885.0, 537.0, 7440.0, 3300.0, 450.0, 2200.0, 70.0, 78.0, 18.1, 12.0, 7041.0, 60.0]))

In [0]:
from pyspark.ml.feature import StringIndexer
# Indexer that converts the string column `Private` (values such as 'Yes')
# into a numeric column `PrivateIndex`, which is used as the label below.
indexer = StringIndexer(
    inputCol='Private',
    outputCol='PrivateIndex',
)

In [0]:
# Learn the string-to-index mapping from the assembled data, then apply it
# to the same data to add the numeric `PrivateIndex` column.
indexer_model = indexer.fit(output)
output_fixed = indexer_model.transform(output)

In [0]:
# Keep only what the models need: the feature vector and the numeric label.
final_data = output_fixed.select('features', 'PrivateIndex')

# Split into ~70% training / ~30% test rows. randomSplit samples randomly, so
# an explicit seed is passed to keep the partition -- and the metrics computed
# from it below -- the same when the notebook is re-run on the same input.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [0]:
from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier, DecisionTreeClassifier)
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline

In [0]:
# Three tree-based classifiers predicting `PrivateIndex`; the features column
# is left at its default name, which matches the assembler's `features` output.
# A fixed seed is supplied so that the randomized parts of training (e.g. the
# sampling done by the random forest) give the same models on every re-run.
dtc = DecisionTreeClassifier(labelCol='PrivateIndex', seed=42)
rfc = RandomForestClassifier(labelCol='PrivateIndex', seed=42)
gbt = GBTClassifier(labelCol='PrivateIndex', seed=42)

In [0]:
# Fit every estimator on the training split, in the order DTC, RFC, GBT,
# and bind the resulting fitted models to their respective names.
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train_data) for estimator in (dtc, rfc, gbt)
)

In [0]:
# Score the held-out test split with each fitted model (DTC, RFC, GBT order);
# each result is the test DataFrame with the model's output columns appended.
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test_data)
    for fitted in (dtc_model, rfc_model, gbt_model)
)

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Binary-classification evaluator scored against the `PrivateIndex` label.
# The metric and raw-prediction column are left at the evaluator's defaults
# (NOTE(review): the default metric is areaUnderROC per the PySpark docs --
# confirm for the Spark version in use).
my_binary_eval = BinaryClassificationEvaluator(
    labelCol='PrivateIndex',
)

In [0]:
# Print a heading followed by the binary evaluator's score for each model,
# in the order decision tree, random forest, gradient-boosted trees.
for heading, predictions in (
    ('DTC', dtc_preds),
    ('RFC', rfc_preds),
    ('GBT', gbt_preds),
):
    print(heading)
    print(my_binary_eval.evaluate(predictions))

DTC
0.9407887448764817
RFC
0.9633322255455852
GBT
0.9385731693807466


In [0]:
# Print the schema of the decision-tree and GBT prediction DataFrames to
# compare which output columns each model adds.
for predictions in (dtc_preds, gbt_preds):
    predictions.printSchema()

root
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)

root
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Evaluator that reports plain classification accuracy, comparing the
# `PrivateIndex` label against the evaluator's default prediction column.
acc_eval = MulticlassClassificationEvaluator(
    labelCol='PrivateIndex',
    metricName='accuracy',
)

In [0]:
# Print a heading followed by the test-set accuracy for each model,
# in the order decision tree, random forest, gradient-boosted trees.
for heading, predictions in (
    ('DTC acc', dtc_preds),
    ('RFC acc', rfc_preds),
    ('GBT acc', gbt_preds),
):
    print(heading)
    print(acc_eval.evaluate(predictions))

DTC acc
0.9009433962264151
RFC acc
0.9339622641509434
GBT acc
0.9292452830188679
