# Tree Method Examples

## Decision Trees and Random Forests

In [None]:
! pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 40 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 52.7 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=e2a4c041d1530c1bf1c16d79ea203fbf31c6f7cc987b22cda32f70e47016d088
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

In [None]:
# Start (or reuse) the Spark session used by the examples below.
spark = (
    SparkSession.builder
    .appName('tree')
    .getOrCreate()
)

In [None]:
# Read the sample dataset in LIBSVM format and preview its first rows.
libsvmReader = spark.read.format('libsvm')
data = libsvmReader.load('sample_libsvm_data.txt')
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [None]:
# Split the data into a training set (70%) and a test set (30%).
# A fixed seed keeps the split -- and therefore every metric computed from it --
# the same from one run of the notebook to the next.
trainData, testData = data.randomSplit([0.7, 0.3], seed=42)

# Summary statistics of the label column in the training split.
trainData.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|                74|
|   mean|0.5405405405405406|
| stddev|0.5017555260512748|
|    min|               0.0|
|    max|               1.0|
+-------+------------------+



In [None]:
# Summary statistics of the held-out test split, for comparison with the training split.
testSummary = testData.describe()
testSummary.show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                 26|
|   mean| 0.6538461538461539|
| stddev|0.48516452403758387|
|    min|                0.0|
|    max|                1.0|
+-------+-------------------+



In [None]:
# Configure the decision tree and random forest estimators.
# Nothing is trained here; training happens when fit() is called on them.
# A fixed seed makes the randomized parts of training (e.g. the forest's
# bootstrap sampling and feature subsetting) repeatable between runs.
dt = DecisionTreeClassifier(labelCol='label', featuresCol='features', maxDepth=3, seed=42)
rf = RandomForestClassifier(labelCol='label', featuresCol='features', numTrees=20, seed=42)

In [None]:
# Fit each estimator on the training split to obtain the trained models.
dtModel, rfModel = (estimator.fit(trainData) for estimator in (dt, rf))

In [None]:
# Score the test split with both fitted models.
dtPredictions, rfPredictions = (fitted.transform(testData) for fitted in (dtModel, rfModel))

In [None]:
# Preview the first five decision tree predictions.
dtPredictions.show(n=5)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[122,123,148...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[128,129,130...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
+-----+--------------------+-------------+-----------+----------+
only showing top 5 rows



In [None]:
# Preview the first five random forest predictions.
rfPredictions.show(n=5)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[122,123,148...|   [17.0,3.0]|[0.85,0.15]|       0.0|
|  0.0|(692,[123,124,125...|   [20.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [20.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [20.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[128,129,130...|   [20.0,0.0]|  [1.0,0.0]|       0.0|
+-----+--------------------+-------------+-----------+----------+
only showing top 5 rows



In [None]:
# Accuracy evaluator: compares the 'prediction' column against the true 'label' column.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='label',
    predictionCol='prediction',
)

In [None]:
# Accuracy of each model on the test split.
dtAccuracy, rfAccuracy = map(evaluator.evaluate, (dtPredictions, rfPredictions))

In [None]:
# Test error is the complement of accuracy.
dtTestError = 1.0 - dtAccuracy
rfTestError = 1.0 - rfAccuracy
print("Test Error Decision Trees = %g" % dtTestError)
print("Test Error Random Forests = %g" % rfTestError)

Test Error Decision Trees = 0
Test Error Random Forests = 0


In [None]:
# Display the per-feature importance scores of the fitted random forest.
# Not a very good example to show this: the 692 features of this dataset are
# identified only by index, so the scores cannot be tied to meaningful names.
rfModel.featureImportances

SparseVector(692, {100: 0.0024, 132: 0.002, 154: 0.0046, 180: 0.0039, 183: 0.0012, 215: 0.0074, 230: 0.0111, 238: 0.0019, 260: 0.0074, 263: 0.0426, 272: 0.0446, 273: 0.0288, 290: 0.0401, 295: 0.003, 300: 0.0132, 301: 0.0029, 350: 0.038, 351: 0.0477, 354: 0.0017, 380: 0.0027, 382: 0.0111, 383: 0.0054, 397: 0.0027, 401: 0.0254, 405: 0.047, 406: 0.0536, 426: 0.0019, 427: 0.0047, 434: 0.05, 436: 0.006, 454: 0.0118, 457: 0.012, 462: 0.0032, 471: 0.0023, 482: 0.0246, 483: 0.0314, 490: 0.0949, 512: 0.0427, 517: 0.0417, 523: 0.0023, 539: 0.0759, 540: 0.0188, 550: 0.0131, 577: 0.0357, 578: 0.0329, 596: 0.0351, 628: 0.006, 654: 0.0006})

## Gradient Boosted Trees

In [None]:
from pyspark.ml.classification import GBTClassifier

# Configure a gradient-boosted trees classifier with 10 boosting iterations.
# The fixed seed keeps any randomized step of training repeatable between runs.
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10, seed=42)

# Fit directly on the training split. No indexer or pipeline stage is involved:
# the data already provides a numeric 'label' and a vector 'features' column.
model = gbt.fit(trainData)

# Make predictions on the test split.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[122,123,148...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[128,129,130...|
+----------+-----+--------------------+
only showing top 5 rows



In [None]:
# Measure accuracy of the GBT predictions and report the error rate (1 - accuracy).
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
testError = 1.0 - accuracy
print("Test Error = %g" % testError)

Test Error = 0


# Private and Public School Prediction

We will use a college dataset to classify colleges as private or public based on the following features:

    Private A factor with levels No and Yes indicating private or public university
    Apps Number of applications received
    Accept Number of applications accepted
    Enroll Number of new students enrolled
    Top10perc Pct. new students from top 10% of H.S. class
    Top25perc Pct. new students from top 25% of H.S. class
    F.Undergrad Number of fulltime undergraduates
    P.Undergrad Number of parttime undergraduates
    Outstate Out-of-state tuition
    Room.Board Room and board costs
    Books Estimated book costs
    Personal Estimated personal spending
    PhD Pct. of faculty with Ph.D.’s
    Terminal Pct. of faculty with terminal degree
    S.F.Ratio Student/faculty ratio
    perc.alumni Pct. alumni who donate
    Expend Instructional expenditure per student
    Grad.Rate Graduation rate

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler 
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Obtain the Spark session; getOrCreate() returns an already-running session if there is one.
sessionBuilder = SparkSession.builder.appName('treecode')
spark = sessionBuilder.getOrCreate()

# Load the college dataset: the first row is the header and column types are inferred.
data = spark.read.csv('College.csv', header=True, inferSchema=True)

# Show the inferred column names and types.
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [None]:
# Preview the first rows of the college data (20 rows, long values truncated).
data.show(n=20, truncate=True)

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|
|      Adrian College|    Yes|1428|  1097|   336|       22|       50|       1036|         99|  

## Spark Data Formatting

In [None]:
# List the column names of the loaded college data.
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [None]:
# Numeric columns used as model inputs. 'School' (the name) and 'Private'
# (the target) are deliberately left out.
featureColumns = [
    'Apps', 'Accept', 'Enroll',
    'Top10perc', 'Top25perc',
    'F_Undergrad', 'P_Undergrad',
    'Outstate', 'Room_Board', 'Books', 'Personal',
    'PhD', 'Terminal', 'S_F_Ratio',
    'perc_alumni', 'Expend', 'Grad_Rate',
]

# Packs the columns above into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=featureColumns, outputCol='features')

In [None]:
# Append the assembled 'features' vector column to the college data.
outputAssembler = assembler.transform(dataset=data)

In [None]:
# Encode the string target 'Private' as the numeric column 'PrivateIdx'.
indexer = StringIndexer(inputCol='Private', outputCol='PrivateIdx')
indexerModel = indexer.fit(outputAssembler)
outputFixed = indexerModel.transform(outputAssembler)

# Keep only the two columns the classifiers need, then preview them.
finalData = outputFixed.select('features', 'PrivateIdx')
finalData.show(5)

+--------------------+----------+
|            features|PrivateIdx|
+--------------------+----------+
|[1660.0,1232.0,72...|       0.0|
|[2186.0,1924.0,51...|       0.0|
|[1428.0,1097.0,33...|       0.0|
|[417.0,349.0,137....|       0.0|
|[193.0,146.0,55.0...|       0.0|
+--------------------+----------+
only showing top 5 rows



In [None]:
# Split the prepared data into a training set (70%) and a test set (30%).
# A fixed seed keeps the split, and the accuracies compared later, the same
# from one run of the notebook to the next.
trainData, testData = finalData.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Configure the three tree-based classifiers to compare: a single decision tree,
# a random forest, and gradient-boosted trees. All use default hyperparameters
# and predict 'PrivateIdx' from 'features'. Nothing is trained here; training
# happens when fit() is called. The fixed seed makes the randomized parts of
# training repeatable, so the accuracy comparison does not change between runs.
dtc = DecisionTreeClassifier(labelCol='PrivateIdx', featuresCol='features', seed=42)
rfc = RandomForestClassifier(labelCol='PrivateIdx', featuresCol='features', seed=42)
gbt = GBTClassifier(labelCol='PrivateIdx', featuresCol='features', seed=42)

In [None]:
# Fit every classifier on the training split, in the order tree, forest, GBT.
dtcModel, rfcModel, gbtModel = (clf.fit(trainData) for clf in (dtc, rfc, gbt))

## Model Comparison

In [None]:
# Score the test split with each fitted model.
dtcPredictions, rfcPredictions, gbtPredictions = (
    fitted.transform(testData) for fitted in (dtcModel, rfcModel, gbtModel)
)

In [None]:
# Accuracy evaluator: compares 'prediction' against the indexed target 'PrivateIdx'.
accEvaluator = MulticlassClassificationEvaluator(
    labelCol='PrivateIdx',
    predictionCol='prediction',
    metricName='accuracy',
)

# Test-set accuracy of each model.
dtcAccuracy, rfcAccuracy, gbtAccuracy = map(
    accEvaluator.evaluate,
    (dtcPredictions, rfcPredictions, gbtPredictions),
)

In [None]:
# Report each model's test accuracy as a percentage, with a rule before each line.
separator = '-'*80
resultLines = [
    'A single decision tree had an accuracy of: {0:2.2f}%'.format(dtcAccuracy*100),
    'A random forest ensemble had an accuracy of: {0:2.2f}%'.format(rfcAccuracy*100),
    'A ensemble using GBT had an accuracy of: {0:2.2f}%'.format(gbtAccuracy*100),
]

print("Here are the results!")
for resultLine in resultLines:
    print(separator)
    print(resultLine)

Here are the results!
--------------------------------------------------------------------------------
A single decision tree had an accuracy of: 91.46%
--------------------------------------------------------------------------------
A random forest ensemble had an accuracy of: 94.31%
--------------------------------------------------------------------------------
A ensemble using GBT had an accuracy of: 91.06%
