In [1]:
!pip install pyspark py4j

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=3a5222c5bacf269e3fc5a38b01559d9e13056805547efc6889f869e1bc0d859e
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


# Tree Methods Code Along

In this lecture we will code along with some data and test out 3 different tree methods:

A single decision tree

A random forest

A gradient boosted tree classifier

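We will be using a college dataset to try to classify colleges as Private or Public based on the features listed below. (In the CSV loaded by this notebook, the dots in these column names appear as underscores, e.g. `F_Undergrad`, `Room_Board`.)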

Private A factor with levels No and Yes indicating whether the university is public (No) or private (Yes); this is the label we predict, not an input feature

Apps Number of applications received

Accept Number of applications accepted

Enroll Number of new students enrolled

Top10perc Pct. new students from top 10% of H.S. class

Top25perc Pct. new students from top 25% of H.S. class

F.Undergrad Number of fulltime undergraduates

P.Undergrad Number of parttime undergraduates

Outstate Out-of-state tuition

Room.Board Room and board costs

Books Estimated book costs

Personal Estimated personal spending

PhD Pct. of faculty with Ph.D.’s

Terminal Pct. of faculty with terminal degree

S.F.Ratio Student/faculty ratio

perc.alumni Pct. alumni who donate

Expend Instructional expenditure per student

Grad.Rate Graduation rate


In [2]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session for this notebook.
spark = SparkSession.builder.appName('tree').getOrCreate()

# Load the college CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
data = spark.read.csv(
    '/content/sample_data/College.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Show every column with its inferred type to confirm the CSV parsed as expected.
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [4]:
# Peek at the first row to sanity-check the parsed values.
data.head(1)

[Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)]

In [5]:
from pyspark.ml.feature import VectorAssembler

In [6]:
# List the column names; all of them except 'School' and 'Private' are used
# as inputCols for the VectorAssembler.
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [7]:
# Numeric columns that are packed into the single 'features' vector.
# 'School' (an identifier) and 'Private' (the label) are left out.
feature_columns = [
    'Apps', 'Accept', 'Enroll',
    'Top10perc', 'Top25perc',
    'F_Undergrad', 'P_Undergrad',
    'Outstate', 'Room_Board', 'Books', 'Personal',
    'PhD', 'Terminal', 'S_F_Ratio',
    'perc_alumni', 'Expend', 'Grad_Rate',
]

assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [8]:
# Add the assembled 'features' vector column to the raw data.
output=assembler.transform(data)

In [9]:
from pyspark.ml.feature import StringIndexer

In [10]:
# Encode the string 'Private' column as a numeric 'PrivateIndex' column,
# which is what the classifiers below use as their label.
indexer=StringIndexer(inputCol='Private',outputCol='PrivateIndex')

In [11]:
# Fit the indexer on the assembled data, then apply it to add 'PrivateIndex'.
output_fixed=indexer.fit(output).transform(output)

In [12]:
# Confirm that the 'features' and 'PrivateIndex' columns were added.
output_fixed.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)



In [13]:
# Keep only the two columns the classifiers read: the feature vector and the label.
final_data=output_fixed.select('features','PrivateIndex')

In [14]:
# 70/30 train/test split. The fixed seed keeps the split, and therefore every
# metric computed below, the same when the notebook is re-run.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [15]:
from pyspark.ml.classification import (DecisionTreeClassifier,RandomForestClassifier,GBTClassifier)

In [16]:
from pyspark.ml import Pipeline

In [27]:
# The three tree-based estimators being compared. All of them read the
# 'features' vector and predict the 'PrivateIndex' label. Each one gets a
# fixed seed so that re-running the notebook trains the same models.
dtc = DecisionTreeClassifier(
    labelCol='PrivateIndex',
    featuresCol='features',
    seed=42,
)
rfc = RandomForestClassifier(
    numTrees=150,
    labelCol='PrivateIndex',
    featuresCol='features',
    seed=42,
)
gbt = GBTClassifier(
    labelCol='PrivateIndex',
    featuresCol='features',
    seed=42,
)

In [28]:
# Train each estimator on the training split (decision tree, random forest,
# gradient-boosted trees, in that order).
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train_data) for estimator in (dtc, rfc, gbt)
)

In [29]:
# Score the held-out test split with each fitted model.
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test_data)
    for fitted in (dtc_model, rfc_model, gbt_model)
)

In [30]:
# Print a heading followed by the first rows of each model's predictions.
for heading, frame in (
    ("dtc_preds:", dtc_preds),
    ("rfc_preds:", rfc_preds),
    ("gbt_preds:", gbt_preds),
):
    print(heading)
    frame.show()

dtc_preds:
+--------------------+------------+-------------+-----------+----------+
|            features|PrivateIndex|rawPrediction|probability|prediction|
+--------------------+------------+-------------+-----------+----------+
|[81.0,72.0,51.0,3...|         0.0|   [15.0,0.0]|  [1.0,0.0]|       0.0|
|[100.0,90.0,35.0,...|         0.0|  [286.0,0.0]|  [1.0,0.0]|       0.0|
|[167.0,130.0,46.0...|         0.0|  [286.0,0.0]|  [1.0,0.0]|       0.0|
|[191.0,165.0,63.0...|         0.0|  [286.0,0.0]|  [1.0,0.0]|       0.0|
|[193.0,146.0,55.0...|         0.0|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|[212.0,197.0,91.0...|         0.0|  [286.0,0.0]|  [1.0,0.0]|       0.0|
|[213.0,155.0,75.0...|         0.0|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|[222.0,185.0,91.0...|         0.0|  [286.0,0.0]|  [1.0,0.0]|       0.0|
|[232.0,182.0,99.0...|         0.0|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|[279.0,276.0,126....|         0.0|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|[283.0,201.0,97.0...|         0.0|  [28

In [31]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [32]:
# Binary evaluator scored against the 'PrivateIndex' label. No metricName is
# given, so the library default applies (areaUnderROC per the PySpark docs;
# NOTE(review): confirm for the installed version).
my_binary_eval=BinaryClassificationEvaluator(labelCol='PrivateIndex')

In [33]:
# Evaluate each model's test predictions with the binary evaluator, then
# report the three scores together.
binary_scores = [
    my_binary_eval.evaluate(frame)
    for frame in (dtc_preds, rfc_preds, gbt_preds)
]
print('DTC: {} \nRFC: {} \nGBT: {}'.format(*binary_scores))

DTC: 0.9035727646838758 
RFC: 0.9774597830153386 
GBT: 0.913112607557052


In [34]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [36]:
# Evaluator that reports plain accuracy against the 'PrivateIndex' label.
acc_eval=MulticlassClassificationEvaluator(labelCol='PrivateIndex',
                                           metricName='accuracy')

In [37]:
# Test-set accuracy for each model, evaluated in the order dtc, rfc, gbt.
dtc_acc, rfc_acc, gbt_acc = (
    acc_eval.evaluate(frame)
    for frame in (dtc_preds, rfc_preds, gbt_preds)
)

In [38]:
# Report the three test-set accuracies, one per line.
accuracy_report = 'dtc accuracy: {} \nrfc accuracy: {} \ngbt accuracy: {}'
print(accuracy_report.format(dtc_acc, rfc_acc, gbt_acc))

dtc accuracy: 0.9122807017543859 
rfc accuracy: 0.9254385964912281 
gbt accuracy: 0.9122807017543859
