In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('collegeTree').getOrCreate()

## Download raw data and take a look

In [2]:
data = spark.read.csv('College.csv', inferSchema=True, header=True)

In [3]:
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [4]:
data.show(5)

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|
|      Adrian College|    Yes|1428|  1097|   336|       22|       50|       1036|         99|  

## Create VectorAssembler of the data to feed to the model

In [5]:
from pyspark.ml.feature import VectorAssembler

In [23]:
vecCols = data.columns
# remove columns with type String
for delCol in ['School','Private']:
    vecCols.remove(delCol)
vecCols

['Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [55]:
assembler = VectorAssembler(inputCols=vecCols, outputCol='features')
data_assem = assembler.transform(data).select('School','Private','features')

In [56]:
data_assem.show(5)

+--------------------+-------+--------------------+
|              School|Private|            features|
+--------------------+-------+--------------------+
|Abilene Christian...|    Yes|[1660.0,1232.0,72...|
|  Adelphi University|    Yes|[2186.0,1924.0,51...|
|      Adrian College|    Yes|[1428.0,1097.0,33...|
| Agnes Scott College|    Yes|[417.0,349.0,137....|
|Alaska Pacific Un...|    Yes|[193.0,146.0,55.0...|
+--------------------+-------+--------------------+
only showing top 5 rows



## Change the target column 'Private' from String to Int

In [57]:
from pyspark.ml.feature import StringIndexer

In [58]:
indexer = StringIndexer(inputCol='Private', outputCol='PrivateIndex')

In [59]:
data_assem = indexer.fit(data_assem).transform(data_assem)
data_assem.select('Private','PrivateIndex').distinct().show()

+-------+------------+
|Private|PrivateIndex|
+-------+------------+
|    Yes|         0.0|
|     No|         1.0|
+-------+------------+



In [62]:
# build a dataframe that will be fed to the model
data_feed = data_assem.select('School','PrivateIndex','features')
data_feed.show(5)

+--------------------+------------+--------------------+
|              School|PrivateIndex|            features|
+--------------------+------------+--------------------+
|Abilene Christian...|         0.0|[1660.0,1232.0,72...|
|  Adelphi University|         0.0|[2186.0,1924.0,51...|
|      Adrian College|         0.0|[1428.0,1097.0,33...|
| Agnes Scott College|         0.0|[417.0,349.0,137....|
|Alaska Pacific Un...|         0.0|[193.0,146.0,55.0...|
+--------------------+------------+--------------------+
only showing top 5 rows



In [69]:
train_data, test_data = data_feed.randomSplit([0.7, 0.3])

## Build tree models and fit/transform the data

In [64]:
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier, RandomForestClassifier

In [65]:
from pyspark.ml import Pipeline

In [135]:
dtc = DecisionTreeClassifier(labelCol='PrivateIndex', featuresCol='features')
rfc = RandomForestClassifier(labelCol='PrivateIndex', featuresCol='features')
gbt = GBTClassifier(labelCol='PrivateIndex', featuresCol='features')

In [136]:
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

In [137]:
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)

## Let's see how the predictions look like

In [138]:
rfc_preds.printSchema()

root
 |-- School: string (nullable = true)
 |-- PrivateIndex: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)



In [139]:
rfc_preds.show(5)

+--------------------+------------+--------------------+--------------------+--------------------+----------+
|              School|PrivateIndex|            features|       rawPrediction|         probability|prediction|
+--------------------+------------+--------------------+--------------------+--------------------+----------+
|      Adrian College|         0.0|[1428.0,1097.0,33...|[19.8769658552242...|[0.99384829276121...|       0.0|
| Agnes Scott College|         0.0|[417.0,349.0,137....|[19.8389253516665...|[0.99194626758332...|       0.0|
|Albertus Magnus C...|         0.0|[353.0,340.0,103....|[19.8769658552242...|[0.99384829276121...|       0.0|
|Alderson-Broaddus...|         0.0|[582.0,498.0,172....|[19.6904101968832...|[0.98452050984416...|       0.0|
|   Allegheny College|         0.0|[2652.0,1900.0,48...|[19.8343536032059...|[0.99171768016029...|       0.0|
+--------------------+------------+--------------------+--------------------+--------------------+----------+
only showi

# Evaluation

In [140]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [141]:
binary_eval = BinaryClassificationEvaluator(labelCol='PrivateIndex')

In [142]:
print('DTC prediction AUC:', binary_eval.evaluate(dtc_preds))
print('RFC prediction AUC:', binary_eval.evaluate(rfc_preds))
print('GBT prediction AUC:', binary_eval.evaluate(gbt_preds))

DTC prediction AUC: 0.9339631336405531
RFC prediction AUC: 0.9789861751152075
GBT prediction AUC: 0.9491705069124432


## Note that default settings were used, more tweaking can be done eapecially with RFC, GBTC

## Let's try modifying RFC

In [143]:
rfc_2 = RandomForestClassifier(labelCol='PrivateIndex', featuresCol='features', numTrees=200)

In [144]:
rfc_preds_2 = gbt_2.fit(train_data).transform(test_data)

In [145]:
print('RFC_2 prediction AUC:', binary_eval.evaluate(rfc_preds_2))

RFC_2 prediction AUC: 0.9818433179723502


## Try MulticlassClassificationEvaluator (has more options e.g. accuracy, precision, recall)

In [146]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [150]:
acc_eval = MulticlassClassificationEvaluator(labelCol='PrivateIndex', metricName='accuracy')

In [153]:
print('DTC prediction accuracy:', acc_eval.evaluate(dtc_preds))
print('RFC prediction accuracy:', acc_eval.evaluate(rfc_preds))
print('GBT prediction accuracy:', acc_eval.evaluate(gbt_preds))
print('-'*50)
print('RFC_2 prediction accuracy:', multi_eval.evaluate(rfc_preds_2))

DTC prediction accuracy: 0.9282700421940928
RFC prediction accuracy: 0.9409282700421941
GBT prediction accuracy: 0.9029535864978903
--------------------------------------------------
RFC_2 prediction accuracy: 0.9440445657869335
