In [1]:
import findspark
findspark.init('/home/danial/spark-3.3.2-bin-hadoop3')

In [2]:
from pyspark.sql import SparkSession

In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import (RandomForestClassifier,
                                       GBTClassifier,
                                       DecisionTreeClassifier)

In [6]:
spark = SparkSession.builder.appName('trees').getOrCreate()

In [7]:
path = '/home/danial/Desktop/myspark/Apache-Spark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Tree_Methods/'

In [8]:
data = spark.read.format('libsvm').load(path + 'sample_libsvm_data.txt')

23/04/10 14:33:41 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


In [9]:
data.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [10]:
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [11]:
train_data, test_data = data.randomSplit([0.7, 0.3])

In [12]:
train_data.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|                77|
|   mean|0.5454545454545454|
| stddev|0.5011947448335864|
|    min|               0.0|
|    max|               1.0|
+-------+------------------+



In [15]:
test_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                 23|
|   mean| 0.6521739130434783|
| stddev|0.48698475355767396|
|    min|                0.0|
|    max|                1.0|
+-------+-------------------+



In [16]:
data.count()

100

In [18]:
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier(numTrees=100)
gbt = GBTClassifier()

In [19]:
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

                                                                                

In [21]:
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)

In [22]:
dtc_preds.show()

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[95,96,97,12...|   [35.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[122,123,148...|   [35.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [35.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [35.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [35.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [35.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [35.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [35.0,0.0]|  [1.0,0.0]|       0.0|
|  1.0|(692,[123,124,125...|   [0.0,41.0]|  [0.0,1.0]|       1.0|
|  1.0|(692,[123,124,125...|   [0.0,41.0]|  [0.0,1.0]|       1.0|
|  1.0|(692,[124,125,126...|   [0.0,41.0]|  [0.0,1.0]|       1.0|
|  1.0|(692,[125,126,127...|   [0.0,41.0]|  [0.0,1.0]|       1.0|
|  1.0|(69

In [23]:
rfc_preds.show()

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[95,96,97,12...|   [98.0,2.0]|[0.98,0.02]|       0.0|
|  0.0|(692,[122,123,148...|  [82.0,18.0]|[0.82,0.18]|       0.0|
|  0.0|(692,[123,124,125...|  [100.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [98.0,2.0]|[0.98,0.02]|       0.0|
|  0.0|(692,[124,125,126...|  [85.0,15.0]|[0.85,0.15]|       0.0|
|  0.0|(692,[124,125,126...|   [95.0,5.0]|[0.95,0.05]|       0.0|
|  0.0|(692,[126,127,128...|   [99.0,1.0]|[0.99,0.01]|       0.0|
|  0.0|(692,[127,128,129...|   [94.0,6.0]|[0.94,0.06]|       0.0|
|  1.0|(692,[123,124,125...|  [0.0,100.0]|  [0.0,1.0]|       1.0|
|  1.0|(692,[123,124,125...|  [0.0,100.0]|  [0.0,1.0]|       1.0|
|  1.0|(692,[124,125,126...|  [0.0,100.0]|  [0.0,1.0]|       1.0|
|  1.0|(692,[125,126,127...|  [0.0,100.0]|  [0.0,1.0]|       1.0|
|  1.0|(69

In [25]:
gbt_preds.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[95,96,97,12...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[122,123,148...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[123,124,125...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[124,125,126...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[124,125,126...|[0.95256924138510...|[0.87047199588301...|       0.0|
|  0.0|(692,[124,125,126...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[127,128,129...|[0.91510992413135...|[0.86178791492802...|       0.0|
|  1.0|(692,[123,124,125...|[-1.5435020027249...|[0.04364652142729...|       1.0|
|  1.0|(692,[123

In [26]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [27]:
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')

In [28]:
print ('DTC Accuracy:')
acc_eval.evaluate(dtc_preds)

DTC Accuracy:


1.0

In [29]:
print ('RFC Accuracy:')
acc_eval.evaluate(rfc_preds)

RFC Accuracy:


1.0

In [30]:
print ('GBT Accuracy:')
acc_eval.evaluate(gbt_preds)

GBT Accuracy:


1.0

In [31]:
rfc_model.featureImportances

SparseVector(692, {98: 0.001, 99: 0.0005, 100: 0.0004, 102: 0.0009, 128: 0.0004, 156: 0.0005, 178: 0.0005, 187: 0.0003, 215: 0.0017, 216: 0.0064, 232: 0.0002, 234: 0.0084, 242: 0.0005, 243: 0.0005, 244: 0.0153, 245: 0.0008, 263: 0.009, 264: 0.0018, 268: 0.0017, 270: 0.0004, 271: 0.0061, 272: 0.0041, 275: 0.0017, 289: 0.0158, 290: 0.0022, 291: 0.0085, 296: 0.0008, 300: 0.0066, 301: 0.0169, 314: 0.0019, 316: 0.0031, 318: 0.0033, 321: 0.0004, 322: 0.0067, 323: 0.0005, 327: 0.0018, 328: 0.0152, 329: 0.0057, 345: 0.0165, 346: 0.0063, 347: 0.0005, 350: 0.0005, 351: 0.0369, 352: 0.0008, 355: 0.0005, 356: 0.0203, 357: 0.0014, 369: 0.0004, 370: 0.0003, 373: 0.0078, 377: 0.0241, 378: 0.0307, 379: 0.0094, 380: 0.0005, 381: 0.0014, 382: 0.0016, 383: 0.0003, 384: 0.0018, 386: 0.0011, 387: 0.0031, 400: 0.022, 401: 0.0004, 402: 0.0011, 403: 0.0016, 405: 0.0151, 406: 0.0209, 407: 0.0029, 408: 0.0038, 412: 0.0094, 413: 0.0073, 416: 0.001, 426: 0.002, 427: 0.0027, 428: 0.0007, 429: 0.0108, 431: 0.0006, 

In [32]:
# Code along example - university classifier as either private or public

In [33]:
path_to_college_data = path + 'College.csv'

In [34]:
path_to_college_data

'/home/danial/Desktop/myspark/Apache-Spark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Tree_Methods/College.csv'

In [35]:
spark = SparkSession.builder.appName('university').getOrCreate()

23/04/10 14:59:11 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [36]:
data = spark.read.csv(path_to_college_data, header=True, inferSchema=True)

In [37]:
data.show()

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|
|      Adrian College|    Yes|1428|  1097|   336|       22|       50|       1036|         99|  

In [38]:
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [42]:
data.head(1)

[Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)]

In [43]:
data.count()

777

In [44]:
from pyspark.ml.feature import VectorAssembler

In [45]:
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [46]:
assembler = VectorAssembler(inputCols=['Apps',
                                     'Accept',
                                     'Enroll',
                                     'Top10perc',
                                     'Top25perc',
                                     'F_Undergrad',
                                     'P_Undergrad',
                                     'Outstate',
                                     'Room_Board',
                                     'Books',
                                     'Personal',
                                     'PhD',
                                     'Terminal',
                                     'S_F_Ratio',
                                     'perc_alumni',
                                     'Expend',
                                     'Grad_Rate'], outputCol='features')

In [87]:
output = assembler.transform(data)

In [88]:
from pyspark.ml.feature import StringIndexer

In [89]:
indexer = StringIndexer(inputCol='Private', outputCol='PrivateIndex')

In [90]:
indexed = indexer.fit(output).transform(output)

In [91]:
indexed.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)



In [96]:
final_data = indexed.select(['PrivateIndex', 'features'])

In [97]:
final_data.show()

+------------+--------------------+
|PrivateIndex|            features|
+------------+--------------------+
|         0.0|[1660.0,1232.0,72...|
|         0.0|[2186.0,1924.0,51...|
|         0.0|[1428.0,1097.0,33...|
|         0.0|[417.0,349.0,137....|
|         0.0|[193.0,146.0,55.0...|
|         0.0|[587.0,479.0,158....|
|         0.0|[353.0,340.0,103....|
|         0.0|[1899.0,1720.0,48...|
|         0.0|[1038.0,839.0,227...|
|         0.0|[582.0,498.0,172....|
|         0.0|[1732.0,1425.0,47...|
|         0.0|[2652.0,1900.0,48...|
|         0.0|[1179.0,780.0,290...|
|         0.0|[1267.0,1080.0,38...|
|         0.0|[494.0,313.0,157....|
|         0.0|[1420.0,1093.0,22...|
|         0.0|[4302.0,992.0,418...|
|         0.0|[1216.0,908.0,423...|
|         0.0|[1130.0,704.0,322...|
|         1.0|[3540.0,2001.0,10...|
+------------+--------------------+
only showing top 20 rows



In [62]:
final_data.head(1)

[Row(PrivateIndex=0.0, features=DenseVector([1660.0, 1232.0, 721.0, 23.0, 52.0, 2885.0, 537.0, 7440.0, 3300.0, 450.0, 2200.0, 70.0, 78.0, 18.1, 12.0, 7041.0, 60.0]))]

In [98]:
train_set, test_set = final_data.randomSplit([0.7, 0.3])

In [99]:
from pyspark.ml.classification import (DecisionTreeClassifier, 
                                       RandomForestClassifier, 
                                       GBTClassifier)

In [100]:
dtc = DecisionTreeClassifier(labelCol='PrivateIndex')
rfc = RandomForestClassifier(labelCol='PrivateIndex')
gbt = GBTClassifier(labelCol='PrivateIndex')

In [101]:
dtc_model = dtc.fit(train_set)
rfc_model = rfc.fit(train_set)
gbt_model = gbt.fit(train_set)

In [103]:
dtc_preds = dtc_model.transform(test_set)
rfc_preds = rfc_model.transform(test_set)
gbt_preds = gbt_model.transform(test_set)

In [104]:
dtc_preds.show()

+------------+--------------------+-------------+-------------+----------+
|PrivateIndex|            features|rawPrediction|  probability|prediction|
+------------+--------------------+-------------+-------------+----------+
|         0.0|[212.0,197.0,91.0...|  [296.0,0.0]|    [1.0,0.0]|       0.0|
|         0.0|[213.0,166.0,85.0...|  [296.0,0.0]|    [1.0,0.0]|       0.0|
|         0.0|[244.0,198.0,82.0...|  [296.0,0.0]|    [1.0,0.0]|       0.0|
|         0.0|[257.0,183.0,109....|   [42.0,6.0]|[0.875,0.125]|       0.0|
|         0.0|[263.0,223.0,103....|  [296.0,0.0]|    [1.0,0.0]|       0.0|
|         0.0|[292.0,241.0,96.0...|  [296.0,0.0]|    [1.0,0.0]|       0.0|
|         0.0|[335.0,284.0,132....|  [296.0,0.0]|    [1.0,0.0]|       0.0|
|         0.0|[342.0,254.0,126....|  [296.0,0.0]|    [1.0,0.0]|       0.0|
|         0.0|[360.0,329.0,108....|  [296.0,0.0]|    [1.0,0.0]|       0.0|
|         0.0|[367.0,274.0,158....|  [296.0,0.0]|    [1.0,0.0]|       0.0|
|         0.0|[369.0,312.

In [106]:
rfc_preds.show()

+------------+--------------------+--------------------+--------------------+----------+
|PrivateIndex|            features|       rawPrediction|         probability|prediction|
+------------+--------------------+--------------------+--------------------+----------+
|         0.0|[212.0,197.0,91.0...|[19.8350980992367...|[0.99175490496183...|       0.0|
|         0.0|[213.0,166.0,85.0...|[18.8852536862284...|[0.94426268431142...|       0.0|
|         0.0|[244.0,198.0,82.0...|[19.8810849764779...|[0.99405424882389...|       0.0|
|         0.0|[257.0,183.0,109....|[19.2686286861439...|[0.96343143430719...|       0.0|
|         0.0|[263.0,223.0,103....|[18.8852536862284...|[0.94426268431142...|       0.0|
|         0.0|[292.0,241.0,96.0...|[19.8645098639426...|[0.99322549319713...|       0.0|
|         0.0|[335.0,284.0,132....|[19.8350980992367...|[0.99175490496183...|       0.0|
|         0.0|[342.0,254.0,126....|[19.8694118247269...|[0.99347059123634...|       0.0|
|         0.0|[360.0,

In [114]:
gbt_preds.printSchema()

root
 |-- PrivateIndex: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [115]:
gbt_preds.show()

+------------+--------------------+--------------------+--------------------+----------+
|PrivateIndex|            features|       rawPrediction|         probability|prediction|
+------------+--------------------+--------------------+--------------------+----------+
|         0.0|[212.0,197.0,91.0...|[1.54602463768475...|[0.95656359150486...|       0.0|
|         0.0|[213.0,166.0,85.0...|[1.54602463768475...|[0.95656359150486...|       0.0|
|         0.0|[244.0,198.0,82.0...|[1.54241362165213...|[0.95626252693697...|       0.0|
|         0.0|[257.0,183.0,109....|[1.36734034134045...|[0.93904231932323...|       0.0|
|         0.0|[263.0,223.0,103....|[1.56702972404876...|[0.95827600456705...|       0.0|
|         0.0|[292.0,241.0,96.0...|[1.54602463768475...|[0.95656359150486...|       0.0|
|         0.0|[335.0,284.0,132....|[1.54602463768475...|[0.95656359150486...|       0.0|
|         0.0|[342.0,254.0,126....|[1.54602463768475...|[0.95656359150486...|       0.0|
|         0.0|[360.0,

In [74]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [108]:
bin_eval = BinaryClassificationEvaluator(labelCol='PrivateIndex')

In [109]:
print ('DTC AUC')
bin_eval.evaluate(dtc_preds)

DTC AUC


0.9175424767910318

In [112]:
print ('RFC AUC')
bin_eval.evaluate(rfc_preds)

RFC AUC


0.9647486424943077

In [113]:
print ('GBT AUC')
bin_eval.evaluate(gbt_preds)

GBT AUC


0.9401383779996496

In [116]:
rfc_new = RandomForestClassifier(labelCol='PrivateIndex', numTrees=150)

In [120]:
rfc_new_model = rfc_new.fit(train_set)

In [121]:
rfc_new_model_preds = rfc_new_model.transform(test_set)

In [122]:
print ('RFC (num of trees = 150) AUC')
bin_eval.evaluate(rfc_new_model_preds)

RFC (num of trees = 150) AUC


0.9655806621124543

In [123]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [124]:
acc_eval = MulticlassClassificationEvaluator(labelCol='PrivateIndex', metricName='accuracy')

In [129]:
print ('DTC accuracy: ')
dtc_acc = acc_eval.evaluate(dtc_preds)
dtc_acc

DTC accuracy: 


0.899581589958159

In [130]:
print ('RFC accuracy: ')
rfc_acc = acc_eval.evaluate(rfc_preds)
rfc_acc

RFC accuracy: 


0.9121338912133892

In [131]:
print ('GBT accuracy: ')
gbt_acc = acc_eval.evaluate(gbt_preds)
gbt_acc

GBT accuracy: 


0.9079497907949791