In [1]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('tree_models').getOrCreate()

In [2]:
data = spark.read.csv("D:\\Udemy course\\Spark\\MLLib\\College.csv", inferSchema=True, header=True)
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [3]:
#setting up predictors into a vector....
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate'], outputCol='features')

In [4]:
#transforming the data into vector....
output = assembler.transform(data)

In [5]:
#converting the categorical target variable "Private" into numeric....
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol='Private',outputCol='privateindex')
new_df = indexer.fit(output).transform(output)

In [6]:
#final predictors and target variable....
final_data = new_df.select(['features','privateindex'])
final_data.show()

+--------------------+------------+
|            features|privateindex|
+--------------------+------------+
|[1660.0,1232.0,72...|         0.0|
|[2186.0,1924.0,51...|         0.0|
|[1428.0,1097.0,33...|         0.0|
|[417.0,349.0,137....|         0.0|
|[193.0,146.0,55.0...|         0.0|
|[587.0,479.0,158....|         0.0|
|[353.0,340.0,103....|         0.0|
|[1899.0,1720.0,48...|         0.0|
|[1038.0,839.0,227...|         0.0|
|[582.0,498.0,172....|         0.0|
|[1732.0,1425.0,47...|         0.0|
|[2652.0,1900.0,48...|         0.0|
|[1179.0,780.0,290...|         0.0|
|[1267.0,1080.0,38...|         0.0|
|[494.0,313.0,157....|         0.0|
|[1420.0,1093.0,22...|         0.0|
|[4302.0,992.0,418...|         0.0|
|[1216.0,908.0,423...|         0.0|
|[1130.0,704.0,322...|         0.0|
|[3540.0,2001.0,10...|         1.0|
+--------------------+------------+
only showing top 20 rows



In [7]:
#split data
train,test = final_data.randomSplit([0.8,0.2])

In [97]:
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.classification import (DecisionTreeClassifier, RandomForestClassifier, GBTClassifier)
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

### Decision tree

####  baseline model

In [84]:
dTree = DecisionTreeClassifier(labelCol='privateindex', featuresCol='features')

# Set F-1 score as evaluation metric for best model selection
evaluator = MulticlassClassificationEvaluator(labelCol='privateindex',
                                              predictionCol='prediction', metricName="f1")    

dtc_model = dTree.fit(train)
dtc_pred = dtc_model.transform(test)

In [87]:
print(evaluator.getMetricName(), 'accuracy:', evaluator.evaluate(dtc))
print("")
import pandas as pd
predictions = dtc_pred.select('privateindex', 'prediction')
pred = predictions.toPandas()
print("Confusion matrix....")
print(pd.crosstab(pred['privateindex'],pred['prediction']))
print("")
print("Classification report.....")
from sklearn.metrics import classification_report
print(classification_report(pred['privateindex'],pred['prediction']))

f1 accuracy: 0.9086654520029194

Confusion matrix....
prediction    0.0  1.0
privateindex          
0.0           114    7
1.0             9   47

Classification report.....
             precision    recall  f1-score   support

        0.0       0.93      0.94      0.93       121
        1.0       0.87      0.84      0.85        56

avg / total       0.91      0.91      0.91       177



In [102]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluation_accu = BinaryClassificationEvaluator(labelCol='privateindex')
print("DecisionTree test accuracy: {0}".format(evaluation_accu.evaluate(dtc_pred)))

DecisionTree test accuracy: 0.9341794569067295


####  DecisionTree tuned model - cross validation

In [93]:
dTree = DecisionTreeClassifier(labelCol='privateindex', featuresCol='features')
pipeline = Pipeline(stages=[ dTree])

# Search through decision tree's parameters for best model - 6 parameters to choose from......
paramGrid = ParamGridBuilder().addGrid(dTree.maxDepth, [2,3,4,5,6,7]).addGrid(dTree.minInstancesPerNode,[1,2,3,4,5]).addGrid(dTree.minInfoGain,[0.0,0.1,0.2,0.3,0.4,0.5]).build()

# Set F-1 score as evaluation metric for best model selection
evaluator = MulticlassClassificationEvaluator(labelCol='privateindex',
                                              predictionCol='prediction', metricName="f1")    

# Set up 3-fold cross validation
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

CV_dtc_model = crossval.fit(train)

In [94]:
tree_model = CV_dtc_model.bestModel.stages[-1]
print(tree_model)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_432d93d0dc9faa25d9d8) of depth 3 with 15 nodes


In [95]:
dtc_tuned = CV_dtc_model.transform(test)
print(evaluator.getMetricName(), 'accuracy:', evaluator.evaluate(dtc_tuned))
print("")
print("Confusion matrix....")
import pandas as pd
predictions = dtc_tuned.select('privateindex', 'prediction')
pred = predictions.toPandas()
print(pd.crosstab(pred['privateindex'],pred['prediction']))
print("")
print("Classification report.....")
print(classification_report(pred['privateindex'],pred['prediction']))

f1 accuracy: 0.9041805171152231

Confusion matrix....
prediction    0.0  1.0
privateindex          
0.0           112    9
1.0             8   48

Classification report.....
             precision    recall  f1-score   support

        0.0       0.93      0.93      0.93       121
        1.0       0.84      0.86      0.85        56

avg / total       0.90      0.90      0.90       177



In [103]:
print("Tuned DecisionTree test accuracy: {0}".format(evaluation_accu.evaluate(dtc_tuned)))

Tuned DecisionTree test accuracy: 0.9637691853600945


### Random Forest

#### baseline model 

In [98]:
rfc = RandomForestClassifier(labelCol='privateindex', featuresCol='features')

evaluator = MulticlassClassificationEvaluator(labelCol='privateindex',
                                              predictionCol='prediction', metricName="f1")    

rfc_model = rfc.fit(train)
rfc_pred = rfc_model.transform(test)

In [99]:
print(evaluator.getMetricName(), 'accuracy:', evaluator.evaluate(rfc_pred))
print("")
import pandas as pd
predictions = rfc_pred.select('privateindex', 'prediction')
pred = predictions.toPandas()
print("Confusion matrix....")
print(pd.crosstab(pred['privateindex'],pred['prediction']))
print("")
print("Classification report.....")
from sklearn.metrics import classification_report
print(classification_report(pred['privateindex'],pred['prediction']))

f1 accuracy: 0.9311047381288879

Confusion matrix....
prediction    0.0  1.0
privateindex          
0.0           118    3
1.0             9   47

Classification report.....
             precision    recall  f1-score   support

        0.0       0.93      0.98      0.95       121
        1.0       0.94      0.84      0.89        56

avg / total       0.93      0.93      0.93       177



In [104]:
print("RandomForest test accuracy: {0}".format(evaluation_accu.evaluate(rfc_pred)))

RandomForest test accuracy: 0.9818476977567887


#### random forest tuned - cross validation

In [100]:
rfc_tuned = RandomForestClassifier(labelCol='privateindex', featuresCol='features')
pipeline = Pipeline(stages=[rfc_tuned])

paramGrid = ParamGridBuilder().addGrid(rfc_tuned.numTrees, [30,60,90,120,150]).addGrid(rfc_tuned.maxDepth,[2,3,4,5,6,7,8,9]).build()

evaluator = MulticlassClassificationEvaluator(labelCol='privateindex',
                                              predictionCol='prediction', metricName="f1")    

# Set up 3-fold cross validation
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

CV_rfc_model = crossval.fit(train)

In [105]:
rfc_test_tuned = CV_rfc_model.transform(test)
print("Tuned RandomForest test accuracy: {0}".format(evaluation_accu.evaluate(rfc_test_tuned)))
print(evaluator.getMetricName(), 'accuracy:', evaluator.evaluate(rfc_test_tuned))
print("")
print("Confusion matrix....")
import pandas as pd
predictions = rfc_test_tuned.select('privateindex', 'prediction')
pred = predictions.toPandas()
print(pd.crosstab(pred['privateindex'],pred['prediction']))
print("")
print("Classification report.....")
print(classification_report(pred['privateindex'],pred['prediction']))

Tuned RandomForest test accuracy: 0.9819952774498228
f1 accuracy: 0.9141319467380143

Confusion matrix....
prediction    0.0  1.0
privateindex          
0.0           116    5
1.0            10   46

Classification report.....
             precision    recall  f1-score   support

        0.0       0.92      0.96      0.94       121
        1.0       0.90      0.82      0.86        56

avg / total       0.91      0.92      0.91       177

