In [1]:
# Inject a small stylesheet into the notebook page: remove the top padding of
# #notebook, stretch .container to the full width and collapse .end_space.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

display(HTML(
    '<style>'
        '#notebook { padding-top:0px !important; } '
        '.container { width:100% !important; } '
        '.end_space { min-height:0px !important; } '
    '</style>'
))

In [2]:
import findspark
# findspark.init() is called before the pyspark import on purpose
# (presumably it makes the local Spark installation importable -- confirm
# against the findspark docs for your environment).
findspark.init()

from pyspark import SparkConf, SparkContext

# Single local master, application name 'appConf'.
conf = SparkConf().setAppName('appConf').setMaster('local')

# getOrCreate() returns the already-running context when there is one, so
# re-executing this cell no longer fails the way a second direct
# SparkContext(conf=conf) construction does.
sc = SparkContext.getOrCreate(conf=conf)

In [4]:
from pyspark.sql import SparkSession

# Obtain the session through the class-level builder instead of first
# constructing a throwaway SparkSession(sc) just to reach `.builder`.
# getOrCreate() reuses the SparkContext that is already running.
spark = SparkSession.builder.appName('app').getOrCreate()

In [5]:
# Read the College CSV; the first line is used as the header and column
# types are inferred from the data.
data = spark.read.csv(
    "./Resources/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Tree_Methods/College.csv",
    header=True,
    inferSchema=True,
)

In [6]:
# Print the first five rows, each followed by two blank lines.
for row in data.head(5):
    print(row, end='\n\n\n')

Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)


Row(School='Adelphi University', Private='Yes', Apps=2186, Accept=1924, Enroll=512, Top10perc=16, Top25perc=29, F_Undergrad=2683, P_Undergrad=1227, Outstate=12280, Room_Board=6450, Books=750, Personal=1500, PhD=29, Terminal=30, S_F_Ratio=12.2, perc_alumni=16, Expend=10527, Grad_Rate=56)


Row(School='Adrian College', Private='Yes', Apps=1428, Accept=1097, Enroll=336, Top10perc=22, Top25perc=50, F_Undergrad=1036, P_Undergrad=99, Outstate=11250, Room_Board=3750, Books=400, Personal=1165, PhD=53, Terminal=66, S_F_Ratio=12.9, perc_alumni=30, Expend=8735, Grad_Rate=54)


Row(School='Agnes Scott College', Private='Yes', Apps=417, Accept=349, Enroll=137, Top10perc=60, Top25perc=89, F_Undergrad=510, P_

In [7]:
# List the column names of the loaded DataFrame.
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [8]:
from pyspark.ml.feature import StringIndexer

# Encode the string column 'Private' as a numeric label column.
# NOTE: the output name 'PirvateIndex' is a misspelling of 'PrivateIndex';
# it is kept as-is because the later cells refer to the column by this name.
indexer = StringIndexer(inputCol='Private', outputCol='PirvateIndex')

# Drop any label column left over from a previous run of this cell first
# (DataFrame.drop is a no-op when the column is absent), so the cell can be
# re-executed without the "output column already exists" failure.
data = data.drop('PirvateIndex')
data = indexer.fit(data).transform(data)

data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- PirvateIndex: double (nullable = false)



In [10]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier

In [11]:
# Numeric columns that are packed into the single 'features' vector column.
feature_columns = [
    'Apps', 'Accept', 'Enroll',
    'Top10perc', 'Top25perc',
    'F_Undergrad', 'P_Undergrad',
    'Outstate', 'Room_Board', 'Books', 'Personal',
    'PhD', 'Terminal',
    'S_F_Ratio', 'perc_alumni',
    'Expend', 'Grad_Rate',
]

assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [13]:
# Add the 'features' vector column. Any 'features' column left over from a
# previous run of this cell is dropped first (drop is a no-op when the column
# is absent), so the cell can be re-executed without an
# "output column already exists" error.
data = assembler.transform(data.drop('features'))

In [14]:
# Print the schema again to confirm the 'features' vector column was added.
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- PirvateIndex: double (nullable = false)
 |-- features: vector (nullable = true)



In [16]:
# Keep only the model inputs: the assembled feature vector and the label.
final_data = data.select('features', 'PirvateIndex')

In [19]:
# Show how many rows fall into each label value (class balance check).
label_counts = final_data.groupBy('PirvateIndex').count()
label_counts.show()

+------------+-----+
|PirvateIndex|count|
+------------+-----+
|         0.0|  565|
|         1.0|  212|
+------------+-----+



In [20]:
# 70/30 train/test split. A fixed seed is passed so that the split (and
# therefore the metric reported further down) can be reproduced on re-run;
# without it every execution samples a different partition of the rows.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

In [21]:
# Random forest on the default 'features' column with 'PirvateIndex' as the
# label. The seed fixes the forest's internal randomness so that repeated
# fits on the same training data give the same model.
rfc = RandomForestClassifier(labelCol='PirvateIndex', seed=42)

In [22]:
# Fit the random forest on the training split.
rfc_model = rfc.fit(train)

In [23]:
# Score the held-out split; the result gains the rawPrediction, probability
# and prediction columns shown by the next cell.
test_res = rfc_model.transform(test)

In [24]:
# Preview the scored test rows.
test_res.show()

+--------------------+------------+--------------------+--------------------+----------+
|            features|PirvateIndex|       rawPrediction|         probability|prediction|
+--------------------+------------+--------------------+--------------------+----------+
|[100.0,90.0,35.0,...|         0.0|[18.8449560740786...|[0.94224780370393...|       0.0|
|[141.0,118.0,55.0...|         0.0|[19.7561171323771...|[0.98780585661885...|       0.0|
|[174.0,146.0,88.0...|         0.0|[18.5932368172890...|[0.92966184086445...|       0.0|
|[193.0,146.0,55.0...|         0.0|[17.7220485538982...|[0.88610242769491...|       0.0|
|[202.0,184.0,122....|         0.0|[19.8082425306382...|[0.99041212653191...|       0.0|
|[232.0,182.0,99.0...|         0.0|[15.5476224408377...|[0.77738112204188...|       0.0|
|[232.0,216.0,106....|         0.0|[16.0403567184814...|[0.80201783592407...|       0.0|
|[244.0,198.0,82.0...|         0.0|[18.5883045672113...|[0.92941522836056...|       0.0|
|[247.0,189.0,100....

In [25]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [36]:
# Evaluator for the binary label. The raw-prediction column and the metric
# are spelled out with the evaluator's default values so that it is clear
# the reported number is the area under the ROC curve.
bin_eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='PirvateIndex',
    metricName='areaUnderROC',
)

In [37]:
# Compute the evaluator's metric on the scored test rows.
bin_eval.evaluate(test_res)

0.9724501108647449