In [1]:
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('tree_methods_adv').getOrCreate()

In [2]:
data = spark.read.csv('crime.csv',inferSchema=True,header=True)

In [3]:
data.columns

['INCIDENT_NUMBER',
 'OFFENSE_CODE',
 'OFFENSE_CODE_GROUP',
 'OFFENSE_DESCRIPTION',
 'DISTRICT',
 'REPORTING_AREA',
 'YEAR',
 'MONTH',
 'DAY_OF_WEEK',
 'HOUR',
 'UCR_PART',
 'STREET']

In [4]:
my_cols = data.select(['OFFENSE_CODE_GROUP' ,
             'DISTRICT',
             'MONTH',
             'DAY_OF_WEEK',
             'HOUR'])

In [5]:
my_final_data = my_cols.na.drop()

In [6]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)

In [7]:
OFFENSE_indexer = StringIndexer(inputCol='OFFENSE_CODE_GROUP',outputCol='OFFENSEIndex')
OFFENSE_encoder = OneHotEncoder(inputCol='OFFENSEIndex',outputCol='OFFENSEVec')

In [8]:
WEEK_indexer = StringIndexer(inputCol='DAY_OF_WEEK',outputCol='WEEKIndex')
WEEK_encoder = OneHotEncoder(inputCol='WEEKIndex',outputCol='WEEKVec')

In [9]:
DIS_indexer = StringIndexer(inputCol='DISTRICT',outputCol='DISIndex')
DIS_encoder = OneHotEncoder(inputCol='DISIndex',outputCol='DISVec')

In [10]:
assembler = VectorAssembler(inputCols=['OFFENSEVec',
 'WEEKVec',
 'DISVec',
 'HOUR',
 'MONTH'],outputCol='features')

In [11]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier

In [12]:
rfc = RandomForestClassifier(labelCol='HOUR',featuresCol='features')

In [13]:
pipeline = Pipeline(stages=[OFFENSE_indexer,WEEK_indexer,DIS_indexer,
                           OFFENSE_encoder,WEEK_encoder,DIS_encoder,
                           assembler,rfc])

In [14]:
train_data, test_data = my_final_data.randomSplit([0.7,0.3])

In [15]:
fit_model = pipeline.fit(train_data)

In [16]:
rfc_predictions = fit_model.transform(test_data)

In [17]:
rfc_predictions.show()

+------------------+--------+-----+-----------+----+------------+---------+--------+---------------+-------------+--------------+--------------------+--------------------+--------------------+----------+
|OFFENSE_CODE_GROUP|DISTRICT|MONTH|DAY_OF_WEEK|HOUR|OFFENSEIndex|WEEKIndex|DISIndex|     OFFENSEVec|      WEEKVec|        DISVec|            features|       rawPrediction|         probability|prediction|
+------------------+--------+-----+-----------+----+------------+---------+--------+---------------+-------------+--------------+--------------------+--------------------+--------------------+----------+
|Aggravated Assault|      A1|    1|     Friday|   9|        13.0|      0.0|     3.0|(63,[13],[1.0])|(6,[0],[1.0])|(11,[3],[1.0])|(82,[13,63,72,80,...|[1.19983643231961...|[0.05999182161598...|      12.0|
|Aggravated Assault|      A1|    1|   Saturday|   0|        13.0|      4.0|     3.0|(63,[13],[1.0])|(6,[4],[1.0])|(11,[3],[1.0])|(82,[13,67,72,81]...|[3.25748377744324...|[0.1628741888

In [18]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [19]:
acc_evaluator = MulticlassClassificationEvaluator(labelCol="HOUR", predictionCol="prediction", metricName="accuracy")

In [20]:
rfc_acc = acc_evaluator.evaluate(rfc_predictions)

In [21]:
print('A random forest ensemble has an accuracy of: {0:2.2f}%'.format(rfc_acc*100))

A random forest ensemble has an accuracy of: 36.30%
