In [1]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import * 
from pyspark.ml import Pipeline 
from pyspark.ml.feature import VectorAssembler 
from pyspark.ml.feature import StringIndexer 
from pyspark.ml.classification import NaiveBayes 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Start (or reuse) a Spark session running in local mode with master "local[1]".
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("https://mfu.ac.th/")
    .getOrCreate()
)

# Load the stolen-vehicle CSV; the "header" option makes the first row the column names.
csv_reader = spark.read.option("header", True)
vehicle_df = csv_reader.csv("vehicle_stolen_dataset_New.csv")

# Preview the first five rows.
vehicle_df.show(5)

25/02/19 13:55:20 WARN Utils: Your hostname, Jennie-Kims-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 172.27.15.180 instead (on interface en0)
25/02/19 13:55:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/19 13:55:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/02/19 13:55:21 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/02/19 13:55:21 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


+------------+------+-----+-----+------+
|number_plate| brand|color| time|stoled|
+------------+------+-----+-----+------+
|        N001|   BMW|black|night|   yes|
|        N002|  Audi|black|night|    no|
|        N003|NISSAN|black|night|   yes|
|        N004|  VEGA|  red|  day|   yes|
|        N005|   BMW| blue|  day|    no|
+------------+------+-----+-----+------+
only showing top 5 rows



In [2]:
# Encode every categorical string column as a numeric index column.
# Each (source, target) pair below gets its own StringIndexer; the target
# column "stoled" is indexed into "label".
column_to_index = [
    ("brand", "brand_index"),
    ("color", "color_index"),
    ("time", "time_index"),
    ("stoled", "label"),
]
indexers = [
    StringIndexer(inputCol=source_col, outputCol=index_col)
    for source_col, index_col in column_to_index
]
pipeline = Pipeline(stages=indexers)

# Fit the indexers on the input dataset, then apply them to add the index columns.
indexer_model = pipeline.fit(vehicle_df)
indexed_vehicle_df = indexer_model.transform(vehicle_df)

# VectorAssembler is a transformer that combines a given list of columns
# into a single vector column; here the three index columns become "features".
feature_columns = ["brand_index", "color_index", "time_index"]
vectorAssembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
vindexed_vehicle_df = vectorAssembler.transform(indexed_vehicle_df)

# Show the first five rows without truncating cell contents.
vindexed_vehicle_df.show(5, False)

+------------+------+-----+-----+------+-----------+-----------+----------+-----+-------------+
|number_plate|brand |color|time |stoled|brand_index|color_index|time_index|label|features     |
+------------+------+-----+-----+------+-----------+-----------+----------+-----+-------------+
|N001        |BMW   |black|night|yes   |1.0        |0.0        |1.0       |0.0  |[1.0,0.0,1.0]|
|N002        |Audi  |black|night|no    |0.0        |0.0        |1.0       |1.0  |[0.0,0.0,1.0]|
|N003        |NISSAN|black|night|yes   |2.0        |0.0        |1.0       |0.0  |[2.0,0.0,1.0]|
|N004        |VEGA  |red  |day  |yes   |3.0        |1.0        |0.0       |0.0  |[3.0,1.0,0.0]|
|N005        |BMW   |blue |day  |no    |1.0        |2.0        |0.0       |1.0  |[1.0,2.0,0.0]|
+------------+------+-----+-----+------+-----------+-----------+----------+-----+-------------+
only showing top 5 rows



In [3]:
# Randomly split the assembled data into training and test sets using
# weights 0.6 / 0.4. The seed (42) fixes the random sampling.
splits = vindexed_vehicle_df.randomSplit(weights=[0.6, 0.4], seed=42)
train_df, test_df = splits

In [4]:
# Configure the Naïve Bayes classifier (multinomial variant), reading the
# "features" vector column and the "label" column.
# NOTE(review): the features are StringIndexer category codes; multinomial
# Naive Bayes is usually described as modelling count-like features, so a
# one-hot encoding may suit this data better — confirm before relying on it.
nb = NaiveBayes(
    featuresCol="features",
    labelCol="label",
    modelType="multinomial",
)

In [5]:
# Fit the Naive Bayes estimator on the training split to obtain the model.
nbmodel = nb.fit(dataset=train_df)

In [6]:
# Score the held-out test split with the trained model, then preview the
# first five result rows (long cell values are truncated in the display).
predictions_df = nbmodel.transform(dataset=test_df)
predictions_df.show(n=5, truncate=True)

+------------+------+-----+-----+------+-----------+-----------+----------+-----+-------------+--------------------+--------------------+----------+
|number_plate| brand|color| time|stoled|brand_index|color_index|time_index|label|     features|       rawPrediction|         probability|prediction|
+------------+------+-----+-----+------+-----------+-----------+----------+-----+-------------+--------------------+--------------------+----------+
|        N001|   BMW|black|night|   yes|        1.0|        0.0|       1.0|  0.0|[1.0,0.0,1.0]|[-2.8415815937267...|[0.70850202429149...|       0.0|
|        N003|NISSAN|black|night|   yes|        2.0|        0.0|       1.0|  0.0|[2.0,0.0,1.0]|[-3.5347287742866...|[0.85868498527968...|       0.0|
|        N005|   BMW| blue|  day|    no|        1.0|        2.0|       0.0|  1.0|[1.0,2.0,0.0]|[-3.2470467018348...|[0.80201649862511...|       0.0|
|        N007|  VEGA|  red|night|    no|        3.0|        1.0|       1.0|  1.0|[3.0,1.0,1.0]|[-5.3264882

25/02/19 13:55:37 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [7]:
# Evaluate the predictions on the test set with the "accuracy" metric,
# comparing the "prediction" column against the "label" column.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
nbaccuracy = evaluator.evaluate(predictions_df)

# Report the accuracy as a percentage.
print(f"Test accuracy = {nbaccuracy * 100}%")

Test accuracy = 50.0%
