In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd

In [2]:
# Read the raw CSV with pandas. header=None means no row is consumed as a
# header, so the columns receive positional integer labels.
vehicle_data = pd.read_csv(
    filepath_or_buffer="data/vehicle_stolen_dataset.csv",
    header=None,
)

In [3]:
# Get (or create) the SparkSession, requesting the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)

In [4]:
# Convert the pandas DataFrame into a Spark DataFrame.
vehicle_df = spark.createDataFrame(data=vehicle_data)

In [5]:
# Preview the first five rows.
vehicle_df.show(n=5)

+----+------+-----+-----+---+
|   0|     1|    2|    3|  4|
+----+------+-----+-----+---+
|N001|   BMW|black|night|yes|
|N002|  Audi|black|night| no|
|N003|NISSAN|black|night|yes|
|N004|  VEGA|  red|  day|yes|
|N005|   BMW| blue|  day| no|
+----+------+-----+-----+---+


In [6]:
# Give the five positional columns meaningful names.
#
# toDF renames by position rather than by the current column name, so this
# cell can be re-run safely even though it rebinds `vehicle_df`. Selecting
# col("0") .. col("4") would fail on a second run because those names no
# longer exist after the first rename.
#
# NOTE: toDF requires exactly as many names as the DataFrame has columns
# (five here, matching the preview shown above).
vehicle_df = vehicle_df.toDF(
    "number_plate",
    "brand",
    "color",
    "time",
    "stolen",
)

In [7]:
vehicle_df.show(5)

+------------+------+-----+-----+------+
|number_plate| brand|color| time|stolen|
+------------+------+-----+-----+------+
|        N001|   BMW|black|night|   yes|
|        N002|  Audi|black|night|    no|
|        N003|NISSAN|black|night|   yes|
|        N004|  VEGA|  red|  day|   yes|
|        N005|   BMW| blue|  day|    no|
+------------+------+-----+-----+------+


In [8]:
# One StringIndexer per categorical column: each maps the string values of
# its input column to numeric indices in the given output column. The
# "stolen" column is indexed into "label".
indexers = [
    StringIndexer(inputCol=source_col, outputCol=indexed_col)
    for source_col, indexed_col in (
        ("brand", "brand_index"),
        ("color", "color_index"),
        ("time", "time_index"),
        ("stolen", "label"),
    )
]

In [9]:
# Chain the indexers into a single Pipeline so they are fitted and applied together.
pipeline = Pipeline(
    stages=indexers,
)

In [10]:
# Fit the indexing pipeline on the full dataset, then apply the fitted model
# to that same dataset to add the *_index and label columns.
indexer_model = pipeline.fit(vehicle_df)
indexed_vehicle_df = indexer_model.transform(vehicle_df)

In [11]:
# Preview five rows without truncating cell contents.
indexed_vehicle_df.show(n=5, truncate=False)

+------------+------+-----+-----+------+-----------+-----------+----------+-----+
|number_plate|brand |color|time |stolen|brand_index|color_index|time_index|label|
+------------+------+-----+-----+------+-----------+-----------+----------+-----+
|N001        |BMW   |black|night|yes   |1.0        |0.0        |1.0       |0.0  |
|N002        |Audi  |black|night|no    |0.0        |0.0        |1.0       |1.0  |
|N003        |NISSAN|black|night|yes   |2.0        |0.0        |1.0       |0.0  |
|N004        |VEGA  |red  |day  |yes   |3.0        |1.0        |0.0       |0.0  |
|N005        |BMW   |blue |day  |no    |1.0        |2.0        |0.0       |1.0  |
+------------+------+-----+-----+------+-----------+-----------+----------+-----+


In [12]:
# Pack the three indexed feature columns into a single vector column named
# "features".
vectorAssembler = VectorAssembler(
    inputCols=["brand_index", "color_index", "time_index"],
    outputCol="features",
)

# Append the assembled "features" column and preview the result untruncated.
vindexed_vehicle_df = vectorAssembler.transform(indexed_vehicle_df)
vindexed_vehicle_df.show(n=5, truncate=False)

+------------+------+-----+-----+------+-----------+-----------+----------+-----+-------------+
|number_plate|brand |color|time |stolen|brand_index|color_index|time_index|label|features     |
+------------+------+-----+-----+------+-----------+-----------+----------+-----+-------------+
|N001        |BMW   |black|night|yes   |1.0        |0.0        |1.0       |0.0  |[1.0,0.0,1.0]|
|N002        |Audi  |black|night|no    |0.0        |0.0        |1.0       |1.0  |[0.0,0.0,1.0]|
|N003        |NISSAN|black|night|yes   |2.0        |0.0        |1.0       |0.0  |[2.0,0.0,1.0]|
|N004        |VEGA  |red  |day  |yes   |3.0        |1.0        |0.0       |0.0  |[3.0,1.0,0.0]|
|N005        |BMW   |blue |day  |no    |1.0        |2.0        |0.0       |1.0  |[1.0,2.0,0.0]|
+------------+------+-----+-----+------+-----------+-----------+----------+-----+-------------+


In [13]:
# Randomly split the rows into two DataFrames with weights 0.6 / 0.4, using
# a fixed seed (42).
splits = vindexed_vehicle_df.randomSplit(weights=[0.6, 0.4], seed=42)

In [14]:
# First split (weight 0.6) is the training set; second (weight 0.4) is the test set.
train_df, test_df = splits[0], splits[1]

In [15]:
# Multinomial Naive Bayes classifier reading the "features" vector and the
# "label" column (these are the estimator's default column names, spelled
# out here for clarity).
# NOTE(review): the features are StringIndexer category codes, while the
# multinomial model is usually described in terms of count-like feature
# values. Confirm this encoding is intended, or consider one-hot encoding.
nb = NaiveBayes(
    featuresCol="features",
    labelCol="label",
    modelType="multinomial",
)

# Train on the training split only.
nbmodel = nb.fit(train_df)

In [16]:
# Score the held-out test split with the fitted model.
predictions_df = nbmodel.transform(dataset=test_df)

In [17]:
# Preview the first five scored rows.
predictions_df.show(n=5)

+------------+------+-----+-----+------+-----------+-----------+----------+-----+-------------+--------------------+--------------------+----------+
|number_plate| brand|color| time|stolen|brand_index|color_index|time_index|label|     features|       rawPrediction|         probability|prediction|
+------------+------+-----+-----+------+-----------+-----------+----------+-----+-------------+--------------------+--------------------+----------+
|        N001|   BMW|black|night|   yes|        1.0|        0.0|       1.0|  0.0|[1.0,0.0,1.0]|[-2.8415815937267...|[0.70850202429149...|       0.0|
|        N003|NISSAN|black|night|   yes|        2.0|        0.0|       1.0|  0.0|[2.0,0.0,1.0]|[-3.5347287742866...|[0.85868498527968...|       0.0|
|        N005|   BMW| blue|  day|    no|        1.0|        2.0|       0.0|  1.0|[1.0,2.0,0.0]|[-3.2470467018348...|[0.80201649862511...|       0.0|
|        N007|  VEGA|  red|night|    no|        3.0|        1.0|       1.0|  1.0|[3.0,1.0,1.0]|[-5.3264882