In [1]:
import sys
import os

def configure_spark(spark_home=None, pyspark_python=None):
    spark_home = spark_home or "/path/to/default/spark/home"
    os.environ['SPARK_HOME'] = spark_home

    # Add the PySpark directories to the Python path:
    sys.path.insert(1, os.path.join(spark_home, 'python'))
    sys.path.insert(1, os.path.join(spark_home, 'python', 'pyspark'))
    sys.path.insert(1, os.path.join(spark_home, 'python', 'build'))

    # If PySpark isn't specified, use currently running Python binary:
    pyspark_python = pyspark_python or sys.executable
    os.environ['PYSPARK_PYTHON'] = pyspark_python
    
configure_spark('/usr/local/spark', '/home/ubuntu/anaconda3/envs/dat500/bin/python')

In [2]:
import findspark
findspark.init()
import pyspark
from pyspark import SQLContext
from pyspark import SparkContext

SparkContext.setSystemProperty('spark.cleaner.periodicGC.interval', '2')
SparkContext.setSystemProperty('spark.executor.memory', '2400m')
SparkContext.setSystemProperty('spark.driver.cores', '2')
SparkContext.setSystemProperty('spark.driver.memory', '2g')
SparkContext.setSystemProperty("spark.driver.maxResultSize", "2g")

sc = pyspark.SparkContext(master='spark://192.168.11.239:7077', appName='type_predicter')
sqlContext = SQLContext(sc)

In [3]:
from pyspark.sql.types import *
from datetime import datetime
import pyspark.sql.functions as F #avoid conflicts with regular python functions
from pyspark.sql.functions import udf
from pyspark.ml import Pipeline
from pyspark.ml.feature import PCA, StandardScaler
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler 
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import numpy as np
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd
import time
from pyspark.ml.linalg import Vectors, VectorUDT

In [4]:
df = sqlContext.read.csv("/datasets/crimes_cleaned_engineered.csv", header='true')
df.count()

492842

In [5]:
df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Day: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- Hour: string (nullable = true)
 |-- DayOfYear: string (nullable = true)
 |-- DayOfWeek: string (nullable = true)
 |-- District: string (nullable = true)
 |-- y: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- CountHour: string (nullable = true)
 |-- ProbabilityVector: string (nullable = true)



## Convert date types
We can see that Hour, DayOfWeek and Month is labeles as Strings. We want to convert them to integers. The ProbabilityVector is also a string, so we will remove the endings of it by a UDF, splitting and casting it to an vector

In [6]:
df.select("ProbabilityVector").take(1)

[Row(ProbabilityVector='[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]')]

In [7]:
vectorize = udf(lambda l: Vectors.dense(l), VectorUDT())

In [8]:
@udf(StringType())
def renamer(label):
    return label.replace("[","").replace("]","")

In [9]:
df = (df.withColumn('_ProbabilityVector', renamer(F.col('ProbabilityVector')))\
    .drop("ProbabilityVector")\
    .withColumnRenamed("_ProbabilityVector","ProbabilityVector"))

In [10]:
df = (df\
    .withColumn("castedHour", df["Hour"].cast(IntegerType()))\
    .withColumn("castedMonth", df["Month"].cast(IntegerType()))\
    .withColumn("castedDayOfWeek", df["DayOfWeek"].cast(IntegerType()))\
    .withColumn("castedProbabilityVector", F.split(F.col("ProbabilityVector"), ",")\
                .cast(ArrayType(FloatType())))\
    .drop("Hour", "Month", "DayOfWeek","ProbabilityVector")\
    .withColumnRenamed("castedHour","Hour")\
    .withColumnRenamed("castedMonth","Month")\
    .withColumnRenamed("castedDayOfWeek","DayOfWeek")\
    .withColumnRenamed("castedProbabilityVector","ProbabilityVector"))

In [11]:
df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Day: string (nullable = true)
 |-- DayOfYear: string (nullable = true)
 |-- District: string (nullable = true)
 |-- y: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- CountHour: string (nullable = true)
 |-- Hour: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- ProbabilityVector: array (nullable = true)
 |    |-- element: float (containsNull = true)



Cast the array to a dense vector using vectorize

In [12]:
df = (df\
       .withColumn("castedProbabilityVector", vectorize(df["ProbabilityVector"]))\
       .drop("ProbabilityVector")\
       .withColumnRenamed("castedProbabilityVector","ProbabilityVector"))

In [13]:
df.select("ProbabilityVector").show(5)

+--------------------+
|   ProbabilityVector|
+--------------------+
|[0.0,0.0,0.0,0.0,...|
|[0.0,0.0,0.0,0.0,...|
|[0.0,0.0,0.0,0.0,...|
|[0.0,0.0,0.0,0.0,...|
|[0.0,0.0,0.0,0.0,...|
+--------------------+
only showing top 5 rows



## Pipelines

In [14]:
categorical_cols = ["District"]

indexers = [ StringIndexer(inputCol=cat_col, outputCol="{}_idx".format(cat_col),
                           handleInvalid = 'skip') for cat_col in categorical_cols] 

target_indexer = [ StringIndexer(inputCol = 'y', outputCol = 'target', handleInvalid = 'skip')]



encoders = [OneHotEncoder(dropLast=True,inputCol=idx.getOutputCol(), 
    outputCol="{}_catVec".format(idx.getOutputCol())) for idx in indexers]


fc = ["Hour","Month","DayOfWeek", "ProbabilityVector"] + [enc.getOutputCol() for enc in encoders]


assembler = VectorAssembler(inputCols= fc , outputCol="Features")

standard_scaler = StandardScaler(inputCol="Features", outputCol="scaledFeatures")

## Example Model
Lets create an example model with the base transformers, and see how it performed

In [168]:
from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel

In [16]:
pipeline = Pipeline(stages = indexers + encoders + target_indexer + [assembler])
pipeline_model = pipeline.fit(df)
pipeline_df = pipeline_model.transform(df)
train = pipeline_df.filter(F.col("Year") < 2015)
test = pipeline_df.filter(F.col("Year") >= 2016)

In [17]:
train.count()

400184

The feature vector we pass as input has 76 dimensions, where 52 of them are feature engineered

In [18]:
train.select("Features").take(1)

[Row(Features=SparseVector(76, {0: 16.0, 1: 6.0, 2: 6.0, 70: 1.0}))]

Lets create a model and fit on training data. Then we will use the trained model to predict testing data

In [19]:
dt = DecisionTreeClassifier(featuresCol = "Features",
                             labelCol = 'target',
                             maxDepth = 10,
                             impurity='gini')

dtModel = dt.fit(train)
predictions = dtModel.transform(test)

We can see some examples of how it predicted the test data

In [20]:
predictions.select("Features",'y','target', 'prediction', 'probability').show()

+--------------------+-------+------+----------+--------------------+
|            Features|      y|target|prediction|         probability|
+--------------------+-------+------+----------+--------------------+
|(76,[0,1,2,43,70]...|ASSAULT|   4.0|       4.0|[3.64906427566074...|
|(76,[1,2,43,64],[...|ASSAULT|   4.0|       4.0|[3.64906427566074...|
|(76,[0,1,2,43,64]...|ASSAULT|   4.0|       4.0|[3.64906427566074...|
|(76,[0,1,2,43,64]...|ASSAULT|   4.0|       4.0|[3.64906427566074...|
|(76,[0,1,2,4,30,4...|ASSAULT|   4.0|       4.0|[3.64906427566074...|
|(76,[0,1,2,17,30,...|ASSAULT|   4.0|       4.0|[3.64906427566074...|
|(76,[0,1,2,43,46,...|ASSAULT|   4.0|       0.0|[0.48793407886992...|
|(76,[0,1,2,43,67]...|ASSAULT|   4.0|       4.0|[3.64906427566074...|
|(76,[0,1,2,43,67]...|ASSAULT|   4.0|       4.0|[3.64906427566074...|
|(76,[0,1,2,43,55]...|ASSAULT|   4.0|       4.0|[3.64906427566074...|
|(76,[0,1,2,30,43,...|ASSAULT|   4.0|       2.0|[0.0,0.0,0.494071...|
|(76,[0,1,2,43,55]..

With imbalanced data, labels with large share often gets too much attention. We will join the predicted count with the original count of the testing data in order to see whether it forgets smaller labels

In [21]:
pred_dist = predictions.groupBy("prediction").count()
org_dist = test.groupBy("target").count()
pred_dist.join(org_dist,pred_dist.prediction == org_dist.target,how="right").show()

+----------+-----+------+-----+
|prediction|count|target|count|
+----------+-----+------+-----+
|       8.0| 2958|   8.0| 2982|
|       0.0|18426|   0.0|17713|
|       7.0| 2693|   7.0| 2949|
|       1.0|14560|   1.0|13651|
|       4.0| 5279|   4.0| 5562|
|      null| null|  11.0| 1867|
|       3.0| 4040|   3.0| 3677|
|       2.0| 7833|   2.0| 7979|
|      10.0| 2609|  10.0| 2837|
|       6.0| 3187|   6.0| 3439|
|       5.0| 4532|   5.0| 4847|
|       9.0| 8164|   9.0| 5385|
|      null| null|  12.0| 1393|
+----------+-----+------+-----+



Finally, we can run an evaluator that computes the accuracy of the model

In [22]:
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",labelCol="target", metricName="accuracy")

evaluator.evaluate(predictions)

0.9079037708162249

MLlibs lack support for good evaluation methods, so we can collect the results and perform classification report with
sklearn if we want

In [23]:
from sklearn.metrics import classification_report

In [24]:
desc_types = np.array(predictions.groupBy("y").count().sort(F.col("count").desc()).select("y").collect())

In [25]:
desc_types = [x.lower() for x in desc_types.flatten()]

In [26]:
y_pred = np.array(predictions.select('prediction').collect())
y_true = np.array(predictions.select('target').collect())

In [27]:
print(classification_report(y_true, y_pred, labels = np.arange(0,len(desc_types)), target_names = desc_types))

                     precision    recall  f1-score   support

              theft       0.94      0.98      0.96     17713
            battery       0.93      0.99      0.96     13651
    criminal damage       0.96      0.94      0.95      7979
            assault       0.90      0.99      0.94      3677
 deceptive practice       0.97      0.92      0.94      5562
      other offense       0.98      0.92      0.95      4847
          narcotics       0.97      0.90      0.93      3439
           burglary       0.98      0.90      0.94      2949
       other crimes       0.94      0.93      0.94      2982
motor vehicle theft       0.59      0.90      0.72      5385
            robbery       0.98      0.90      0.94      2837
  criminal trespass       0.00      0.00      0.00      1867
  weapons violation       0.00      0.00      0.00      1393

           accuracy                           0.91     74281
          macro avg       0.78      0.79      0.78     74281
       weighted avg   

  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
probabilities = [row['probability'] for row in predictions.collect()]
trues = [row['target'] for row in predictions.collect()]
correct_count = 0
for probs, truth in zip(probabilities, trues):
    top_3 = np.argsort(probs)[::-1][:3]
    if truth in top_3:
        correct_count +=1
correct_count/len(trues)

0.9855413901266811

## Parameter tuning
We want to find the best Decision Tree model for our data. We can define a set of parameters to search and evaluate with.

In [98]:
def top_3_err(_probs, _trues):
    correct_count = 0
    for _p, _t in zip(_probs, _trues):
        top_3 = np.argsort(_p).flatten()[::-1][:3]
        if _t in top_3:
            correct_count +=1
    return round((1 - (correct_count/len(_trues))) ,5)

In [46]:
max_depths = [5,10,20,25,30]
impurities = ["gini","entropy"]

#### Base pipelines

In [47]:
pipeline = Pipeline(stages = indexers + encoders + target_indexer + [assembler])
pipeline_model = pipeline.fit(df)
pipeline_df = pipeline_model.transform(df)
train = pipeline_df.filter(F.col("Year") < 2015)
test = pipeline_df.filter(F.col("Year") >= 2016)
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",labelCol="target", metricName="accuracy")

In [101]:
logger = {}
logger["base"] = []
for depth in max_depths:
    start = time.time()
    dt = DecisionTreeClassifier(featuresCol = "Features",
                         labelCol = 'target',
                         maxDepth = depth,
                         impurity= 'gini')

    dtModel = dt.fit(train)
    predictions = dtModel.transform(test)


    execution_time = round((time.time() - start),1)
    test_error = round(1 - (evaluator.evaluate(predictions)),4)

    y_true = np.array(predictions.select('target').collect())
    probs =  np.array(predictions.select('probability').collect())
    test_top_3_error = top_3_err(probs,y_true)


    logger["base"].append({
        "depth": depth,
        "imputiry": 'gini',
        "execution_time": execution_time,
        "test_error": test_error,
        "top3_test_error": test_top_3_error
    })

In [103]:
logger

{'base': [{'depth': 5,
   'imputiry': 'gini',
   'execution_time': 28.7,
   'test_error': 0.3037,
   'top3_test_error': 0.20305},
  {'depth': 10,
   'imputiry': 'gini',
   'execution_time': 30.3,
   'test_error': 0.0921,
   'top3_test_error': 0.01446},
  {'depth': 20,
   'imputiry': 'gini',
   'execution_time': 40.2,
   'test_error': 0.0587,
   'top3_test_error': 0.00905},
  {'depth': 25,
   'imputiry': 'gini',
   'execution_time': 57.2,
   'test_error': 0.0589,
   'top3_test_error': 0.00895},
  {'depth': 30,
   'imputiry': 'gini',
   'execution_time': 88.1,
   'test_error': 0.06,
   'top3_test_error': 0.00858}]}

We can see that deeper trees performs best, however 30 depth is less accurate than 20

#### PCA pipelines
This section will perform tests with different dimensions on different depths

In [105]:
max_depths = [10,25,30]
pcas = [5,10,15,25]
logger["pca"] = []

for k in pcas:
    pca = PCA(k = k, inputCol = "scaledFeatures", outputCol="PCA_Features")
    pipeline = Pipeline(stages = indexers + encoders + target_indexer + [assembler] + [standard_scaler] + [pca])
    pipeline_model = pipeline.fit(df)
    pipeline_df = pipeline_model.transform(df)
    train = pipeline_df.filter(F.col("Year") < 2015)
    test = pipeline_df.filter(F.col("Year") >= 2016)
    for depth in max_depths:
        start = time.time()
        
        dt = DecisionTreeClassifier(featuresCol = "PCA_Features",
                             labelCol = 'target',
                             maxDepth = depth,
                             impurity= 'gini')
        
        dtModel = dt.fit(train)
        predictions = dtModel.transform(test)
        execution_time = round((time.time() - start),1)
        test_error = round(1 - (evaluator.evaluate(predictions)),4)
        y_true = np.array(predictions.select('target').collect())
        probs =  np.array(predictions.select('probability').collect())
        test_top_3_error = top_3_err(probs,y_true)
        logger["pca"].append({
            "depth": depth,
            "imputiry": 'gini',
            "execution_time": execution_time,
            "test_error": test_error,
            "top3_test_error": test_top_3_error,
            "dimenstion": k
        })

Store the logged results

In [107]:
import json

In [108]:
with open('logs/base_dt_search.json', 'w+') as f:
    json.dump(logger["base"], f, indent=4)
    
with open('logs/pca_dt_search.json', 'w+') as f:
    json.dump(logger["pca"], f, indent=4)

## Train on best model
Use all available training data and train on the best suited model

In [114]:
top = {}
top_score = 1
for i in logger:
    for v in logger[i]:
        if v["test_error"] < top_score:
            top_score = v["test_error"]
            top = v

In [115]:
top

{'depth': 20,
 'imputiry': 'gini',
 'execution_time': 40.2,
 'test_error': 0.0587,
 'top3_test_error': 0.00905}

In [161]:
pipeline = Pipeline(stages = indexers + encoders + target_indexer + [assembler])
pipeline_model = pipeline.fit(df)
pipeline_df = pipeline_model.transform(df)
dt = DecisionTreeClassifier(featuresCol = "Features",
                     labelCol = 'target',
                     maxDepth = 20,
                     impurity= 'gini')
dtModel = dt.fit(train)

Store the model for later use

In [171]:
path = "/dtModel"
dtModel.write().overwrite().save(path)

How to load and use it for predictions:

In [172]:
sameModel = DecisionTreeClassificationModel.load(path)

In [173]:
preds = sameModel.transform(test)

In [174]:
preds.select("prediction","target").show()

+----------+------+
|prediction|target|
+----------+------+
|       4.0|   4.0|
|       4.0|   4.0|
|       4.0|   4.0|
|       4.0|   4.0|
|       4.0|   4.0|
|       4.0|   4.0|
|       0.0|   4.0|
|       4.0|   4.0|
|       4.0|   4.0|
|       4.0|   4.0|
|       2.0|   4.0|
|       4.0|   4.0|
|       4.0|   4.0|
|       4.0|   4.0|
|       4.0|   4.0|
|       4.0|   4.0|
|       1.0|   4.0|
|       4.0|   4.0|
|       4.0|   4.0|
|       4.0|   4.0|
+----------+------+
only showing top 20 rows

