# Rules
When generating prior probabilites for future dates (1 month), we will

1. Avg all time, that time of the week, that hour (with all available training data)
2. Avg past 365-(future days) days, that day of the week, (+- 1 hour)
3. Avg past 60 days-(future days), that day of the week, (+- 1 hour) (with most recent training data)
4. Avg +- 30 days, last year, that day of the week, (+- 1 hour)

Fetch all time matrix for: each district, dayofweek, hour (most recent one). Calculate based on those matrices.
Start by getting a pre calculated past dataframe

In [None]:
import sys
import os

def configure_spark(spark_home=None, pyspark_python=None):
    spark_home = spark_home or "/path/to/default/spark/home"
    os.environ['SPARK_HOME'] = spark_home

    # Add the PySpark directories to the Python path:
    sys.path.insert(1, os.path.join(spark_home, 'python'))
    sys.path.insert(1, os.path.join(spark_home, 'python', 'pyspark'))
    sys.path.insert(1, os.path.join(spark_home, 'python', 'build'))

    # If PySpark isn't specified, use currently running Python binary:
    pyspark_python = pyspark_python or sys.executable
    os.environ['PYSPARK_PYTHON'] = pyspark_python
    
configure_spark('/usr/local/spark', '/home/ubuntu/anaconda3/envs/dat500/bin/python')


import findspark
findspark.init()
import pyspark
from pyspark import SQLContext
from pyspark import SparkContext

SparkContext.setSystemProperty('spark.cleaner.periodicGC.interval', '2')
SparkContext.setSystemProperty('spark.executor.memory', '2400m')
SparkContext.setSystemProperty('spark.driver.cores', '2')
SparkContext.setSystemProperty('spark.driver.memory', '2g')
SparkContext.setSystemProperty("spark.driver.maxResultSize", "2g")

sc = pyspark.SparkContext(master='spark://192.168.11.239:7077', appName='type_predicter')
sqlContext = SQLContext(sc)

from pyspark.sql.types import *
import pyspark.sql.functions as F #avoid conflicts with regular python functions
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler 
import numpy as np
from pyspark.sql.window import Window
from pyspark.ml.linalg import Vectors, MatrixUDT, VectorUDT, DenseMatrix, DenseVector
from util import all_time_avg, avg_past_year, avg_past_month, avg_last_year_months
import time
import math
import pandas as pd

tmp = sqlContext.read.csv("/datasets/crimes_cleaned_engineered_new.csv", header='true')\
.select("Day","District", "Year", "DayOfYear", "Hour", "y", "all_time_mat","DayOfWeek").dropDuplicates()

#We do only need the newest records
join_df = tmp\
.groupBy("District", "y", "DayOfWeek", "Hour")\
.agg(F.max("Day"))\
.select(F.col("District").alias("jDistrict"), 
       F.col("max(Day)").alias("jDay"),
       F.col("Hour").alias("jHour"),
       F.col("y").alias("jy"))

tmp = df.join(join_df,\
              ([join_df.jy == tmp.y,\
                join_df.jDistrict == tmp.District,
                join_df.jDay == tmp.Day,
                join_df.jHour == tmp.Hour]),\
              how='right')\
.select("District", "Year", "DayOfYear", "Hour", "y", "all_time_mat", "DayOfWeek").dropDuplicates()

In [3]:
tmp.count()

42997

In [4]:
tmp.show()

+--------+----+---------+----+------------------+--------------------+---------+
|District|Year|DayOfYear|Hour|                 y|        all_time_mat|DayOfWeek|
+--------+----+---------+----+------------------+--------------------+---------+
|     003|2019|      335|  17|   CRIMINAL DAMAGE|162.0   15.0    2...|        1|
|     024|2007|      185|  13| CRIMINAL TRESPASS|185.0   
4.0     ...|        4|
|     006|2017|      212|  22|           ASSAULT|157.0   5000.0  5...|        2|
|     001|2018|      124|  17|           BATTERY|151.0   5000.0  5...|        6|
|     017|2009|      186|   4| WEAPONS VIOLATION|186.0   
1.0     ...|        1|
|     019|2017|      123|  10|DECEPTIVE PRACTICE|22.0    5000.0  5...|        4|
|     009|2013|      334|   4|          BURGLARY|246.0   5000.0  
...|        7|
|     014|2012|      292|   7|           BATTERY|323.0   5000.0  
...|        5|
|     004|2019|      171|   4|      OTHER CRIMES|171.0   
5.0     ...|        5|
|     014|2018|      352|  1

In [2]:
join_df.unpersist()

DataFrame[ID: string, Day: date, Month: int, Hour: int, DayOfYear: int, DayOfWeek: int, District: string, y: string, Year: int, CountHour: bigint]

In [3]:
tmp.cache()

DataFrame[District: string, Year: int, DayOfYear: int, Hour: int, y: string, all_time_mat: matrix, DayOfWeek: int]

In [4]:
crime_types = tmp.select('y').distinct().rdd.map(lambda r: r[0]).collect()
districts = tmp.select('District').distinct().rdd.map(lambda r: r[0]).collect()

In [5]:
def convert_dayofweek(pandas_time):
    mapper = {
        0: 2,
        1: 3,
        2: 4,
        3: 5,
        4: 6,
        5: 7,
        6: 1
    }
    return mapper[pandas_time]

In [6]:
#Pandas datetime dayofweek:  monday (0) - sunday (6)
#Pyspark datetime dayofweek:  sunday (1) - monday (7)
dates = pd.date_range(start='1/1/2020', end='31/01/2020', freq='h')
cols = ["Date","Hour","DayOfYear","DayOfWeek", "District","Year","Month"]
df_list = []
for dist in districts:
    dist_list = []
    doy_list = []
    dow_list = []
    hr_list = []
    yr_list = []
    month_list = []
    for d in dates:
        dist_list.append(dist)
        doy_list.append(d.dayofyear)
        dow_list.append(convert_dayofweek(d.dayofweek))
        hr_list.append(d.hour)
        yr_list.append(2020)
        month_list.append(1)
    
    df_tmp = pd.DataFrame(data = np.array([dates, hr_list, doy_list, dow_list, dist_list, yr_list, month_list]).T,\
                          columns=cols)
    df_list.append(df_tmp)

future_df = pd.concat(df_list,sort=False).reset_index(drop=True)

In [7]:
future_df.head()

Unnamed: 0,Date,Hour,DayOfYear,DayOfWeek,District,Year,Month
0,2020-01-01 00:00:00,0,1,4,9,2020,1
1,2020-01-01 01:00:00,1,1,4,9,2020,1
2,2020-01-01 02:00:00,2,1,4,9,2020,1
3,2020-01-01 03:00:00,3,1,4,9,2020,1
4,2020-01-01 04:00:00,4,1,4,9,2020,1


In [7]:
pd_most_recent = tmp.toPandas()
pd_most_recent.head()

Unnamed: 0,District,Year,DayOfYear,Hour,y,all_time_mat,DayOfWeek
0,18,2019,191,21,ASSAULT,"DenseMatrix([[1.910e+02, 3.380e+02],\n ...",4
1,11,2009,108,12,OTHER CRIMES,"DenseMatrix([[3.260e+02, 3.200e+01, 5.300e+01]...",7
2,24,2007,185,13,CRIMINAL TRESPASS,"DenseMatrix([[1.850e+02],\n [4.000...",4
3,3,2008,177,4,ROBBERY,"DenseMatrix([[2.600e+02, 5.000e+03],\n ...",4
4,17,2009,186,4,WEAPONS VIOLATION,"DenseMatrix([[1.860e+02],\n [1.000...",1


In [9]:
pd_most_recent.DayOfWeek.value_counts()

7    6168
3    6162
6    6157
5    6137
2    6134
4    6124
1    6115
Name: DayOfWeek, dtype: int64

In [8]:
def spark_get_future_probabilities(future_df, past_df):
    start = time.time()
    df_list = []
    for _, r in future_df.iterrows():
        #Find matrices for all crime types
        _tmp_df = past_df[(past_df["Hour"] == r["Hour"]) & (past_df["DayOfWeek"] == r["DayOfWeek"]) & \
              (past_df["District"] == r["District"])][["District","Hour","y","all_time_mat"]].copy()
        _tmp_df["DayOfYear"] = r["DayOfYear"]
        _tmp_df["Year"] = r["Year"] 
        _tmp_df["Day"] = r["Date"]
        _tmp_df["DayOfWeek"] = r["DayOfWeek"]
        _tmp_df["Month"] = r["Month"]
        df_list.append(_tmp_df)
        
    #Concatenate all available combinations and create a pyspark dataframe from it
    _df = sqlContext.createDataFrame(pd.concat(df_list,sort=False).reset_index(drop=True))
    print("Generated spark base dataframe in: ", round((time.time() - start),1), " seconds" )
    #Calculate statistics
    _df = (_df\
    .withColumn("extra", F.lit(60))\
    .withColumn("avg_last_year_months", avg_last_year_months(\
                                               F.col("all_time_mat"),
                                               F.col("Year"),
                                               F.col("DayOfYear")))
    .withColumn("avg_past_month", avg_past_month(\
                                               F.col("all_time_mat"),
                                               F.col("Year"),
                                               F.col("DayOfYear"),
                                               F.col("extra")))
    .withColumn("avg_past_year", avg_past_year(\
                                               F.col("all_time_mat"),
                                               F.col("Year"),
                                               F.col("DayOfYear"),
                                               F.col("extra")))
    .withColumn("all_time_avg", all_time_avg(\
                                               F.col("all_time_mat"),
                                               F.col("Year"),
                                               F.col("DayOfYear"),
                                               F.col("extra")))).drop("extra", "all_time_mat")
    print("Completed spard dataframe in : ", round((time.time() - start),1), " seconds" )
    return _df

In [11]:
future_probabilities = spark_get_future_probabilities(future_df, pd_most_recent) 

Generated spark base dataframe in:  283.5  seconds
Completed spard dataframe in :  283.7  seconds


In [12]:
r = future_probabilities.select("DayOfYear","DayOfWeek","Year","Day","District","Month","DayOfWeek",\
                     "Hour","y","avg_last_year_months","avg_past_month","avg_past_year","all_time_avg").collect()

In [13]:
dist_list =  [row["District"] for row in r]
year_list =  [row["Year"] for row in r]
dayofyear_list =  [row["DayOfYear"] for row in r]
hour_list =  [row["Hour"] for row in r]
day_list =  [row["Day"] for row in r]
month_list =  [row["Month"] for row in r]
dow_list =  [row["DayOfWeek"] for row in r]
y_list =  [row["y"] for row in r]
alym_list = [row["avg_last_year_months"] for row in r]
apm_list = [row["avg_past_month"] for row in r]
apy_list = [row["avg_past_year"] for row in r]
ata_list = [row["all_time_avg"] for row in r]

In [14]:
def spark_generate_stats(dists, ys, yrs, doys, hrs, alyms, apms, apys, atas, crime_types, districts, years=None):
    if not years:
        years = np.arange(2001,2020)
    leap_years = [2004, 2008, 2012, 2016, 2020]
    res = {}
    for d in districts:
        res[d] = {}
        for t in crime_types:
            res[d][t] = {}
            for year in years:
                if year in leap_years:
                    res[d][t][year] = {day:{} for day in range(1,367)}
                else:
                    res[d][t][year] = {day:{} for day in range(1,366)}
                    
                    
    for dist, y, yr, doy, hr, alym, apm, apy, ata in zip(dists, ys, yrs, doys, hrs, alyms, apms, apys, atas):
        res[dist][y][yr][doy][hr] = {}
        res[dist][y][yr][doy][hr]["alym"] = alym
        res[dist][y][yr][doy][hr]["apm"] = apm
        res[dist][y][yr][doy][hr]["apy"] = apy
        res[dist][y][yr][doy][hr]["ata"] = ata    
    return res

In [15]:
future_res = spark_generate_stats(dist_list,y_list,year_list,dayofyear_list,hour_list,alym_list,\
                           apm_list,apy_list,ata_list,crime_types, districts,years=[2020])

In [16]:
def get_probability_vectors(district, year, dayofyear, hour, file):
    stats = ["alym","apm","apy","ata"]
    res_vec = []
    for dist, yr, doy, hr in zip(district, year, dayofyear, hour):
        _vec = []
        for s in stats:
            for y in file[dist]:
                try:
                    val = file[dist][y][yr][doy][hr][s]
                    _vec.append(val)
                except: #No statistics available: append 0
                    _vec.append(0)
        res_vec.append(Vectors.dense(_vec))
    return res_vec

In [17]:
future_probability_vectors = get_probability_vectors(dist_list, year_list, dayofyear_list, hour_list, future_res)

In [30]:
future = sqlContext\
.createDataFrame(zip(y_list, dist_list, hour_list, future_probability_vectors, dow_list, month_list, day_list),
                schema=["y","District", "Hour", "ProbabilityVector", "DayOfWeek","Month","Day"])
future.cache()
print("Length of future", future.count())
future.show(10)

Length of future 184519
+-------------------+--------+----+--------------------+---------+-----+-------------------+
|                  y|District|Hour|   ProbabilityVector|DayOfWeek|Month|                Day|
+-------------------+--------+----+--------------------+---------+-----+-------------------+
|           BURGLARY|     009|   0|[0.0,0.0,0.0,0.0,...|        4|    1|2020-01-01 00:00:00|
|            ASSAULT|     009|   0|[0.0,0.0,0.0,0.0,...|        4|    1|2020-01-01 00:00:00|
|       OTHER CRIMES|     009|   0|[0.0,0.0,0.0,0.0,...|        4|    1|2020-01-01 00:00:00|
|          NARCOTICS|     009|   0|[0.0,0.0,0.0,0.0,...|        4|    1|2020-01-01 00:00:00|
|            ROBBERY|     009|   0|[0.0,0.0,0.0,0.0,...|        4|    1|2020-01-01 00:00:00|
|      OTHER OFFENSE|     009|   0|[0.0,0.0,0.0,0.0,...|        4|    1|2020-01-01 00:00:00|
|MOTOR VEHICLE THEFT|     009|   0|[0.0,0.0,0.0,0.0,...|        4|    1|2020-01-01 00:00:00|
|    CRIMINAL DAMAGE|     009|   0|[0.0,0.0,0.

In [19]:
print("Future events with duplicates:",future.count())

Future events to be predicted: 184519


In [32]:
future = future.drop_duplicates(subset=['District','Day','Hour',])
print("Future events without duplicates:",future.count())

Future events without duplicates: 15862


In [33]:
future.select("ProbabilityVector").take(1)

[Row(ProbabilityVector=DenseVector([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.125, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0385, 0.0385, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0032, 0.0032, 0.0064, 0.0021, 0.0353, 0.0053, 0.0, 0.0011, 0.0011, 0.0053, 0.0021, 0.0011, 0.0]))]

# Predict the future
This section will predict all possible outcomes of january 2020, based on the probability vectors we created

In [21]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.feature import PCA, StandardScaler

### Pipelines

In [34]:
categorical_cols = ["District"]

indexers = [ StringIndexer(inputCol=cat_col, outputCol="{}_idx".format(cat_col),
                           handleInvalid = 'skip') for cat_col in categorical_cols] 

target_indexer = [ StringIndexer(inputCol = 'y', outputCol = 'target', handleInvalid = 'skip')]



encoders = [OneHotEncoder(dropLast=True,inputCol=idx.getOutputCol(), 
    outputCol="{}_catVec".format(idx.getOutputCol())) for idx in indexers]


fc = ["Hour","Month","DayOfWeek", "ProbabilityVector"] + [enc.getOutputCol() for enc in encoders]


assembler = VectorAssembler(inputCols= fc , outputCol="Features")

In [35]:
from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel

In [36]:
pipeline = Pipeline(stages = indexers + encoders + target_indexer + [assembler])
pipeline_model = pipeline.fit(future)
pipeline_df = pipeline_model.transform(future)

In [37]:
model = DecisionTreeClassificationModel.load("/dtModel")

In [38]:
future_predictions = model.transform(pipeline_df)

In [44]:
future_predictions.select("District","Hour","DayOfWeek","prediction","probability","Day").toPandas().to_csv('future_crimes.csv', index=None)