In [39]:
import numpy as np
import pandas as pd
import config
from datetime import datetime

import utm
import h5py

In [2]:
# Load the raw crime records (all neighbourhoods, all years) from CSV
raw_crime_csv = '../data/crimedata_allneighbourhoods_allyears.csv'
crime_data = pd.read_csv(raw_crime_csv)

In [3]:
# Rename the coordinate columns X and Y to UTM_E and UTM_N
crime_data = crime_data.rename(columns={'X': 'UTM_E', 'Y': 'UTM_N'})

Some crime instances have NaN location coordinates, which indicates missing location data. These instances are removed (the filter below drops every row that contains a NaN in any column).

In [4]:
# Keep only the rows that have no NaN in any column
processed_crime_data = crime_data.dropna(axis=0, how='any')

Some crime types (*Homicide* and *Offence Against a Person*) have instances whose location coordinates are recorded as 0 due to privacy concerns. The intended remedy is to generate random location coordinates within the recorded neighbourhood so that these instances can be retained; for now, however, these crime types are removed from the data.

In [5]:
# For now, remove the crime types listed below.
# .copy() makes the filtered result an independent DataFrame, so the column
# assignments performed on it in later cells do not raise SettingWithCopyWarning.
excluded_types = ['Homicide', 'Offence Against a Person']
processed_crime_data = processed_crime_data[~processed_crime_data['TYPE'].isin(excluded_types)].copy()


In [6]:
# Convert UTM coordinates to Latitude and Longitude
def utm2latlong(utm_x, utm_y, utm_zone_no, utm_zone_ltr):
    """Convert UTM coordinates to latitude/longitude with ``utm.to_latlon``.

    Parameters
    ----------
    utm_x, utm_y : UTM coordinates, forwarded as the first and second
        positional arguments of ``utm.to_latlon``.
    utm_zone_no, utm_zone_ltr : UTM zone number and zone letter, forwarded as
        the third and fourth positional arguments.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` exactly as produced by ``utm.to_latlon``.
    """
    latitude, longitude = utm.to_latlon(
        utm_x,
        utm_y,
        utm_zone_no,
        utm_zone_ltr,
    )
    return latitude, longitude

In [7]:
# Derive the LAT and LONG columns from the UTM_E / UTM_N columns
lat_values, long_values = utm2latlong(
    processed_crime_data['UTM_E'],
    processed_crime_data['UTM_N'],
    config.UTM_ZONE_NO,
    config.UTM_ZONE_LTR,
)
processed_crime_data['LAT'] = lat_values
processed_crime_data['LONG'] = long_values

In [8]:
# Function to create bins
def getBins(min_,max_,n_bins):
    """Return the bin edges that split ``[min_, max_]`` into ``n_bins`` equal bins.

    The result is a NumPy array of ``n_bins + 1`` evenly spaced edges, with
    ``min_`` as the first edge and ``max_`` as the last.
    """
    n_edges = n_bins + 1
    return np.linspace(min_, max_, n_edges)

In [9]:
## Generate latitude and longitude bins

# Number of bins = bounding box length / length of each cell
# NOTE(review): int() truncates, so a quotient that falls just below a whole
# number through floating-point rounding loses a bin — confirm that the config
# values divide cleanly.
n_bins = int(config.BB_DIST/config.BB_CELL_LEN)

# Latitudes and longitudes of the bounding box vertices
vertex_lats = [vertex['lat'] for vertex in config.BB_VERTICES.values()]
vertex_longs = [vertex['long'] for vertex in config.BB_VERTICES.values()]

# Minimum and maximum latitude / longitude of the bounding box
min_lat, max_lat = min(vertex_lats), max(vertex_lats)
min_long, max_long = min(vertex_longs), max(vertex_longs)

# Divide each axis of the bounding box into n_bins bins
lat_bins = getBins(min_=min_lat, max_=max_lat, n_bins=n_bins)
long_bins = getBins(min_=min_long, max_=max_long, n_bins=n_bins)

In [10]:
# Function to generate cell coordinates
def getCellLocs(lats,longs,lat_bins,long_bins, correction=False):
    """Map latitude/longitude values to 1-based grid cell indices.

    Each value is binned with ``np.digitize(..., right=True)``, so a value ``v``
    gets index ``i`` when ``bins[i-1] < v <= bins[i]``. Values at or below the
    first edge get index 0 and values above the last edge get ``len(bins)``;
    both are treated as lying outside the grid.

    Parameters
    ----------
    lats, longs : array-like of latitudes / longitudes.
    lat_bins, long_bins : increasing bin edges for each axis.
    correction : if truthy, outside values are moved to the nearest valid cell
        (0 becomes 1, ``len(bins)`` becomes ``len(bins) - 1``); otherwise they
        are marked with -1 so the caller can filter them out. Any falsy value
        (including ``None``) selects the -1 marking.

    Returns
    -------
    tuple of two lists
        ``(cell_x, cell_y)``: cell indices derived from ``lats`` and ``longs``
        respectively.
    """
    def _cell_indices(values, bins):
        # Bin one axis and resolve the two out-of-grid indices (0 and len(bins)).
        n_edges = len(bins)
        idx = np.digitize(values, bins, right=True)
        if correction:
            resolved = np.where(idx == n_edges, idx - 1,
                                np.where(idx == 0, idx + 1, idx))
        else:
            resolved = np.where((idx == 0) | (idx == n_edges), -1, idx)
        return resolved.tolist()

    # X cell coordinate comes from the latitudes, Y cell coordinate from the longitudes
    return _cell_indices(lats, lat_bins), _cell_indices(longs, long_bins)

In [11]:
## Generate cell coordinates
cell_x_idx, cell_y_idx = getCellLocs(lats=processed_crime_data['LAT'].values,
                                     longs=processed_crime_data['LONG'].values,
                                     lat_bins=lat_bins,
                                     long_bins=long_bins,
                                     correction=True)
processed_crime_data['CELL_X'] = cell_x_idx
processed_crime_data['CELL_Y'] = cell_y_idx

# Filter out rows with cell coordinate values of -1 (these occur only when correction == False).
# .copy() makes the filtered result an independent DataFrame, so the column
# assignments performed on it in later cells do not raise SettingWithCopyWarning.
inside_grid = (processed_crime_data['CELL_X'] > -1) & (processed_crime_data['CELL_Y'] > -1)
processed_crime_data = processed_crime_data[inside_grid].copy()

In [12]:
# Function to fetch crime date from day, month and year
def getDate(day, month, year):
    """Build a ``datetime.date`` from separate day, month and year numbers.

    Raises whatever ``datetime(year, month, day)`` raises for an invalid
    combination (for example ``ValueError`` for day 31 of a 30-day month).
    """
    return datetime(year, month, day).date()

In [13]:
# Fetch crime date from the DAY, MONTH and YEAR columns.
# Iterating over the three columns directly avoids DataFrame.apply(axis=1), which
# builds a Series for every row before calling the function.
processed_crime_data['DATE'] = [
    getDate(day=day, month=month, year=year)
    for day, month, year in zip(processed_crime_data['DAY'],
                                processed_crime_data['MONTH'],
                                processed_crime_data['YEAR'])
]

# Order by crime date (rebinding instead of inplace=True keeps the cell re-runnable)
processed_crime_data = processed_crime_data.sort_values(by='DATE')

In [14]:
# Fetch crime broad category by looking up each TYPE value in config.TYPE2CAT.
# A column-wise map performs the same lookup as a row-wise apply without
# materialising every row; the lookup is still a plain subscript, so a TYPE that
# is missing from the mapping fails exactly as before.
processed_crime_data['CAT'] = processed_crime_data['TYPE'].map(lambda crime_type: config.TYPE2CAT[crime_type])

In [15]:
# Persist the processed crime data to disk as a pickle file
processed_pickle_path = '../data/processed_crime_data.pkl'
processed_crime_data.to_pickle(processed_pickle_path)

In [16]:
# Function to generate all cell coordinate combinations
def getAllCombs(list_1,list_2):
    """Return every ``(x, y)`` pair with ``x`` from ``list_1`` and ``y`` from ``list_2``.

    Pairs are ordered with ``list_1`` as the outer (slow) loop and ``list_2``
    as the inner (fast) loop.
    """
    pairs = []
    for first in list_1:
        for second in list_2:
            pairs.append((first, second))
    return pairs

In [17]:
# Function to create a pivot table
def getPivot(data, values, index, columns, aggfunc, allcombs=False):
    """Pivot ``data`` and optionally expand the result to the full date/category x cell grid.

    Parameters
    ----------
    data : DataFrame to pivot. When ``allcombs`` is truthy it must contain a
        ``'DATE'`` column.
    values, index, columns, aggfunc : forwarded unchanged to
        ``DataFrame.pivot_table``.
    allcombs : if truthy, reindex the pivot so that its rows are every
        (date, crime category) pair and its columns are every (cell_x, cell_y)
        pair, filling combinations absent from ``data`` with 0.

    Returns
    -------
    DataFrame
        The pivot table with its columns flattened to a flat index.

    Notes
    -----
    The ``allcombs`` branch reads the notebook-level ``n_bins`` and
    ``config.CRIME_CATS``. Only dates that occur in ``data`` are used, so a
    calendar day with no rows at all does not get a row in the result.
    """
    # Create the pivot table from the requested index/column keys
    data_pivot = data.pivot_table(values=values, index=index, columns=columns, aggfunc=aggfunc)

    # Flatten the column values
    data_pivot.columns = data_pivot.columns.to_flat_index()

    if not allcombs:
        return data_pivot

    # All possible cell values along a coordinate (1 .. n_bins), paired with themselves
    cell_ids = np.arange(1, n_bins + 1, 1)
    cell_all_pairs = getAllCombs(list_1=cell_ids, list_2=cell_ids)

    # Unique dates are taken from the frame being pivoted, not from a notebook
    # global, so the function works for whichever frame the caller passes in
    unique_dates = data['DATE'].unique()

    # All date-crime category combinations
    date_cat_all_pairs = getAllCombs(list_1=unique_dates, list_2=config.CRIME_CATS)

    # Reindex rows to all date-category pairs and columns to all cell pairs;
    # combinations absent from the data become 0
    return data_pivot.reindex(date_cat_all_pairs).reindex(columns=cell_all_pairs).fillna(0)

In [18]:
# Count crimes per (date, crime category) row and (cell x, cell y) column,
# expanded to every row/column combination
crime_pivot = getPivot(
    data=processed_crime_data,
    values='TYPE',
    index=['DATE', 'CAT'],
    columns=['CELL_X', 'CELL_Y'],
    aggfunc='count',
    allcombs=True,
)

In [19]:
# Extract the pivot table values as a NumPy array
crime_arr = crime_pivot.to_numpy()

In [20]:
# Reshape so that each entry along the leading axis holds one n_bins x n_bins grid
# per crime category (the categories act as channels)
n_crime_cats = len(config.CRIME_CATS)
crime_arr_reshaped = crime_arr.reshape(-1, n_crime_cats, n_bins, n_bins)

In [21]:
# Function to group number of samples to pairs of batch size and collect corresponding target sample
def getFeaturesTargets(data, batch_size):
    """Build sliding-window features and next-step targets from ``data``.

    For every start position ``i`` the feature is the window
    ``data[i:i + batch_size]`` and the target is ``data[i + batch_size]``, i.e.
    the sample immediately following the window.

    Parameters
    ----------
    data : sequence or array indexed along its first axis.
    batch_size : number of consecutive samples in each feature window.

    Returns
    -------
    tuple of two lists
        ``(features, targets)``, each of length
        ``max(len(data) - batch_size, 0)``.
    """
    features = []
    targets = []
    # Every window must leave one sample after it to serve as the target, so the
    # last valid start is len(data) - batch_size - 1. A negative count gives an
    # empty range, so input that is too short returns two empty lists.
    for start in range(len(data) - batch_size):
        stop = start + batch_size
        features.append(data[start:stop])
        # data[stop] is the first sample after the window; indexing stop + 1
        # would leave a one-sample gap between the window and its target.
        targets.append(data[stop])
    return features, targets

In [22]:
# Build (window of config.BATCH_SIZE samples, target sample) pairs from the reshaped crime array
features, targets = getFeaturesTargets(
    data=crime_arr_reshaped,
    batch_size=config.BATCH_SIZE,
)

In [23]:
# Stack the lists of features and targets into NumPy arrays
features, targets = np.array(features), np.array(targets)

In [40]:
# Save the features and targets as HDF5 files, one dataset per file
with h5py.File('../data/features.h5', 'w') as features_file:
    features_file.create_dataset("features", data=features)

with h5py.File('../data/targets.h5', 'w') as targets_file:
    targets_file.create_dataset("targets", data=targets)
   