In [1]:
import numpy as np
import pandas as pd
import config
import utils
import h5py
import geopandas as gpd
import osmnx

# Disable pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook.
# NOTE(review): this also hides genuine copy-vs-view mistakes in the cells below.
pd.options.mode.chained_assignment = None

In [2]:
# Load the raw crime records (all neighbourhoods, all years) from the
# raw-data directory configured in config.VAN_DATA_RAW
crime_data = pd.read_csv(f'{config.VAN_DATA_RAW}/crimedata_allneighbourhoods_allyears.csv')

In [3]:
# Rename the coordinate columns X and Y to UTM_E (easting) and UTM_N (northing)
crime_data = crime_data.rename(columns={'X': 'UTM_E', 'Y': 'UTM_N'})

Some crime instances have NaN location coordinates, indicating missing location data. These instances are removed.

In [4]:
# Drop every row that contains a NaN in any column
processed_crime_data = crime_data.dropna(axis=0, how='any')

Some crime types (*Homicide* and *Offense Against a Person*) have instances whose location coordinates are recorded as 0 due to privacy concerns. To retain these instances, we generate random location coordinates within the reported neighbourhood.

We first mask these offset instances, convert the UTM coordinates of the remaining instances to latitude and longitude, and then generate random neighbourhood-bound coordinates for the offset instances.

In [5]:
# Flag the offset rows: both UTM components are exactly 0
offset_mask = (processed_crime_data['UTM_E']==0) & (processed_crime_data['UTM_N']==0)

# Create latitude and longitude placeholder columns.
# They are initialised with 0.0 (float) rather than 0 (int) so the float
# coordinates written below do not force an int64 -> float64 upcast, which
# newer pandas versions warn about and are deprecating.
processed_crime_data['LAT'], processed_crime_data['LONG'] = 0.0, 0.0

# Convert UTM coordinates to latitude and longitude for the unmasked rows only.
# NOTE(review): the result is unpacked as (longs, lats) - confirm this matches
# the return order of utils.utm2latlong.
longs, lats = utils.utm2latlong(processed_crime_data.loc[~offset_mask,'UTM_E'],
                                processed_crime_data.loc[~offset_mask, 'UTM_N'],
                                config.UTM_ZONE_NO,
                                config.UTM_ZONE_LTR)

# Offset rows keep the 0.0 placeholder until random coordinates are generated later
processed_crime_data.loc[~offset_mask,'LAT'] = lats
processed_crime_data.loc[~offset_mask,'LONG'] = longs

In [6]:
# Read the neighbourhood (local area) boundary shapefile from the directory
# configured in config.VAN_DATA_SHP into a GeoDataFrame
geodata = gpd.read_file(f'{config.VAN_DATA_SHP}/local-area-boundary.shp')

The Vancouver Neighbourhood Boundaries shapefile does not include a boundary for Stanley Park. This was added to the dataset using the *osmnx* library. \
Also, the crime instances dataset splits a single neighbourhood (Dunbar-Southlands) into two individual neighbourhoods (Dunbar-Southlands and Musqueam). These were merged back into a single neighbourhood.

In [7]:
# Add the Stanley Park boundary to the geopandas data:
# geocode the place name with osmnx and take the geometry of the first result
# NOTE(review): confirm the osmnx geometry uses the same CRS as the shapefile
add_ngbh = 'Stanley Park'
ngbh_gdf = osmnx.geocode_to_gdf(f'{add_ngbh}, Vancouver, Canada')
add_shp = ngbh_gdf['geometry'].values[0]

# Append the new row at label len(geodata); the values are assigned positionally
# NOTE(review): assumes geodata has exactly three columns ordered
# (short code, name, geometry) and a default 0..n-1 index - confirm
add_shp_short = 'SP'
geodata.loc[len(geodata)]=[add_shp_short, add_ngbh, add_shp]

# Merge the separate 'Musqueam' crime records back into 'Dunbar-Southlands'
# so the crime data uses a single neighbourhood name for that area
processed_crime_data.loc[processed_crime_data['NEIGHBOURHOOD']=='Musqueam','NEIGHBOURHOOD'] = 'Dunbar-Southlands'

  gdf = gdf.append(_geocode_query_to_gdf(q, wr, by_osmid))


A few inconsistencies in neighbourhood naming between the two datasets were fixed.

In [8]:
# Align the shapefile's neighbourhood names with the names used in the crime data
for shp_name, crime_name in {'Downtown': 'Central Business District',
                             'Arbutus-Ridge': 'Arbutus Ridge'}.items():
    geodata.loc[geodata['name'] == shp_name, 'name'] = crime_name

In [9]:
# Generate random location coordinates within neighbourhood boundaries for masked rows.
# utils.generateRandomCoords is called once per masked row with that row's
# NEIGHBOURHOOD value and the boundary GeoDataFrame; zip(*...) then transposes the
# per-row pairs into one sequence for LAT and one for LONG.
# NOTE(review): assumes the helper returns (lat, long) in that order - confirm.
# NOTE(review): the coordinates are random; unless the helper seeds its RNG,
# re-running the notebook produces different values - confirm.
# The two-target unpacking needs at least one masked row; with none, zip(*...)
# yields nothing and the assignment raises ValueError.
processed_crime_data.loc[offset_mask,'LAT'], processed_crime_data.loc[offset_mask,'LONG'] \
= zip(*processed_crime_data.loc[offset_mask,'NEIGHBOURHOOD'].apply(lambda ngbh: utils.generateRandomCoords(ngbh, geodata)))

West End
Reattempting to retrieve random lat/long coordinate!
Stanley Park
Reattempting to retrieve random lat/long coordinate!
Central Business District
Reattempting to retrieve random lat/long coordinate!
Central Business District
Reattempting to retrieve random lat/long coordinate!
Central Business District
Reattempting to retrieve random lat/long coordinate!
Central Business District
Reattempting to retrieve random lat/long coordinate!
Central Business District
Reattempting to retrieve random lat/long coordinate!
Central Business District
Reattempting to retrieve random lat/long coordinate!


In [10]:
## Build the latitude and longitude bins of the grid

# Cells per axis = bounding box length / length of each cell (truncated to an int)
n_bins = int(config.BB_DIST / config.BB_CELL_LEN)

# Minimum and maximum latitude over the bounding box vertices
min_lat = min(vertex['lat'] for vertex in config.BB_VERTICES.values())
max_lat = max(vertex['lat'] for vertex in config.BB_VERTICES.values())

# Minimum and maximum longitude over the bounding box vertices
min_long = min(vertex['long'] for vertex in config.BB_VERTICES.values())
max_long = max(vertex['long'] for vertex in config.BB_VERTICES.values())

# Split each axis of the bounding box into n_bins bins
lat_bins = utils.getBins(min_=min_lat, max_=max_lat, n_bins=n_bins)
long_bins = utils.getBins(min_=min_long, max_=max_long, n_bins=n_bins)

In [11]:
## Assign each crime instance to a grid cell
cell_x, cell_y = utils.getCellLocs(
    lats=processed_crime_data['LAT'].values,
    longs=processed_crime_data['LONG'].values,
    lat_bins=lat_bins,
    long_bins=long_bins,
    correction=True,
)
processed_crime_data['CELL_X'] = cell_x
processed_crime_data['CELL_Y'] = cell_y

# Keep only rows whose cell coordinates are both greater than -1
# (a value of -1 occurs when correction == False)
inside_grid = (processed_crime_data['CELL_X'] > -1) & (processed_crime_data['CELL_Y'] > -1)
processed_crime_data = processed_crime_data[inside_grid]

In [12]:
# Build the crime date of every record from its DAY, MONTH and YEAR columns
processed_crime_data['DATE'] = processed_crime_data.apply(
    lambda rec: utils.getDate(day=rec['DAY'], month=rec['MONTH'], year=rec['YEAR']),
    axis=1,
)

# Sort the records by crime date
processed_crime_data = processed_crime_data.sort_values(by='DATE')

In [13]:
# Look up the broad crime category of every record from its TYPE.
# Mapping over the TYPE column alone avoids building a Series for every row
# (as a row-wise DataFrame.apply does); indexing config.TYPE2CAT directly keeps
# the KeyError for any TYPE that has no category.
processed_crime_data['CAT'] = processed_crime_data['TYPE'].map(lambda crime_type: config.TYPE2CAT[crime_type])

In [14]:
# Save the processed crime data as a gzip-compressed pickle in the
# processed-data directory configured in config.VAN_DATA_PRCD
processed_crime_data.to_pickle(f'{config.VAN_DATA_PRCD}/processed_crime_data.pkl.gzip', compression='gzip')

In [15]:
# Create a pivot table of crime counts: (DATE, CAT) pairs form the row index,
# (CELL_X, CELL_Y) grid cells form the columns, and each value counts the
# 'TYPE' entries for that combination.
# NOTE(review): allcombs=True presumably adds every date/category/cell
# combination, including those with no crimes - confirm in utils.getPivot.
crime_pivot = utils.getPivot(data=processed_crime_data, values='TYPE', index=['DATE','CAT'], 
                             columns=['CELL_X','CELL_Y'], aggfunc='count', n_bins = n_bins, allcombs=True)

In [16]:
# Display the pivot table as a sanity check
crime_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,"(1, 1)","(1, 2)","(1, 3)","(1, 4)","(1, 5)","(1, 6)","(1, 7)","(1, 8)","(1, 9)","(1, 10)",...,"(26, 17)","(26, 18)","(26, 19)","(26, 20)","(26, 21)","(26, 22)","(26, 23)","(26, 24)","(26, 25)","(26, 26)"
DATE,CAT,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2003-01-01,Break and Enter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2003-01-01,Homicide,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2003-01-01,Mischief,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2003-01-01,Assualt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2003-01-01,Theft,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-02-11,Homicide,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-02-11,Mischief,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-02-11,Assualt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-02-11,Theft,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Extract the pivot table values as a 2-D array
# (one row per (DATE, CAT) pair, one column per grid cell)
crime_arr = crime_pivot.values

In [18]:
# Reshape to (samples, crime categories, n_bins, n_bins): every sample is an
# n_bins x n_bins grid with one channel per crime category
grid_shape = (-1, len(config.CRIME_CATS), n_bins, n_bins)
crime_arr_reshaped = crime_arr.reshape(grid_shape)

In [19]:
# Check the resulting shape: (samples, crime categories, grid rows, grid columns)
crime_arr_reshaped.shape

(6982, 6, 26, 26)

In [20]:
# Group the grids into sequences of length config.SEQ_LEN and collect the
# corresponding target sample for each sequence.
# NOTE(review): presumably each target is the grid that immediately follows
# its sequence - confirm in utils.getFeaturesTargets.
features, targets = utils.getFeaturesTargets(data=crime_arr_reshaped, seq_len=config.SEQ_LEN)

In [21]:
# Turn the collected features and targets into numpy arrays
features, targets = np.array(features), np.array(targets)

In [22]:
# Split features and targets chronologically into train, validation and test sets:
# the first 5479 samples (15 years, 2003-2017) are used for training, the next
# 730 samples (2 years, 2018-2019) for validation and the remainder
# (2020 onwards) for testing.
train_end, val_end = 5479, 6209

features_train, targets_train = features[:train_end,:], targets[:train_end,:]
features_val, targets_val = features[train_end:val_end,:], targets[train_end:val_end,:]
features_test, targets_test = features[val_end:,:], targets[val_end:,:]

In [23]:
# Save the features and the targets as HDF5 files, one dataset per split
with h5py.File(f'{config.VAN_DATA_PRCD}/features.h5', 'w') as hf:
    for split_name, split_arr in (("train", features_train),
                                  ("val", features_val),
                                  ("test", features_test)):
        hf.create_dataset(split_name, data=split_arr)

with h5py.File(f'{config.VAN_DATA_PRCD}/targets.h5', 'w') as hf:
    for split_name, split_arr in (("train", targets_train),
                                  ("val", targets_val),
                                  ("test", targets_test)):
        hf.create_dataset(split_name, data=split_arr)
   

In [11]:
# Load the secondary data (housing price index, consumer price index and weather)
# and normalise the housing price index column name.
# NOTE(review): this path is hard-coded while the other reads go through config - confirm it is intended.
sec_data = (
    pd.read_csv('../data/raw/cpi_hpi_weather_data.csv')
    .rename(columns={'housing price indexes': 'housing_price_index'})
)

In [12]:
# Secondary-data columns to keep
imp_cols = [
    'housing_price_index',
    'consumer_price_index',
    'min_temperature',
    'max_temperature',
    'max_rain',
    'max_snow',
]

In [13]:
# Keep only the columns listed in imp_cols, in that order
sec_data_fil = sec_data[imp_cols]

In [14]:
# Column-wise mean over every window of config.SEQ_LEN consecutive rows.
# This yields len(sec_data_fil) - (config.SEQ_LEN + 1) windows.
# NOTE(review): confirm this count lines up with the number of crime feature samples.
n_windows = len(sec_data_fil) - (config.SEQ_LEN + 1)
sec_data_mean = [sec_data_fil[start:start + config.SEQ_LEN].mean()
                 for start in range(n_windows)]

sec_data_mean_arr = np.array(sec_data_mean)

In [15]:
# Split the secondary data into train, validation and test sets at the same
# sample boundaries (5479 and 6209) that are used for the crime features
train_end, val_end = 5479, 6209

sec_data_mean_arr_train = sec_data_mean_arr[:train_end,:]
sec_data_mean_arr_val = sec_data_mean_arr[train_end:val_end,:]
sec_data_mean_arr_test = sec_data_mean_arr[val_end:,:]

In [16]:
# Save the secondary features as an HDF5 file, one dataset per split
with h5py.File(f'{config.VAN_DATA_PRCD}/sec_features.h5', 'w') as hf:
    for split_name, split_arr in (("train", sec_data_mean_arr_train),
                                  ("val", sec_data_mean_arr_val),
                                  ("test", sec_data_mean_arr_test)):
        hf.create_dataset(split_name, data=split_arr)
