# RNN Prediction Model for crimes in the México City

## Setting up

### Importing Libraries

In [122]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', 500)


#TensorFlow
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras import optimizers, metrics
from tensorflow.keras.regularizers import L1L2
from tensorflow.keras.callbacks import EarlyStopping


### URL to get data set

In [65]:
url = "https://storage.googleapis.com/safetymap/preprocessed_data3.csv"
df = pd.read_csv(url)
df.drop(columns = "colonia_id", inplace = True)
pre_data = df.set_index(["year_month_hecho","alcaldia_colonia"]).unstack("alcaldia_colonia")

### Train-test split


In [106]:
# Input lengh for the train-test split function and the ratio of the Folds

INPUT_LENGTH = 1 * 12 # records every 1 month x 12 months per year = 12 months
TRAIN_TEST_RATIO = 0.70 #70% of the data is going to be for training

#### Train-test-split Function

In [138]:
from typing import Tuple

def train_test_split(fold:pd.DataFrame,
                     train_test_ratio: float,
                     input_length: int) -> Tuple[pd.DataFrame]:
    """From a fold dataframe, take a train dataframe and test dataframe based on 
    the split ratio.
    - df_train should contain all the timesteps until round(train_test_ratio * len(fold))
    - df_test should contain all the timesteps needed to create all (X_test, y_test) tuples

    Args:
        fold (pd.DataFrame): A fold of timesteps
        train_test_ratio (float): The ratio between train and test 0-1
        input_length (int): How long each X_i will be

    Returns:
        Tuple[pd.DataFrame]: A tuple of two dataframes (fold_train, fold_test)
    """
    fold_train = fold[0:round(len(fold)*train_test_ratio)]
    fold_test = fold[(round(len(fold)*train_test_ratio - input_length)):]
    return fold_train,fold_test

In [108]:
# Running the T-T-Split function to get Fold_train and Fold_test

(fold_train, fold_test) = train_test_split(pre_data, TRAIN_TEST_RATIO, INPUT_LENGTH)

### (X_train, y_train) and (X_test, y_test)

#### get_X_y_strides Function


In [135]:
def get_X_y_strides(fold: pd.DataFrame, input_length: int, output_length: int, 
    sequence_stride: int) -> Tuple[np.array]:
    """slides through a `fold` Time Series (2D array) to create sequences of equal
        * `input_length` for X,
        * `output_length` for y,
    using a temporal gap `sequence_stride` between each sequence

    Args:
        fold (pd.DataFrame): One single fold dataframe
        input_length (int): Length of each X_i
        output_length (int): Length of each y_i
        sequence_stride (int): How many timesteps to take before taking the next X_i

    Returns:
        Tuple[np.array]: A tuple of numpy arrays (X, y)
    """
    #X, y = [], []

    for i in range(0, len(fold), sequence_stride):
        # Exits the loop as soon as the last fold index would exceed the last index
        if (i + input_length + output_length) >= len(fold):
            break
        X_i_transformed = fold.iloc[i:i + input_length, :]
        y_i_transformed = fold.iloc[i + input_length:i + input_length + output_length, :][TARGET]


        fold_train_list = X_i_transformed.stack("alcaldia_colonia").groupby(["alcaldia_colonia", "year_month_hecho"])\
                            .apply(lambda x: x.values.tolist()[0])\
                            .groupby("alcaldia_colonia").apply(lambda x: x.values.tolist())\
                            .tolist()

        fold_test_list = y_i_transformed.stack("alcaldia_colonia").groupby(["alcaldia_colonia", "year_month_hecho"])\
                            .apply(lambda x: x.values.tolist()[0])\
                            .groupby("alcaldia_colonia").apply(lambda x: x.values.tolist())\
                            .tolist()

    return (np.array(fold_train_list), np.array(fold_test_list))

In [136]:
# Defining the sequence stride, output length and target

SEQUENCE_STRIDE = 1
OUTPUT_LENGTH = 12
TARGET = ['burglary', 'danger_of_well-being',
       'domestic_violence', 'fraud', 'homicide', 'property_damage',
       'robbery_with_violence', 'robbery_without_violence', 'sexual_crime',
       'threats']

In [137]:
# Running the Train function for X and y
X_train, y_train = get_X_y_strides(fold_train, INPUT_LENGTH, OUTPUT_LENGTH, SEQUENCE_STRIDE)

# Running the Test functeion for X and y
X_test, y_test = get_X_y_strides(fold_test, INPUT_LENGTH, OUTPUT_LENGTH, SEQUENCE_STRIDE)

print(X_train.shape, y_train.shape,X_test.shape, y_test.shape)

(2041, 12, 10) (2041, 12, 10) (2041, 12, 10) (2041, 12, 10)


## Model

In [121]:
def init_model(X_train, y_train):
    model = Sequential()

    # –– Model
    model.add(layers.Masking(mask_value=-1, input_shape=(12,10)))
    model.add(layers.LSTM(units=40, activation='tanh', return_sequences =True))
    model.add(layers.Dense(50, activation='relu'))
    model.add(layers.Dropout(rate=0.2))  # The rate is the percentage of neurons that are "killed"
    model.add(layers.Dense(10, activation='relu'))

    # –– Compilation
    model.compile(loss='mse', 
                  optimizer='adam',
                 metrics = ["mae"]) 

    return model 

In [123]:
# Early Stopping with patience 10 
es = EarlyStopping(patience=10)

model = init_model(X_train, y_train)
model.summary()
history = model.fit(X_train, y_train, 
          epochs=200, 
          batch_size=32, 
          verbose=1, 
          callbacks = [es],
          validation_split=0.2)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking (Masking)           (None, 12, 10)            0         
                                                                 
 lstm (LSTM)                 (None, 12, 40)            8160      
                                                                 
 dense (Dense)               (None, 12, 50)            2050      
                                                                 
 dropout (Dropout)           (None, 12, 50)            0         
                                                                 
 dense_1 (Dense)             (None, 12, 10)            510       
                                                                 
Total params: 10720 (41.88 KB)
Trainable params: 10720 (41.88 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/200
Epoch 2/

In [124]:
# Model Evaluation with test sets
model.evaluate(X_test,y_test)



[1.70176100730896, 0.6854937076568604]

##  Predictions

In [126]:
predictions = model.predict(X_test)



In [128]:
# Date Series for the prediction dataframe
start_date = '2023-01-01'
num_periods = 12

date_range = pd.date_range(start=start_date, periods=num_periods, freq='MS')
date_series = pd.Series(range(num_periods), index=date_range)

In [134]:
# Empty list to save the list of dataframes
p1 = []
nom_delitos = df.columns[2:]
nom_colonias = df.alcaldia_colonia.unique()

for period in range(predictions.shape[1]):
    p1.append(pd.DataFrame(predictions[:, period, :], columns =nom_delitos, index = nom_colonias).assign(periodo= date_series.index[period].date()))

new_prediction = pd.concat(p1)

In [132]:
prediction_dataframe = new_prediction.set_index("periodo",append=True).round(0).astype(int)

In [133]:
prediction_dataframe

Unnamed: 0_level_0,Unnamed: 1_level_0,burglary,danger_of_well-being,domestic_violence,fraud,homicide,property_damage,robbery_with_violence,robbery_without_violence,sexual_crime,threats
Unnamed: 0_level_1,periodo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ALVARO OBREGON || 16 DE SEPTIEMBRE,2023-01-01,0,0,0,0,0,0,0,0,0,0
ALVARO OBREGON || 19 DE MAYO,2023-01-01,0,0,0,0,0,0,0,0,0,0
ALVARO OBREGON || 1RA VICTORIA,2023-01-01,0,0,1,2,0,0,1,2,0,1
ALVARO OBREGON || 1RA VICTORIA SECCION BOSQUES,2023-01-01,0,1,2,1,0,0,1,1,0,1
ALVARO OBREGON || 26 DE JULIO,2023-01-01,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
XOCHIMILCO || VILLA XOCHIMILCO (U HAB),2023-12-01,0,0,0,0,0,0,0,0,0,0
XOCHIMILCO || VILLAS DE LOS TRABAJADORES (GDF) (U HAB),2023-12-01,0,0,0,0,0,0,0,0,0,0
XOCHIMILCO || VIVEROS DE COACTETLAN,2023-12-01,0,0,0,0,0,0,0,0,0,0
XOCHIMILCO || XALTOCAN (BARR),2023-12-01,2,2,6,5,1,2,2,7,1,4
