In [1]:
import math
import pandas as pd 
import geopandas as gpd
import numpy as np

import matplotlib.pyplot as plt

import h3 # h3 bins from uber

In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [3]:
import sys
# Extend the module search path so the project helper module can be imported.
# '../Scripts' is a relative entry, so it is resolved against the kernel's
# current working directory.
# NOTE(review): presumably capstone_functions.py lives in ../Scripts - confirm.
sys.path.append('../Scripts')
import capstone_functions as cf

# Prediction models

## A. Identifying frequency outliers

**Idea:** Take the historical data but cut off "frequency outliers" that occurred only once in the whole data set.

### Overall function

In [4]:
# Run the whole prediction pipeline for approach A in a single call.
# Per the recorded cell output, this creates
# ../Inputs/predictions_for_clustering_a.csv.
df_predictions_a = cf.rta_prediction_pipeline(
    type_of_pred="a",
    frequency_cutoff=1,
)

file created ../Inputs/predictions_for_clustering_a.csv


### Separate function steps

Load data and create temporal features and hex bins.

In [5]:
# Load the raw crash data, then add the temporal features and the hex bin
# assignment; `df` ends up holding the result of both steps.
df_raw = cf.create_crash_df()
df = cf.assign_hex_bin(cf.create_temporal_features(df_raw))

Create empty data frame for all hex bin / time window combinations

In [6]:
# Template frame with every hex bin / time window combination; the RTAs are
# merged onto it in a later cell.
df_pred_template = cf.create_pred_template(df)

Create data frame for all RTAs with their respective hex bin / time window combinations

In [7]:
# RTAs from `df` together with their hex bin / time window combination.
df_tw_hex = cf.rta_per_time_window_hex_bin(df)

Merge the RTAs onto the empty data frame

In [8]:
# Combine the template frame with the observed RTAs per hex bin / time window.
df_merged = cf.fill_overall_df(df_pred_template, df_tw_hex)

In [9]:
# Preview the first rows of the merged frame.
df_merged.head()

Unnamed: 0,datetime,hex_bins,time_window_key,RTA
0,2018-01-01 00:00:00,867a6e417ffffff,2018-1-1-0,1.0
1,2018-01-01 03:00:00,867a6e417ffffff,2018-1-1-1,1.0
2,2018-01-01 06:00:00,867a6e417ffffff,2018-1-1-2,0.0
3,2018-01-01 09:00:00,867a6e417ffffff,2018-1-1-3,0.0
4,2018-01-01 12:00:00,867a6e417ffffff,2018-1-1-4,0.0


Generate list of frequency outliers

In [10]:
# Collect the "frequency outliers" from the merged frame. The cutoff of 1 is
# the same value that is passed to the full pipeline calls in this notebook.
list_freq_outliers = cf.generate_outlier_list(
    df_merged,
    frequency_cutoff=1,
)

Filter overall data frame to exclude frequency outliers

In [11]:
# Exclude the frequency outliers for approach A.
# Note that this filters `df` (the frame with temporal features and hex
# bins), not `df_merged`.
df_pred_a = cf.filter_df_for_pred_a(df, list_freq_outliers)

Create latitude and longitude

In [12]:
# Create latitude / longitude columns for the approach-A frame
# (the preview of the result shows datetime, latitude and longitude).
df_pred_a_lat_long = cf.convert_h3_to_lat_lon(df_pred_a)

### Output

Overview of output file

In [13]:
# Preview the first rows of the approach-A output.
df_pred_a_lat_long.head()

Unnamed: 0,datetime,latitude,longitude
0,2018-01-01 00:25:46,-1.190764,36.911613
1,2018-01-01 02:02:39,-0.691592,37.199105
2,2018-01-01 02:31:49,-0.691592,37.199105
3,2018-01-01 03:04:01,-1.265393,36.817341
4,2018-01-01 03:58:49,-1.190764,36.911613


In [14]:
# (rows, columns) of the approach-A output.
df_pred_a_lat_long.shape

(6270, 3)

Export to csv

In [15]:
# Write the approach-A output to CSV. This is the same path that the full
# pipeline call for approach A reported creating.
cf.export_df_to_csv(
    df_pred_a_lat_long,
    path_file='../Inputs/predictions_for_clustering_a.csv',
)

file created ../Inputs/predictions_for_clustering_a.csv


***

## B. Using RTA frequency as a prediction measure

**Idea**: For each hex bin, use the frequencies (sum of occurrences, not the magnitude) for each time window as a prediction value

### Overall function

In [16]:
# Run the whole prediction pipeline for approach B in a single call.
# Per the recorded cell output, this creates
# ../Inputs/predictions_for_clustering_b.csv.
df_predictions_b = cf.rta_prediction_pipeline(
    type_of_pred="b",
    frequency_cutoff=1,
)

file created ../Inputs/predictions_for_clustering_b.csv


### Separate function steps

Reduce RTA magnitude to 1 to predict based solely on frequency

In [17]:
# Build the approach-B frame from the merged hex bin / time window data.
# It reuses the outlier list generated in the approach-A section, so that
# cell must have been run first.
df_pred_b = cf.filter_df_for_pred_b(df_merged, list_freq_outliers)

Clean up data frame

In [18]:
# Clean up the approach-B frame; the exact steps live in cf.clean_pred_b.
df_pred_b_clean = cf.clean_pred_b(df_pred_b)

Create latitude and longitude

In [19]:
# Create latitude / longitude columns for the cleaned approach-B frame
# (the preview of the result shows datetime, latitude and longitude).
df_pred_b_lat_long = cf.convert_h3_to_lat_lon(df_pred_b_clean)

### Output

In [20]:
# Preview the first rows of the approach-B output.
df_pred_b_lat_long.head()

Unnamed: 0,datetime,latitude,longitude
0,2018-01-01 00:01:00,-1.190764,36.911613
1,2018-01-01 00:01:00,-0.691592,37.199105
2,2018-01-01 03:01:00,-1.265393,36.817341
3,2018-01-01 03:01:00,-1.153477,36.958711
4,2018-01-01 03:01:00,-1.190764,36.911613


In [21]:
# (rows, columns) of the approach-B output.
df_pred_b_lat_long.shape

(4227, 3)

Export to csv

In [22]:
# Write the approach-B output to CSV. This is the same path that the full
# pipeline call for approach B reported creating.
cf.export_df_to_csv(
    df_pred_b_lat_long,
    path_file='../Inputs/predictions_for_clustering_b.csv',
)

file created ../Inputs/predictions_for_clustering_b.csv


***

## C. Using weather data to predict RTA occurrence (yes/no?) per time window and hex_bin class

**Idea**: Add weather data (daily data) to approach B and fit a regression model on this weather data for all hex bins.

### Overall function

In [23]:
# Run the whole prediction pipeline for approach C in a single call.
# Per the recorded cell output, this creates
# ../Inputs/predictions_for_clustering_c.csv.
df_predictions_c = cf.rta_prediction_pipeline(
    type_of_pred="c",
    frequency_cutoff=1,
)

file created ../Inputs/predictions_for_clustering_c.csv


### Generate predictions

Import weather data

In [24]:
# Read the weather CSV; the 'Date' column is parsed into datetimes on load.
df_weather = pd.read_csv(
    '../Inputs/Weather_Nairobi_Daily_GFS.csv',
    parse_dates=['Date'],
)

In [25]:
# Predict RTAs from the raw crash data and the weather data.
# NOTE(review): presumably one predicted value per day - confirm in
# cf.predict_accidents_on_weather.
predicted_rta = cf.predict_accidents_on_weather(df_raw, df_weather)

# Round every prediction to zero decimals and store it as a plain int.
predicted_rta_round = [
    int(round(prediction, 0))
    for prediction in predicted_rta
]

### Separate function steps

Create sample basis for each weekday

In [26]:
# Sampling basis per weekday, built from the merged hex bin / time window
# frame and the frequency-outlier list generated in the approach-A section.
df_samples = cf.create_samples(df_merged, list_freq_outliers)

Generate the overall prediction from the RTAs per day (as predicted from the daily weather) and the sampling base

In [27]:
# Combine the sampling basis with the rounded RTA predictions to generate
# the approach-C predictions.
df_pred_c = cf.generate_predictions(df_samples, predicted_rta_round)

Clean up data frame

In [28]:
# Clean up the approach-C frame; the exact steps live in
# cf.reduce_to_time_windows.
df_pred_c_clean = cf.reduce_to_time_windows(df_pred_c)

Create latitude and longitude

In [29]:
# Create latitude / longitude columns for the cleaned approach-C frame
# (the preview of the result shows datetime, latitude and longitude).
df_pred_c_lat_long = cf.convert_h3_to_lat_lon(df_pred_c_clean)

### Output

In [30]:
# Preview the first rows of the approach-C output.
df_pred_c_lat_long.head()

Unnamed: 0,datetime,latitude,longitude
0,2019-07-01 06:01:00,-1.265393,36.817341
1,2019-07-01 09:01:00,-1.265393,36.817341
2,2019-07-01 18:01:00,-1.265393,36.817341
3,2019-07-01 12:01:00,-1.265393,36.817341
4,2019-07-01 06:01:00,-1.228069,36.86449


In [31]:
# (rows, columns) of the approach-C output.
df_pred_c_lat_long.shape

(2180, 3)

Export to csv

In [33]:
# Write the approach-C output to CSV. This is the same path that the full
# pipeline call for approach C reported creating.
cf.export_df_to_csv(
    df_pred_c_lat_long,
    path_file='../Inputs/predictions_for_clustering_c.csv',
)

file created ../Inputs/predictions_for_clustering_c.csv


***