In [1]:
import math
import pandas as pd 
import geopandas as gpd
import numpy as np

import matplotlib.pyplot as plt

import h3 # h3 bins from uber

In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [3]:
import sys
# Extend the module search path with the sibling Scripts directory before the
# import below (presumably where capstone_functions.py lives — TODO confirm).
# The path is relative, so it depends on the notebook's working directory.
sys.path.append('../Scripts')
import capstone_functions as cf

# Prediction models

## A. Identifying frequency outliers

**Idea:** Take the historical data, but remove "frequency outliers", i.e. those that occurred only once in the whole data set.

### Overall function

In [None]:
# One-call run of approach "a" with frequency_cutoff=1. The cells under
# "Separate function steps" presumably walk through the same stages one by one
# — TODO confirm against capstone_functions.rta_prediction_pipeline.
df_predictions_a = cf.rta_prediction_pipeline(type_of_pred="a", frequency_cutoff=1)

### Separate function steps

Load data and create temporal features and hex bins.

In [4]:
# Keep the untouched crash records in df_raw; df is the working frame after
# temporal features and hex bins have been derived from it (in that order).
df_raw = cf.create_crash_df()
df = cf.assign_hex_bin(cf.create_temporal_features(df_raw))

Create empty data frame for all hex bin / time window combinations

In [5]:
# Template frame derived from df; going by the helper's name it is expected to
# hold every hex bin / time window combination — verify in capstone_functions.
df_pred_template = cf.create_pred_template(df)

Create a data frame of all RTAs with their respective hex bin / time window combinations

In [6]:
# RTA data from df keyed by time window and hex bin (as the helper name
# suggests — TODO confirm the exact aggregation in capstone_functions).
df_tw_hex = cf.rta_per_time_window_hex_bin(df)

Merge the RTAs onto the empty data frame

In [7]:
# Combine the template with the per-window RTA data. The preview in the next
# cell shows the resulting columns: datetime, hex_bins, time_window_key, RTA.
df_merged = cf.fill_overall_df(df_pred_template, df_tw_hex)

In [8]:
# Sanity check: first rows of the merged frame.
df_merged.head()

Unnamed: 0,datetime,hex_bins,time_window_key,RTA
0,2018-01-01 00:00:00,867a6e417ffffff,2018-1-1-0,1.0
1,2018-01-01 03:00:00,867a6e417ffffff,2018-1-1-1,1.0
2,2018-01-01 06:00:00,867a6e417ffffff,2018-1-1-2,0.0
3,2018-01-01 09:00:00,867a6e417ffffff,2018-1-1-3,0.0
4,2018-01-01 12:00:00,867a6e417ffffff,2018-1-1-4,0.0


Generate list of frequency outliers

In [9]:
# frequency_cutoff=1 presumably encodes the "occurred only once" rule stated for
# approach A — TODO confirm. The list is reused by the filter/sampling steps of
# approaches A, B and C further down.
list_freq_outliers = cf.generate_outlier_list(df_merged, frequency_cutoff=1)

Filter overall data frame to exclude frequency outliers

In [10]:
# Note: this filters df (the frame with temporal features and hex bins), not
# df_merged. The preview below shows columns datetime and h3_zone_6 with
# second-level timestamps.
df_pred_a = cf.filter_df_for_pred_a(df, list_freq_outliers)

### Output

Overview of output file

In [11]:
# Preview of the approach-A output frame.
df_pred_a.head()

Unnamed: 0,datetime,h3_zone_6
0,2018-01-01 00:25:46,867a6e417ffffff
1,2018-01-01 02:02:39,867a45107ffffff
2,2018-01-01 02:31:49,867a45107ffffff
3,2018-01-01 03:04:01,867a6e42fffffff
4,2018-01-01 03:58:49,867a6e417ffffff


In [None]:
# (rows, columns) of the approach-A output frame.
df_pred_a.shape

Export to csv

In [None]:
# Write the approach-A frame (df_pred_a, built and previewed above) to the
# clustering input CSV.
cf.export_df_to_csv(df_pred_a, path_file='../Inputs/predictions_for_clustering_a.csv')

***

## B. Using RTA frequency as a prediction measure

**Idea**: For each hex bin, use the frequencies (sum of occurrences, not the magnitude) for each time window as a prediction value

### Overall function

In [None]:
# One-call run of approach "b" with frequency_cutoff=1. The cells under
# "Separate function steps" presumably walk through the same stages one by one
# — TODO confirm against capstone_functions.rta_prediction_pipeline.
df_predictions_b = cf.rta_prediction_pipeline(type_of_pred="b", frequency_cutoff=1)

### Separate function steps

Reduce RTA magnitude to 1 to predict based solely on frequency

In [12]:
# Unlike approach A (which filters df), this step works on df_merged, reusing
# the outlier list generated in section A.
df_pred_b = cf.filter_df_for_pred_b(df_merged, list_freq_outliers)

Clean up data frame

In [13]:
# Post-process the approach-B frame; the preview below shows that the cleaned
# result has the columns datetime and hex_bins.
df_pred_b_clean = cf.clean_pred_b(df_pred_b)

### Output

In [14]:
# Preview of the approach-B output frame.
df_pred_b_clean.head()

Unnamed: 0,datetime,hex_bins
0,2018-01-01 00:01:00,867a6e417ffffff
1,2018-01-01 00:01:00,867a45107ffffff
2,2018-01-01 03:01:00,867a6e42fffffff
3,2018-01-01 03:01:00,867a6e4a7ffffff
4,2018-01-01 03:01:00,867a6e417ffffff


In [15]:
# (rows, columns) of the approach-B output frame.
df_pred_b_clean.shape

(4227, 2)

Export to csv

In [None]:
# Write the approach-B frame (df_pred_b_clean, built and previewed above) to
# the clustering input CSV.
cf.export_df_to_csv(df_pred_b_clean, path_file='../Inputs/predictions_for_clustering_b.csv')

***

## C. Using weather data to predict RTA occurrence (yes/no?) per time window and hex_bin class

**Idea**: Add weather data (one record per day) to approach B and fit a regression model on this weather data for all hex bins.

### Overall function

In [16]:
# One-call run of approach "c" with frequency_cutoff=1. In the saved output this
# call reports "file created ../Inputs/predictions_for_clustering_c.csv", so the
# pipeline writes the CSV itself in addition to returning the frame.
df_predictions_c = cf.rta_prediction_pipeline(type_of_pred="c", frequency_cutoff=1)

file created ../Inputs/predictions_for_clustering_c.csv


### Generate predictions

Import weather data

In [None]:
# Weather table for Nairobi; the 'Date' column is parsed to datetime on load.
# NOTE(review): in the cells shown, df_weather is only referenced by the
# commented-out predict_accidents_on_weather call below.
df_weather = pd.read_csv('../Inputs/Weather_Nairobi_Daily_GFS.csv', parse_dates=['Date'])

In [None]:
# Hard-coded placeholder values for the proof of concept (16 entries);
# presumably predicted RTAs per day — TODO confirm the unit.
predicted_rta = [
    12, 23, 11, 15,
    9, 18, 19, 22,
    9, 7, 15, 16,
    17, 8, 9, 19,
]

In [None]:
# NOTE(review): disabled call — presumably the weather-based replacement for the
# hard-coded predicted_rta sample. Re-enable it or delete this cell — TODO confirm.
#predicted_rta_new = cf.predict_accidents_on_weather(df_raw, df_weather)

### Separate function steps

Create sample basis for each weekday

In [None]:
# Sampling base built from df_merged and the outlier list from section A
# (per the note above: one sample basis per weekday — verify in capstone_functions).
df_samples = cf.create_samples(df_merged, list_freq_outliers)

Generate the overall prediction based on the RTAs per day, as predicted from daily weather, and the sampling base

In [None]:
# Combines the sampling base with predicted_rta, which at this point is the
# hard-coded proof-of-concept list rather than a model output.
df_pred_c = cf.generate_predictions(df_samples, predicted_rta)

Clean up data frame

In [None]:
# Post-process the approach-C predictions (helper name suggests a reduction to
# time windows — TODO confirm the resulting schema).
df_pred_c_clean = cf.reduce_to_time_windows(df_pred_c)

### Output

In [None]:
# Preview of the approach-C output frame.
df_pred_c_clean.head()

In [None]:
# (rows, columns) of the approach-C output frame.
df_pred_c_clean.shape

Export to csv

In [None]:
# Write the approach-C frame to the clustering input CSV. The helper lives in
# the capstone_functions module, so it must be called through the cf alias.
cf.export_df_to_csv(df_pred_c_clean, path_file='../Inputs/predictions_for_clustering_c.csv')

***