In [1]:
import math
import pandas as pd 
import geopandas as gpd
import numpy as np

import matplotlib.pyplot as plt

import h3 # h3 bins from uber

## Loading helper functions

These functions have been copied over from notebooks that already exist for this project. Ideally, these would be imported from a module.

In [2]:
#Loading the data
def create_crash_df(train_file = '../Inputs/Train.csv'):
    """Read the crash records CSV into a DataFrame.

    Parameters
    ----------
    train_file : str or file-like
        Source handed straight to ``pd.read_csv``. Its ``datetime`` column
        is parsed into timestamps while loading.

    Returns
    -------
    pandas.DataFrame
        The file contents with ``datetime`` parsed as dates.
    """
    return pd.read_csv(train_file, parse_dates=['datetime'])

#Creating temporal features like months, weekdays, etc.
def create_temporal_features(df):
    """Add calendar and time-window columns derived from ``df["datetime"]``.

    The columns are added to ``df`` itself (the input frame is modified) in
    this order: ``time_window`` (1-8, one per 3-hour slot), ``time_window_str``
    ("00-03" ... "21-24"), ``day``, ``month`` ("Jan" ... "Dec"), ``year`` and
    ``weekday`` (0 = Monday ... 6 = Sunday).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``datetime`` column of timestamps.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns attached.

    Raises
    ------
    ValueError
        If ``datetime`` contains missing values (they cannot be cast to the
        integer feature columns).
    """
    dict_windows = {1: "00-03", 2: "03-06", 3: "06-09", 4: "09-12", 5: "12-15", 6: "15-18", 7: "18-21", 8: "21-24"}
    dict_months = {1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr", 5: "May", 6: "Jun",
                   7: "Jul", 8: "Aug", 9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec"}

    # Use the vectorised .dt accessor instead of one Python-level call per row;
    # to_datetime is a no-op for datetime64 columns and also accepts
    # object columns that hold Timestamp values.
    stamps = pd.to_datetime(df["datetime"])

    # Integer features are cast to int64 so the dtypes do not depend on the
    # pandas version's accessor return type.
    df["time_window"] = (stamps.dt.hour // 3 + 1).astype("int64")
    df["time_window_str"] = df["time_window"].map(dict_windows)
    df["day"] = stamps.dt.day.astype("int64")
    df["month"] = stamps.dt.month.map(dict_months)
    df["year"] = stamps.dt.year.astype("int64")
    df["weekday"] = stamps.dt.weekday.astype("int64")
    return df

#Exporting the dataframe back to csv
def export_df_to_csv(df,path_file='../Inputs/train_h3.csv'):
    """Write ``df`` to ``path_file`` as CSV (index omitted) and report the path on stdout."""
    df.to_csv(path_file, index=False)
    confirmation = f'file created {path_file}'
    print(confirmation)

#Joins provided data sets for road segments into one file, indicating latitude and longitude for each segment. This enables placement in h3 bins.    
def join_segment_files(path='../Inputs/', road_surveys='Segment_info.csv',segments_geometry='segments_geometry.geojson'):
    '''
        Build one table of road segments with survey data, centroid coordinates and hex bin.

        The survey CSV and the segment geometry file are read from ``path``
        (the file names are appended to ``path`` as-is, so ``path`` needs its
        trailing separator). Surveys are left-joined onto the geometries via
        ``segment_id``; the centroid of each geometry is stored as
        ``longitude`` / ``latitude`` and the result is passed through
        ``assign_hexbin``.

        Returns the combined dataframe.
    '''
    survey_table = pd.read_csv(path + road_surveys)
    segment_shapes = gpd.read_file(path + segments_geometry)

    # Keep every geometry row, even when no survey exists for the segment.
    joined = pd.merge(segment_shapes, survey_table, how='left', on='segment_id')

    joined["longitude"] = joined.geometry.centroid.x
    joined["latitude"] = joined.geometry.centroid.y
    return assign_hexbin(joined)

#Defines time clusters, see docstring for more info
# Time-window labels for Monday-Friday (weekday 0-4); identical for every
# clustering strategy.
_TW_WORKDAY_LABELS = {
    "06-09": 'peak',
    "09-12": 'middle', "12-15": 'middle', "15-18": 'middle', "18-21": 'middle',
    "00-03": 'off_peak', "03-06": 'off_peak', "21-24": 'off_peak',
}

# Saturday (weekday 5) labels used by 'mean_shift_modified'.
_TW_SATURDAY_MEAN_SHIFT = {
    "09-12": 'peak',
    "06-09": 'middle', "12-15": 'middle', "15-18": 'middle', "18-21": 'middle',
    "00-03": 'off_peak', "03-06": 'off_peak', "21-24": 'off_peak',
}

# Saturday (weekday 5) labels used by 'saturday_2' and 'holiday_7'.
_TW_SATURDAY_BUSY = {
    "06-09": 'saturday_busy', "09-12": 'saturday_busy', "12-15": 'saturday_busy',
    "15-18": 'saturday_busy', "18-21": 'saturday_busy',
    "00-03": 'off_peak', "03-06": 'off_peak', "21-24": 'off_peak',
}

# strategy -> (label for weekday 7, label for weekday 6, Saturday table)
_TW_STRATEGY_RULES = {
    'mean_shift_modified': ('off_peak', 'off_peak', _TW_SATURDAY_MEAN_SHIFT),
    'saturday_2': ('off_peak', 'off_peak', _TW_SATURDAY_BUSY),
    'holiday_7': ('holiday', 'sunday', _TW_SATURDAY_BUSY),
}


def assign_TW_cluster(weekday, time_window, holiday=0, strategy='baseline'):
    '''
    Return the time-window cluster label for one weekday / time-window pair.

    Can be used in a lambda function, e.g.
    crash_df["cluster"] = crash_df.apply(lambda x: assign_TW_cluster(x.weekday, x.time_window_str), axis=1)
    This is called by the function: create_cluster_feature.

    Parameters:
        weekday: day code; 0-4 are treated as working days, 5 as Saturday,
            6 as Sunday. The value 7 is handled as a separate case
            (NOTE(review): datetime.weekday() never yields 7, so callers
            presumably encode public holidays as 7 - confirm).
        time_window: time window label such as "06-09" (see create_temporal_features).
        holiday: only used by the 'no_cluster' strategy, where it becomes
            part of the returned key; the other strategies ignore it.
        strategy: one of 'baseline', 'mean_shift_modified', 'saturday_2',
            'holiday_7' or 'no_cluster'.

    Returns:
        The cluster label as a string, or None when the strategy, the
        weekday or the time window is not covered by the rules.
    '''
    if strategy == 'baseline':
        return 'baseline'

    if strategy == 'no_cluster':
        # Every weekday / window / holiday combination is its own cluster.
        return (str(weekday)+str(time_window)+str(holiday))

    rules = _TW_STRATEGY_RULES.get(strategy)
    if rules is None:
        return None
    day_seven_label, sunday_label, saturday_labels = rules

    if weekday == 7:
        return day_seven_label
    if weekday == 6:
        return sunday_label
    if weekday in [0,1,2,3,4]:
        return _TW_WORKDAY_LABELS.get(time_window)
    if weekday == 5:
        return saturday_labels.get(time_window)
    return None

#Creates time cluster feature in existing data frame
def create_cluster_feature(crash_df, strategy='baseline', verbose=0):
    '''
    Add a "cluster" column with the time-window cluster label of every row.

    The label is computed row by row with assign_TW_cluster from the
    "weekday" and "time_window_str" columns. The column is added to
    crash_df itself, and the number of distinct clusters is printed.

    Parameters:
        crash_df: frame with "weekday" and "time_window_str" columns.
        strategy: clustering strategy forwarded to assign_TW_cluster.
        verbose: if greater than 1, the clusters are additionally drawn as
            a seaborn strip plot (weekday vs. time window, coloured by cluster).

    Returns:
        The same crash_df object, now including the "cluster" column.
    '''
    crash_df["cluster"] = crash_df.apply(
        lambda row: assign_TW_cluster(weekday=row.weekday,
                                      time_window=row.time_window_str,
                                      strategy=strategy),
        axis=1)

    print(f'{crash_df.cluster.nunique()} clusters created')
    if verbose > 1:
        # seaborn is only needed for this optional plot and is not imported at
        # the top of the notebook; import it here so the branch no longer
        # fails with a NameError on `sns`.
        import seaborn as sns

        tb_clusters = sns.FacetGrid(crash_df,hue='cluster', height=5)
        tb_clusters.map(sns.stripplot,'weekday', 'time_window_str', s=20,
                                       order = ['00-03', '03-06', '06-09', '09-12',
                                                '12-15', '15-18', '18-21', '21-24'],
                                    label = 'Time Window Clusters')
    return crash_df

# Prediction models

## A. Identifying frequency outliers

Take the historical data but exclude "frequency outliers", i.e. hex bins in which RTAs were recorded in only one time window across the whole data set -> [provides a function to generate the list of frequency-outlier hex bins].

**New helper function**

In [3]:
def assign_hexbin(df,lat_column="latitude",lon_column="longitude", hexbin_resolution=6):
    '''Attach the h3 hex bin of every row, looked up from its latitude and longitude.

    The bin ids are stored in a new column named "h3_zone_<hexbin_resolution>"
    on df itself; the same frame is returned.
    '''
    def _cell_for_row(row):
        return h3.geo_to_h3(row[lat_column], row[lon_column], hexbin_resolution)

    zone_column = f"h3_zone_{hexbin_resolution}"
    df[zone_column] = df.apply(_cell_for_row, axis=1)
    return df

In [4]:
# Load the crash records, derive the temporal columns and attach the hex bin of each record.
# Note: create_temporal_features and assign_hexbin add their columns to the frame they are
# given and return that same frame, so `df_raw` and `df` refer to one enriched DataFrame.
df_raw = create_crash_df()
df = create_temporal_features(df_raw)
df = assign_hexbin(df)

In [5]:
df_cluster = create_cluster_feature(df, strategy='mean_shift_modified', verbose=0)

3 clusters created


**New helper function**

In [6]:
def rta_per_cluster_and_bins(df_cluster, hex_col=None):
    '''Count RTAs per hex bin and time cluster.

    Parameters:
        df_cluster: frame with one row per accident, containing "uid",
            "cluster" and a hex bin column.
        hex_col: name of the hex bin column. If omitted, the single column
            whose name starts with "h3_zone_" is used; when there is no such
            column (or more than one) the second-to-last column is used, as
            in the earlier positional behaviour.

    Returns:
        DataFrame with the columns [<hex bin column>, "cluster", "RTA"],
        where RTA is the number of non-null uid values per group.
    '''
    if hex_col is None:
        # Resolve the key by name: a purely positional lookup silently picks
        # the wrong column as soon as another column is appended to the frame.
        zone_columns = [col for col in df_cluster.columns if str(col).startswith("h3_zone_")]
        hex_col = zone_columns[0] if len(zone_columns) == 1 else df_cluster.columns[-2]

    return df_cluster.groupby([hex_col, "cluster"])["uid"].count().reset_index(name="RTA")

In [7]:
df_rta = rta_per_cluster_and_bins(df_cluster)

**New helper function**

In [8]:
def get_list_of_h3(df_hex, df_clusters):
    '''Return the distinct hex bins (first column of df_hex), repeated once per distinct time cluster.

    The bins come out of a set, so their order is arbitrary; the same block
    of bins is repeated for every cluster.
    '''
    first_column = df_hex.columns[0]
    distinct_bins = list(set(df_hex[first_column]))
    n_clusters = df_clusters["cluster"].nunique()
    return distinct_bins * n_clusters

In [9]:
hex_cluster_comb = get_list_of_h3(df_rta, df_cluster)

**New helper function**

In [10]:
def get_list_of_states(df_clusters, df_hex):
    '''Return every distinct time cluster repeated once per distinct hex bin.

    Clusters appear in order of first occurrence in df_clusters["cluster"];
    the hex bins are taken from the first column of df_hex.
    '''
    return [
        label
        for label in df_clusters["cluster"].unique()
        for _ in range(df_hex[df_hex.columns[0]].nunique())
    ]

In [11]:
states = get_list_of_states(df_cluster, df_rta)

**New helper function**

In [12]:
def create_empty_df(list_hex_bins, list_time_clusters, hex_column=None):
    '''Create the hex bin x time cluster scaffold as a two-column data frame.

    Parameters:
        list_hex_bins: hex bin ids, one entry per row of the result.
        list_time_clusters: cluster labels, aligned with list_hex_bins.
        hex_column: name for the hex bin column. If omitted, the name of the
            first column of the notebook-level frame `df_rta` is used, which
            is what this function always did implicitly; pass the name
            explicitly to avoid depending on that global.

    Returns:
        DataFrame with the columns [<hex_column>, "cluster"].
    '''
    if hex_column is None:
        # Backward-compatible fallback: relies on `df_rta` existing in the
        # notebook namespace at call time.
        hex_column = df_rta.columns[0]

    df_empty = pd.DataFrame(data=[list_hex_bins, list_time_clusters]).T
    df_empty.columns = [hex_column, "cluster"]
    return df_empty

In [13]:
df_empty = create_empty_df(hex_cluster_comb, states)

**New helper function**

In [14]:
def fill_df_hex_rta(df_empty, df_rta):
    '''Attach the accident counts to the hex bin x cluster scaffold.

    Both frames are outer-joined on the first two columns of df_empty,
    missing values are replaced by 0 and the result is sorted by those
    two key columns.
    '''
    key_columns = [df_empty.columns[0], df_empty.columns[1]]
    return (
        df_empty.merge(df_rta, how="outer", on=key_columns)
        .fillna(0)
        .sort_values(by=key_columns)
    )

In [15]:
df_filled = fill_df_hex_rta(df_empty, df_rta)

**New helper function**

In [16]:
def create_raw_pred_input_df(df_hex_bins, date_start='2018-01-01', date_end='2019-07-01', freq='3h'):
    '''Build the empty time window x hex bin template used as prediction input.

    For every distinct hex bin (first column of df_hex_bins) one row is
    created per time window between date_start and date_end. Rows are grouped
    by hex bin (in order of first occurrence) and sorted by time inside each
    group.

    With the default arguments there are 4369 windows per hex bin: 8 per day
    for the 546 days from 2018-01-01 through 2019-06-30, plus the window
    starting at 2019-07-01 00:00, because pd.date_range includes the end point.

    Parameters:
        df_hex_bins: frame whose first column holds the hex bin ids.
        date_start: first time window (anything pd.date_range accepts).
        date_end: last time window, inclusive.
        freq: spacing of the time windows.

    Returns:
        DataFrame with the columns "time_windows" and "hex_bins" and
        (number of distinct hex bins) * (number of windows) rows.
    '''
    dates = pd.date_range(date_start, date_end, freq=freq)
    unique_hexbins = df_hex_bins[df_hex_bins.columns[0]].unique()

    n_windows = len(dates)
    n_bins = len(unique_hexbins)

    # Each bin is repeated for all windows, and the full window sequence is
    # repeated for each bin - the same layout as appending per bin in a loop,
    # without building Python lists element by element.
    df_pred_template = pd.DataFrame({
        "time_windows": np.tile(dates.to_numpy(), n_bins),
        "hex_bins": pd.Index(unique_hexbins).repeat(n_windows),
    })

    return df_pred_template

In [17]:
df_raw_pred = create_raw_pred_input_df(df_filled)

In [18]:
def _time_window_key(ts):
    """Return the join key "<year>-<month>-<day>-<window>" of a timestamp, where window is the 0-based 3-hour slot."""
    parts = (ts.year, ts.month, ts.day, math.floor(ts.hour / 3))
    return "-".join(str(part) for part in parts)


# Give the template and the accident records the same key so they can be merged per time window.
df_raw_pred["time_window_key"] = df_raw_pred["time_windows"].apply(_time_window_key)
df_cluster["time_window_key"] = df_cluster["datetime"].apply(_time_window_key)

**New helper function**

In [19]:
def rta_per_time_window(df_cluster):
    '''Count RTAs per time window key and hex bin.

    Groups by "time_window_key" and "h3_zone_6", counts the non-null "uid"
    values and returns the columns ["time_window_key", "hex_bins", "RTA"].
    '''
    counts = df_cluster.groupby(["time_window_key", "h3_zone_6"])["uid"].count()
    df_tw = counts.reset_index()
    df_tw.columns = ["time_window_key", "hex_bins", "RTA"]
    return df_tw

In [20]:
df_tw = rta_per_time_window(df_cluster)

**New helper function**

In [21]:
def fill_overall_df(df_raw_pred, df_rta_per_tw):
    '''Attach the accident counts to the time window x hex bin template.

    The two frames are outer-joined on "time_window_key" and "hex_bins".
    Every missing value in the joined frame is replaced by 0, so
    combinations without accidents end up with RTA = 0.
    '''
    join_keys = ["time_window_key", "hex_bins"]
    return df_raw_pred.merge(df_rta_per_tw, how="outer", on=join_keys).fillna(0)

In [22]:
# Join the per-window accident counts onto the full time window x hex bin template.
df_raw_filled = fill_overall_df(df_raw_pred, df_tw)

# Rename the first column to "datetime", the column name create_temporal_features reads.
# NOTE(review): the rename is positional - it presumably relies on "time_windows" being
# column 0 of the merged frame; confirm if the template layout changes.
list_of_c = list(df_raw_filled.columns)
list_of_c[0] = "datetime"
df_raw_filled.columns = list_of_c

In [23]:
df_raw_filled = create_temporal_features(df_raw_filled)

In [24]:
df_final = create_cluster_feature(df_raw_filled, strategy='mean_shift_modified', verbose=0)

3 clusters created


In [25]:
# Summarise RTA per hex bin: mean, standard deviation and sum over all rows of the bin,
# plus the number of rows with a non-zero RTA value.
df_classes = df_final.groupby("hex_bins")
df_classes = df_classes.agg({'RTA': [np.mean, np.std, np.sum, np.count_nonzero]})
# Turn the group key back into a regular "hex_bins" column.
df_classes = df_classes.reset_index()

In [26]:
df_classes.columns = ["hex_bins", "RTA_mean", "RTA_std", "RTA_sum", "RTA_nonzero"]

In [27]:
df_freq_outliers = df_classes.loc[df_classes["RTA_nonzero"] == 1]

In [28]:
list_freq_outliers = df_freq_outliers["hex_bins"].values
#list(list_freq_outliers)

In [29]:
list_freq_outliers

array(['867a45067ffffff', '867a45077ffffff', '867a4511fffffff',
       '867a4512fffffff', '867a45147ffffff', '867a4515fffffff',
       '867a45177ffffff', '867a45817ffffff', '867a4584fffffff',
       '867a4585fffffff', '867a458dfffffff', '867a458f7ffffff',
       '867a45a8fffffff', '867a45b0fffffff', '867a45b17ffffff',
       '867a45b67ffffff', '867a45b77ffffff', '867a6141fffffff',
       '867a614d7ffffff', '867a616b7ffffff', '867a6304fffffff',
       '867a632a7ffffff', '867a63307ffffff', '867a6331fffffff',
       '867a6360fffffff', '867a63667ffffff', '867a6396fffffff',
       '867a656c7ffffff', '867a65797ffffff', '867a6e18fffffff',
       '867a6e1b7ffffff', '867a6e4c7ffffff', '867a6e517ffffff',
       '867a6e59fffffff', '867a6e5a7ffffff', '867a6e5b7ffffff',
       '867a6e657ffffff', '867a6e737ffffff', '867a6e797ffffff',
       '867a6e79fffffff', '867a6e7b7ffffff', '867a6ecf7ffffff',
       '867a6ed47ffffff', '867a6ed97ffffff', '867a6eda7ffffff'],
      dtype=object)

In [30]:
#Output the list of hex bins to exclude in a .csv file

***

## B. Using RTA frequency as a prediction measure

For each hex bin, use the frequencies (sum of occurrences, not the magnitude) for each time window as a prediction value -> [provide function to generate data frame of 56 (3 hour) time windows for each hex bin].

In [31]:
df_final.head()

Unnamed: 0,datetime,hex_bins,time_window_key,RTA,time_window,time_window_str,day,month,year,weekday,cluster
0,2018-01-01 00:00:00,867a44a6fffffff,2018-1-1-0,0.0,1,00-03,1,Jan,2018,0,off_peak
1,2018-01-01 03:00:00,867a44a6fffffff,2018-1-1-1,0.0,2,03-06,1,Jan,2018,0,off_peak
2,2018-01-01 06:00:00,867a44a6fffffff,2018-1-1-2,0.0,3,06-09,1,Jan,2018,0,peak
3,2018-01-01 09:00:00,867a44a6fffffff,2018-1-1-3,0.0,4,09-12,1,Jan,2018,0,middle
4,2018-01-01 12:00:00,867a44a6fffffff,2018-1-1-4,0.0,5,12-15,1,Jan,2018,0,middle


In [32]:
df_final.shape

(511173, 11)

### Helper function

In [33]:
def filter_hex_bins(df, list_hex_bins):
    """Keep only rows with at least one RTA whose hex bin is not on the exclusion list.

    Rows are dropped when their "hex_bins" value is contained in
    list_hex_bins or when their "RTA" value is not greater than zero.
    """
    bin_is_allowed = ~df["hex_bins"].isin(list_hex_bins)
    has_accident = df["RTA"] > 0
    return df.loc[bin_is_allowed & has_accident]

In [34]:
df_freq_filtered = filter_hex_bins(df_final, list_freq_outliers)

In [35]:
df_freq_filtered.shape

(4227, 11)

In [36]:
df_freq_filtered.head()

Unnamed: 0,datetime,hex_bins,time_window_key,RTA,time_window,time_window_str,day,month,year,weekday,cluster
3514,2019-03-16 06:00:00,867a44a6fffffff,2019-3-16-2,1.0,3,06-09,16,Mar,2019,5,middle
4196,2019-06-09 12:00:00,867a44a6fffffff,2019-6-9-4,1.0,5,12-15,9,Jun,2019,6,off_peak
5678,2018-06-13 15:00:00,867a44b5fffffff,2018-6-13-5,1.0,6,15-18,13,Jun,2018,2,middle
8291,2019-05-06 06:00:00,867a44b5fffffff,2019-5-6-2,1.0,3,06-09,6,May,2019,0,peak
17476,2018-01-01 00:00:00,867a45107ffffff,2018-1-1-0,2.0,1,00-03,1,Jan,2018,0,off_peak


In [37]:
df_freq_filtered.hex_bins.nunique()

72

### Helper function

In [38]:
def make_predictions(df):
    """Reduce the frame to its prediction columns and move every timestamp inside its time window.

    The helper columns (keys, counts, calendar features, cluster) are
    dropped, one minute is added to "datetime" so the timestamp lies inside
    the 3-hour window instead of on its boundary, and the rows are returned
    sorted by "datetime" with a fresh 0..n-1 index. The input frame is not
    modified.
    """
    helper_columns = ["time_window_key", "time_window", "RTA", "time_window_str", "cluster",
                      "day", "month", "year", "weekday"]
    slim = df.drop(columns=helper_columns).reset_index(drop=True)

    # Shift off the window boundary.
    slim["datetime"] = slim["datetime"] + pd.Timedelta(minutes=1)

    return slim.sort_values(by="datetime").reset_index(drop=True)

In [39]:
predictions_for_clustering_b = make_predictions(df_freq_filtered)

In [40]:
predictions_for_clustering_b.head()

Unnamed: 0,datetime,hex_bins
0,2018-01-01 00:01:00,867a6e417ffffff
1,2018-01-01 00:01:00,867a45107ffffff
2,2018-01-01 03:01:00,867a6e417ffffff
3,2018-01-01 03:01:00,867a6e42fffffff
4,2018-01-01 03:01:00,867a45107ffffff


In [41]:
predictions_for_clustering_b.hex_bins.nunique()

72

### Output to .csv

In [None]:
export_df_to_csv(predictions_for_clustering_b,path_file='../Inputs/predictions_for_clustering_b.csv')

***

## C. Using weather data to predict RTA occurrence (yes/no?) per time window and hex_bin class

Adds weather data (data per day) to B and fits a regression model on this weather data for all hex bins.

### Helper function

In [42]:
def create_freq_per_tw(df, list_freq_outliers):
    """Count, per hex bin, weekday and time window, how many rows recorded at least one RTA.

    Hex bins listed in list_freq_outliers are excluded first. The result has
    the columns "hex_bins", "weekday", "time_window" (holding the values of
    "time_window_str") and "RTA_freq" (number of rows with non-zero RTA).
    """
    # Drop the hex bins flagged as frequency outliers.
    is_outlier = df["hex_bins"].isin(list_freq_outliers)
    kept = df.loc[~is_outlier]

    # One group per hex bin / weekday / time window; count its non-zero RTA rows.
    grouping = ["hex_bins", "weekday", "time_window_str"]
    freq_table = kept.groupby(grouping).agg({'RTA': [np.count_nonzero]}).reset_index()

    # Flatten the two-level column header produced by the aggregation.
    freq_table.columns = ["hex_bins", "weekday", "time_window", "RTA_freq"]
    return freq_table

In [43]:
df_cleaned = create_freq_per_tw(df_final, list_freq_outliers)

In [44]:
df_cleaned.head()

Unnamed: 0,hex_bins,weekday,time_window,RTA_freq
0,867a44a6fffffff,0,00-03,0.0
1,867a44a6fffffff,0,03-06,0.0
2,867a44a6fffffff,0,06-09,0.0
3,867a44a6fffffff,0,09-12,0.0
4,867a44a6fffffff,0,12-15,0.0


In [45]:
df_cleaned.shape

(4032, 4)

**TODO: The per-day RTA predictions from Andreas still need to be integrated here; until then the sample list below serves as a placeholder.**

In [None]:
# Sample to showcase POC
# Placeholder for the predicted number of RTAs per day, one value per consecutive day.
# generate_predictions splits the list by weekday via predicted_rta[k::7], so the first
# entry is treated as a Monday; with only five values the Saturday and Sunday slices are empty.
# NOTE(review): this cell has no execution count, and the later call to generate_predictions
# recorded "NameError: name 'predicted_rta' is not defined" - run this cell before that call.
predicted_rta = [12, 23, 11, 15, 9 ,]

### Helper function

In [46]:
def generate_predictions(df, predicted_rta):
    """Turn predicted RTA counts per day into predicted hex bin / time window combinations.

    For every day, the predicted number of RTAs ``n`` is converted into the
    ``n`` hex bin / time window combinations with the highest historical
    frequency (``RTA_freq``) for that weekday.

    Parameters
    ----------
    df : pandas.DataFrame
        Frequency table with the columns "hex_bins", "weekday",
        "time_window" and "RTA_freq" (see create_freq_per_tw).
    predicted_rta : sequence of numbers
        Predicted RTA count per consecutive day. The first entry is treated
        as a Monday (weekday 0), matching an evaluation period that starts
        on Monday 2019-07-01. The sequence may have any length; it does not
        have to cover whole weeks. Values are truncated to integers and
        negative values are treated as 0.

    Returns
    -------
    pandas.DataFrame
        One row per predicted RTA with the columns "hex_bins",
        "time_window", "weekday" and "week" (0-based week number counted
        from the first day). Rows are ordered by weekday, then by week, then
        by descending frequency. If a day asks for more combinations than
        exist for its weekday, all existing combinations are returned for
        that day.
    """
    columns = ["hex_bins", "time_window", "weekday", "week"]

    # Each output column gets its OWN list. (Binding several names to a single
    # [] via chained assignment would make them all alias one shared list.)
    bins, windows, weekdays, weeks = [], [], [], []

    for weekday in range(7):
        # Most frequent combinations first; a stable sort keeps ties in their
        # original order so the result is reproducible.
        ranked = (
            df.loc[df["weekday"] == weekday]
            .sort_values(by="RTA_freq", ascending=False, kind="mergesort")
        )

        # Every 7th entry starting at `weekday` belongs to this weekday; the
        # position within that slice is the week number. Slices may differ in
        # length when the period does not consist of whole weeks.
        for week, n_rta in enumerate(predicted_rta[weekday::7]):
            top = ranked.iloc[:max(int(n_rta), 0)]
            bins.extend(top["hex_bins"])
            windows.extend(top["time_window"])
            # Label with the number of rows actually taken so all four
            # columns always stay aligned.
            weekdays.extend([weekday] * len(top))
            weeks.extend([week] * len(top))

    return pd.DataFrame(
        {"hex_bins": bins, "time_window": windows, "weekday": weekdays, "week": weeks},
        columns=columns,
    )

In [47]:
predictions_c = generate_predictions(df_cleaned, predicted_rta)

NameError: name 'predicted_rta' is not defined

In [None]:
predictions_c.head()

### Helper function

In [None]:
def reduce_to_time_windows(df, start="2019-07-01"):
    """Convert predicted RTAs (week / weekday / time window) into timestamps for clustering.

    Parameters
    ----------
    df : pandas.DataFrame
        Predictions with the columns "time_window" (e.g. "06-09"),
        "weekday" (0-6) and "week" (0-based), as produced by
        generate_predictions. The frame is not modified.
    start : str or datetime-like
        First day of the prediction period; week 0 / weekday 0 maps to this
        date.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without "time_window", "weekday" and "week" (and
        without a "help" column, if present), plus a "datetime" column. The
        timestamp is the start of the time window plus one minute, so it
        lies inside the window rather than on its boundary. Rows with an
        unknown time window label keep midnight of their day.
    """
    # Start hour of every time window. "21-24" is the label produced by
    # create_temporal_features; "21-00" is accepted as well because it is the
    # spelling this function originally matched.
    window_start_hour = {
        "00-03": 0, "03-06": 3, "06-09": 6, "09-12": 9,
        "12-15": 12, "15-18": 15, "18-21": 18, "21-24": 21, "21-00": 21,
    }

    first_day = pd.to_datetime(start)

    # Day offset from the start of the period.
    day_offset = df["week"] * 7 + df["weekday"]
    midnight = first_day + pd.to_timedelta(day_offset, unit="D")

    # Move into the time window: window start + 1 minute.
    start_hour = df["time_window"].map(window_start_hour)
    inside_window = midnight + pd.to_timedelta(start_hour, unit="h") + pd.Timedelta(minutes=1)

    # Unknown labels have no start hour; leave those rows at midnight.
    stamps = inside_window.where(start_hour.notna(), midnight)

    reduced = df.drop(columns=["time_window", "weekday", "week"])
    reduced = reduced.drop(columns=["help"], errors="ignore")
    return reduced.assign(datetime=stamps)

In [None]:
predictions_for_clustering_c = reduce_to_time_windows(predictions_c)

In [None]:
predictions_for_clustering_c.head()

### Output to .csv

In [None]:
export_df_to_csv(predictions_for_clustering_c,path_file='../Inputs/predictions_for_clustering_c.csv')

***

## D. Using weather data and traffic speed data to predict RTA occurrence (yes/no?) per time window and hex_bin

Extend C to also include data that is specific per time window and hex bin. This allows the regression model to output a different value for each time window and hex bin (not only hex bin class).

****