# Bemobile

In [None]:
import pandas as pd
import numpy as np

In [None]:
import itertools
import logging
import math

In [None]:
# Install the default handler on the root logger, then lower the root
# threshold to DEBUG so the progress messages emitted during data
# preparation are shown.
logging.basicConfig()
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)

## Reading in data

In [None]:
# Load the sample dataframe from a local pickle file.
# This is a subset of the complete dataset: the original contains 880k rows,
# which is too much for a normal CPU machine to process.
# More data can be provided if your model requires it and your machine can handle it.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
df = pd.read_pickle('../data/dataframe.pkl')

In [None]:
# Notebook display only: (number of rows, number of columns) of the loaded dataframe.
df.shape

In [None]:
def sincostime(timestamp):
    """Encode the time of day as a point on the unit circle.

    The minute of the day (hour * 60 + minute; seconds are ignored) is mapped
    onto one full turn per 24 hours, so 23:59 and 00:00 end up next to each
    other instead of at opposite ends of a linear scale.

    Parameters
    ----------
    timestamp : object with ``hour`` and ``minute`` attributes
        For example a ``pandas.Timestamp`` or ``datetime.datetime``.

    Returns
    -------
    list
        ``[cos(angle), sin(angle)]`` for the time-of-day angle.
    """
    minutes_since_midnight = timestamp.hour * 60 + timestamp.minute
    # Use the full-precision constant: a truncated literal such as 3.141592
    # leaves a visible error (about 6.5e-7 at noon).
    angle = 2 * np.pi * minutes_since_midnight / (24 * 60)
    return [np.cos(angle), np.sin(angle)]

In [None]:
import cv2
def jam_resample_travel_times(travel_times, segments_size=60, minutes_size=90):
    """Resample a 2-D travel-time/speed grid to a fixed size.

    Parameters
    ----------
    travel_times : array
        Grid to resize; passed straight to ``cv2.resize``.
        NOTE(review): assumed to be a 2-D float array with segments on the
        rows and minutes on the columns - confirm against the dataset.
    segments_size : int
        Number of rows in the result.
    minutes_size : int
        Number of columns in the result.

    Returns
    -------
    array
        The resized grid, with ``segments_size`` rows and ``minutes_size``
        columns (``cv2.resize`` default interpolation).
    """
    # OpenCV expects dsize as (width, height), i.e. (columns, rows).
    target_size = (minutes_size, segments_size)
    return cv2.resize(travel_times, dsize=target_size)

In [None]:
def prepare_data(df, train=True):
    """Build the feature matrix X and target matrix y for one side of the split.

    The split is positional, not shuffled: the first 80% of the rows of *df*
    form the training set and the remaining 20% the test set.

    For every row the actual and optimal travel speeds are resampled with
    ``jam_resample_travel_times`` (default grid: 60 segments x 90 minutes).
    The feature vector is laid out as::

        [dayOfWeek, cos(time), sin(time), weekend, *junctions,
         actual speeds, first 30 minute columns, flattened row-major,
         optimal speeds, first minute column only (one value per segment),
         roadClass, sum(lengths) / 100000]

    and the target vector is the remaining minute columns (30 onwards) of the
    resampled actual speeds, flattened row-major.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``dayOfWeek``, ``timestamp``, ``weekend``,
        ``junctions``, ``actualTravelSpeeds``, ``optimalTravelSpeeds``,
        ``roadClass`` and ``lengths``.
    train : bool
        ``True`` selects the first 80% of the rows, ``False`` the last 20%.

    Returns
    -------
    tuple
        ``(X, y)`` as NumPy arrays with one row per sample.
    """
    split_at = round(df.shape[0] * 0.8)
    subset = df.iloc[:split_at] if train else df.iloc[split_at:]

    X = []
    y = []
    for _, sample in subset.iterrows():
        # Resample each grid exactly once; the actual-speed grid feeds both
        # the features (first 30 minutes) and the targets (the rest).
        actual = jam_resample_travel_times(
            np.asarray(sample.actualTravelSpeeds).astype('float64'))
        optimal = jam_resample_travel_times(
            np.asarray(sample.optimalTravelSpeeds).astype('float64'))

        X.append(np.asarray([
            sample.dayOfWeek, *sincostime(sample.timestamp), sample.weekend,
            *sample.junctions,
            *actual[:, :30].flatten(),
            *optimal[:, 0].flatten(),
            sample.roadClass, sum(sample.lengths) / 100000,
        ]))
        y.append(actual[:, 30:].flatten())

        if len(y) % 500 == 0:
            logging.debug("Already processed %d samples!", len(y))

    logging.debug("Converting to Numpy arrays!")
    X = np.asarray(X)
    y = np.asarray(y)

    return (X, y)

In [None]:
# Features and targets for the training split (first 80% of the rows of df).
X_train, y_train = prepare_data(df, train = True)

In [None]:
# Features and targets for the test split (last 20% of the rows of df).
X_test, y_test = prepare_data(df, train = False)

In [None]:
# The model that we will train takes two kinds of input, so every feature row
# is split into a convolutional part and a part holding the rest of the data.
#
# Column layout of a feature row as built by prepare_data (the hard-coded
# offsets assume 3 junction values per sample):
#   [0:7]       dayOfWeek, cos/sin of time, weekend, junctions
#   [7:1807]    actual speeds, 60 segments x 30 minutes, flattened row-major
#   [1807:1867] optimal speeds, one value per segment (60 values)
#   [1867:]     roadClass, scaled total length
def _split_conv_and_val(X):
    """Split a feature matrix into the (conv, val) inputs of the model.

    Returns a tuple ``(conv, val)`` where ``conv`` has shape (n, 60, 31, 1):
    per segment, 30 minutes of actual speeds followed by 1 optimal speed, and
    ``val`` holds the remaining columns.
    """
    actual = X[:, 7:1807].reshape(-1, 60, 30)
    optimal = X[:, 1807:1867].reshape(-1, 60, 1)
    # The two parts must be reshaped separately and joined along the minute
    # axis. Reshaping the flat 1860-wide slice straight to (60, 31) would wrap
    # every 31 values and mix neighbouring segments into one row.
    conv = np.concatenate((actual, optimal), axis=2)[..., np.newaxis]
    val = np.concatenate((X[:, :7], X[:, 1867:]), axis=1)
    return conv, val


X_train_conv, X_train_val = _split_conv_and_val(X_train)
X_test_conv, X_test_val = _split_conv_and_val(X_test)

In [None]:
# Hold-out part of the dataframe (the last 20% of the rows): use it to test
# your model and to work with the test results.
test_start = round(df.shape[0] * 0.8)
test_positions = range(test_start, df.shape[0])
df_test = df.iloc[test_positions]

In [None]:
import os

# Create the output directory if it is missing, so the notebook can run
# top to bottom on a fresh checkout; an existing directory is left untouched.
os.makedirs('../data/train-test', exist_ok=True)
df_test.to_pickle('../data/train-test/test-dataframe.pkl')

In [None]:
# Persist the target arrays of both splits. The unsplit feature matrices are
# not written here; their save calls are left disabled:
# np.save('../data/train-test/X_train.npy', X_train)
# np.save('../data/train-test/X_test.npy', X_test)
for target_file, targets in (
    ('../data/train-test/y_train.npy', y_train),
    ('../data/train-test/y_test.npy', y_test),
):
    np.save(target_file, targets)

In [None]:
# Persist the model inputs of both splits: the convolutional tensors and the
# remaining feature columns. The files are written in the order listed.
feature_files = {
    '../data/train-test/X_train_conv.npy': X_train_conv,
    '../data/train-test/X_train_val.npy': X_train_val,
    '../data/train-test/X_test_conv.npy': X_test_conv,
    '../data/train-test/X_test_val.npy': X_test_val,
}
for feature_file, features in feature_files.items():
    np.save(feature_file, features)

## Upload this dataset to the cloud

## Make sure this file is executable in one run