In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

In [72]:
def clean_dataset_reservations(df, y_coords, x_coords):
    df['startRentalCallSuccessfulTime']=pd.to_datetime(df['startRentalCallSuccessfulTime'])

    # Assign points
    df['col_start'] = np.searchsorted(y_coords, df['startLat'])
    df['row_start'] = np.searchsorted(x_coords, df['startLon'])
    df['col_end'] = np.searchsorted(y_coords, df['endLat'])
    df['row_end'] = np.searchsorted(x_coords, df['endLon'])

    # Sort data for algorithm
    df.sort_values(by=['startRentalCallSuccessfulTime'], inplace=True)
    # Drop NAN
    df = df[df['startRentalCallSuccessfulTime'].notna()]
    return df

In [73]:
smallgrid_x = np.array([12.51763999 , 12.59104667])
smallgrid_y = np.array([55.666106999 , 55.71602183])

In [74]:
# Read data
path_local_file= '2020FullDataset.csv'
df_trip = pd.read_csv(path_local_file)
df_trip = df_trip[
    (df_trip.startLat > smallgrid_y[0]) &
    (df_trip.startLat < smallgrid_y[1]) &
    (df_trip.startLon > smallgrid_x[0]) &
    (df_trip.startLon < smallgrid_x[1]) & (df_trip.trip==1)
]


In [75]:
# Decide on grid size
VERTICAL_LINES=6
HORIZONTAL_LINES=6
x_coords = np.linspace(df_trip.startLon.min(),df_trip.startLon.max(),VERTICAL_LINES)
y_coords = np.linspace(df_trip.startLat.min(),df_trip.startLat.max(),HORIZONTAL_LINES)

# Filter data
df_trip = clean_dataset_reservations(df_trip,y_coords=y_coords,x_coords=x_coords)
#Error fixing
df_trip[df_trip['col_start']==0]=1
df_trip[df_trip['row_start']==0]=1

In [88]:
df_trip=df_trip[df_trip.startRentalCallSuccessfulTime.apply(lambda x: type(x)!=int)]

# Convert to zero index


In [94]:
#df_trip.col_start=df_trip.col_start-1
#df_trip.row_start=df_trip.row_start-1

In [96]:
delta_min =lambda x: int((max(x.startRentalCallSuccessfulTime)-min(x.startRentalCallSuccessfulTime)).seconds/60)
# 80,15,5 partitioning of data intor train test and validation
p_tr=0.8
p_te=0.15
p_va=1-p_tr-p_te

df_tr=df_trip.iloc[0:int(p_tr*len(df_trip))]
df_te=df_trip.iloc[int(p_tr*len(df_trip)):int((1-p_va)*len(df_trip))]
df_va=df_trip.iloc[int((1-p_va)*len(df_trip)):len(df_trip)]

WRITE_TO_FILE=True
prefix="greenmobility_demand_prediction_CPH_SUB"

if WRITE_TO_FILE:
    df_tr.to_csv(""+prefix+"_train.csv")
    df_te.to_csv(""+prefix+"_test.csv")
    df_va.to_csv(""+prefix+"_validation.csv")



print(f"Elements in train partition: {len(df_tr)} - Having {delta_min(df_tr)} minutes of data\n\
Elements in test partition: {len(df_te)} - Having {delta_min(df_te)} minutes of data\n\
Elements in validation partition: {len(df_va)} - Having {delta_min(df_va)} minutes of data\n")

Elements in train partition: 139112 - Having 641 minutes of data
Elements in test partition: 26083 - Having 662 minutes of data
Elements in validation partition: 8695 - Having 280 minutes of data



In [90]:
df_trip

Unnamed: 0,date,customerIDHash,reservationIDHash,startLat,startLon,startRentalCallSuccessful,startRentalCallSuccessfulTime,startStreet,startCity,endLat,...,endLatUser,endLonUser,trip,distanceKM,tripDuration,preReservationDuration,col_start,row_start,col_end,row_end
91905,2020-01-02,0x7F4F2DEF5D97B7A790C5A281482AEDEEC10B57C8,0x1ADEF86910CC73672EECC9B9E4964D05D4887951,55.669880,12.555843,1.0,2020-01-01 23:51:52,2020-01-01 23:34:42.000,København,55.667446,...,55.6674,12.5516,1,1.5,12.0,0.0,1,3,1,3
91906,2020-01-02,0xFEEFAEAD598B8C75B947584BBAFD1E9C4FBE5D39,0xCA0B149A8C0768E810D54D3CF443223F83238BEA,55.681793,12.576095,1.0,2020-01-01 23:53:25,2020-01-01 23:40:12.000,København,55.654346,...,55.6543,12.5403,1,5.3,11.0,0.0,2,4,0,2
95014,2020-01-02,0x7F4F2DEF5D97B7A790C5A281482AEDEEC10B57C8,0xE55DEDC1368E890BDF6737263967AA95CE872EBD,55.666882,12.551415,1.0,2020-01-02 00:04:52,2020-01-02 00:04:51.000,København,55.764675,...,55.7646,12.5863,1,13.2,26.0,0.0,1,3,6,5
95015,2020-01-02,0xBCD6D0B860CD790DE05550735E5CA0B72ECA303E,0xDB8B49EE419A80540862862542934AD5F3379F35,55.706360,12.589775,1.0,2020-01-02 00:15:05,2020-01-02 00:12:37.000,København,55.673000,...,55.6729,12.5518,1,5.4,22.0,0.0,5,5,1,3
101258,2020-01-02,0xF056EE056A31BACFC03D6D3CFA8284970A175C01,0xC433A03D6A0411B9CF0C25A56C11A9CE1E9985A5,55.683422,12.584660,1.0,2020-01-02 00:25:08,2020-01-02 00:05:25.000,København,55.741295,...,55.7413,12.5254,1,8.4,15.0,0.0,2,5,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,2020-11-23,0x6594A14B35B99F8B6E138D371D8164851DADC219,0x1A4BC15925AF2E1ED5E412A394C5CE79F11328D6,55.709984,12.575068,1.0,2020-11-22 23:49:51,2020-11-22 23:49:48.000,Copenhagen,55.785778,...,55.7860,12.5238,1,10.7,17.0,0.0,5,4,6,1
91901,2020-11-23,0xF3985C984B72B14E2D7A700C66686377B6DD9639,0x3BF5C3E66074C598BF6B72499176AADC749676CB,55.705929,12.551225,1.0,2020-11-23 00:10:48,2020-11-23 00:10:45.000,Copenhagen,55.706707,...,55.7074,12.5464,1,12.8,90.0,0.0,4,3,5,2
362,2020-11-23,0xE425A4A37C0FAC4EA2A16D47D06B7F5C285DF6E4,0x06AB16DF4329748BC1458269FCF93FD678C98B9C,55.706913,12.588692,1.0,2020-11-23 00:11:09,2020-11-22 23:55:31.000,København,55.693214,...,55.7069,12.5885,1,1.8,5.0,0.0,5,5,3,5
91516,2020-11-23,0x6C58422854C380CC3A28BDE55B29CC901F4FDA3C,0xFEBC94128103BDFFA3C89464BA3BB2DDBD6E529F,55.701382,12.589615,1.0,2020-11-23 00:48:04,2020-11-23 00:41:13.000,Copenhagen,55.647823,...,55.6478,12.6424,1,10.9,27.0,0.0,4,5,0,6
