In [None]:
#
# Copyright (C) 2024 by Katja Gilly <katya@umh.es>
#
# This code is licensed under a Creative Commons Attribution 4.0 International License. (see LICENSE.txt for details)
#
# General Description - this notebook is used to extract data from SUMO logs in a table format indexed by time and vehicle id.
# It creates three types of output files: 
#    - (STEP 1) a csv file per SUMO log formatted as a table with most attributes and signals per vehicle.
#    - (STEP 2) a csv file that adds the 7 future geo positions of vehicles in each row. It requires (STEP 1) to be run previously.
#    - (STEP 3) a csv file that adds the 7 past geo positions of vehicles in each row. It requires (STEP 1) to be run previously.
#

In [None]:
def dataset_sumo(vector_sumo, output):

    # parameters
    # 1 = $vector_sumo - sumo input.file name
    # 2 = $output      - output.file name

    # sumo log format:
    #   <timestep time="0.00">
    #     <vehicle id="0" x="-0.482437" y="38.344131" angle="339.66" type="DEFAULT_VEHTYPE" speed="5.10" pos="5.10" lane="23036317#1_0" slope="0.00" signals="0"/>
    !grep -n 'time=' $vector_sumo | tr -d \" > sumo_times.txt
    !grep -n 'vehicle' $vector_sumo | tr -d \" > sumo_veh.txt

    cols=['row','veh_id','x','y','angle','speed','pos','lane','slope','signals']
    data = vx.read_csv("sumo_veh.txt", sep=' ', header=None,
            names=['row','1','2','3','4','5','6','7','8','veh_id','x','y','angle','type','speed','pos','lane','slope','signals'], 
            usecols=cols,
            convert=True, chunk_size=150_000_000)    
    #print(data.shape, data)

    # clean vehicle's file
    !rm sumo_veh.txt 
    
    data['veh_id'] = data['veh_id'].str.replace('id=', '').str.replace('"', '')
    data['x'] = data["x"].str.replace('x=', '')
    data['y'] = data["y"].str.replace('y=', '')
    data['angle'] = data["angle"].str.replace('angle=', '')
    data['speed'] = data["speed"].str.replace('speed=', '')
    data['pos'] = data["pos"].str.replace('pos=', '')
    data['lane'] = data["lane"].str.replace('lane=', '')
    data['slope'] = data["slope"].str.replace('slope=', '')
    data['signals'] = data["signals"].str.replace('signals=', '').str.replace('/>', '') 
    data['row'] = data["row"].str.replace(':', '').astype('int')
           
    data_t = pd.read_csv("sumo_times.txt", sep=' ', header=None,
            names=['row','1','2','3','4','time'], 
            usecols=['row','time'])
    data_t['time'] = data_t['time'].str.replace('time=', '').str.replace('>', '').astype('float')
    data_t['row'] = data_t['row'].str.replace(':', '').astype('int')  
    # check
    #print(data_t,data_t.shape,data_t.row.dtype, data_t.time.dtype)
    data_du = data["veh_id"].nunique()
    print('\n(1) unique:',data_du)

    # clean times file
    !rm sumo_times.txt

    # difference of <timestep time=... rows
    # 3:  1 row of <vehicle id= ...
    # 4:  2 rows of <vehicle id= ...
    # ....
    # 90: 88 rows of <vehicle id= ...
    data_t['row_diff'] = data_t.row.diff()
    print(data_t, data.shape)    
    
    # build the dictionary
    d = {}
     
    # get times
    for idx, row in data_t.iterrows():
        i = row.row_diff - 2
        row_ant = data_t.iloc[idx-1].row; time_ant = data_t.iloc[idx-1].time
        row_this = data_t.iloc[idx].row;  time = data_t.iloc[idx].time
        while (i > 0):
            d[int(row_ant + i)] = time_ant
            i = i - 1
        # if last row
        if idx == (data_t.shape[0]-1):
            i = row.row_diff - 2
            while (i > 0):
                d[int(row_this + i)] = time
                i = i - 1
        # a way to save memory, otherwise it overflows
        gap = 50000
        if (idx % gap == gap-1):
            label = str(int(idx/gap))
            data['t' + label] = data.row.map(d, default_value = 0.0)    
            d = {}
        
    # get the rest of 'time' values
    label = str(int(idx/gap))
    data['t' + label]  = data.row.map(d, default_value = 0.0)      

    # merge time columns (max 4 columns for 1800s)
    data['t'] = data.t0 + data.t1 + data.t2 + data.t3        
    
    data = data[['t','veh_id','x','y','angle','speed','pos','lane','slope','signals']]
    data_du = data["veh_id"].nunique()
    print(data, '\n(2) unique:',data_du)

    data.export_csv(output, index=False, sep='\t')
    
    del data_t, data
    
    return 1 

In [3]:
#with pandas: for files smaller than 1 GB (and faster than vaex)
def process_dataset_sumo_future(s_input, output, step=100, n_steps=7, fill_value=999999):
    """Append the future positions of each vehicle to every row of a STEP 1 file.

    For k = 1..n_steps the columns ``x<k>`` and ``y<k>`` hold the ``x`` and ``y``
    found ``k * step`` rows later among the rows of the same ``veh_id``.

    Parameters
    ----------
    s_input : str
        Tab-separated input file whose first line is a header line.
    output : str
        Tab-separated file to write (header included, no index column).
    step : int, optional
        Distance, in rows of the same vehicle, between two consecutive
        look-ahead positions. Default 100.
    n_steps : int, optional
        Number of look-ahead positions added per row. Default 7.
    fill_value : scalar, optional
        Value written where the vehicle has no row that far ahead
        (and where ``x``/``y`` is missing). Default 999999.

    Returns
    -------
    int
        Always 1.

    Notes
    -----
    Numeric columns are parsed as numbers, so the added columns are written
    as floats (the default fill value appears as ``999999.0``).
    """
    # header=0 replaces the file's own header line with `names`. Reading that
    # line as data would turn the columns into strings / mixed types, and a
    # vehicle id read both as '0' and 0 would be split into two groups.
    df_pd = pd.read_csv(s_input, sep='\t', header=0,
            names=['t','veh_id','x','y','angle','speed','pos','lane','slope','signals'],
            dtype={'veh_id': str, 'lane': str})
    print(df_pd.head())
    print(df_pd['veh_id'])

    # with pandas: it is ok if input file is smaller than 1 GB
    # group once on a fixed three-column frame; results stay aligned on the index
    by_vehicle = df_pd[['veh_id', 'x', 'y']].groupby('veh_id')
    for k in range(1, n_steps + 1):
        ahead = by_vehicle[['x', 'y']].shift(-k * step).fillna(fill_value)
        df_pd['x' + str(k)] = ahead['x']
        df_pd['y' + str(k)] = ahead['y']
    print(df_pd.shape, df_pd)

    df_pd.to_csv(output, index=False, sep='\t')

    return 1 


In [4]:
#with pandas: for files smaller than 1 GB (and faster than vaex)
def process_dataset_sumo_past(s_input, output, step=100, n_steps=7, fill_value=999999):
    """Append the past positions of each vehicle to every row of a STEP 1 file.

    For k = 1..n_steps the columns ``x-<k>`` and ``y-<k>`` hold the ``x`` and
    ``y`` found ``k * step`` rows earlier among the rows of the same ``veh_id``.

    Parameters
    ----------
    s_input : str
        Tab-separated input file whose first line is a header line.
    output : str
        Tab-separated file to write (header included, no index column).
    step : int, optional
        Distance, in rows of the same vehicle, between two consecutive
        look-back positions. Default 100.
    n_steps : int, optional
        Number of look-back positions added per row. Default 7.
    fill_value : scalar, optional
        Value written where the vehicle has no row that far back
        (and where ``x``/``y`` is missing). Default 999999.

    Returns
    -------
    int
        Always 1.

    Notes
    -----
    Numeric columns are parsed as numbers, so the added columns are written
    as floats (the default fill value appears as ``999999.0``).
    """
    # header=0 replaces the file's own header line with `names`. Reading that
    # line as data would turn the columns into strings / mixed types, and a
    # vehicle id read both as '0' and 0 would be split into two groups.
    df_pd = pd.read_csv(s_input, sep='\t', header=0,
            names=['t','veh_id','x','y','angle','speed','pos','lane','slope','signals'],
            dtype={'veh_id': str, 'lane': str})
    print(df_pd.shape)
    print(df_pd['veh_id'])
    print(df_pd.head())

    # with pandas: it is ok if input file is smaller than 1 GB
    # group once on a fixed three-column frame; results stay aligned on the index
    by_vehicle = df_pd[['veh_id', 'x', 'y']].groupby('veh_id')
    for k in range(1, n_steps + 1):
        behind = by_vehicle[['x', 'y']].shift(k * step).fillna(fill_value)
        df_pd['x-' + str(k)] = behind['x']
        df_pd['y-' + str(k)] = behind['y']
    print(df_pd.shape, df_pd)

    df_pd.to_csv(output, index=False, sep='\t')

    return 1 


In [5]:
# measuring execution time
%load_ext autotime

# extract the delay and the handover information from omnet output vector file
maxTime = 1800
communities = 9

#parametrised calls of notebook
import numpy as np
import pandas as pd
import vaex as vx
import os.path

path="/home/jupyter/notebook/OMNET6.0/"
sumo_files = "/dataset_AI_input/fdc_signals_"
#cars = np.array([4928, 4951, 4955, 5712, 5734, 5749, 6900, 6908, 6923, 8589, 8619, 8620])
cars = np.array([4928])
out = "dataset_AI_output/"
outFile = "_AI.csv"
# initialPositioning-xxxx.txt and migrations-xxxx.txt will also be.ipynb_checkpoints/created

for i in cars:
    # sumo datasets are already created (based on fdc files)
    # info about cars positioning and parameters over time
    # obtained from sumo with commands:
    # sumo -c Alicante_8620.sumo.cfg --fcd-output.geo true --fcd-output.signals true --fcd-output ../fdc_signals_8620.xml --end 1800
    v_sumo = path + sumo_files + str(i) +  ".xml"
    output = out + str(i) + "_sumo" + outFile
    print("STEP 1) sumo input : ", v_sumo)
    print("STEP 1) sumo output : ", output)
    # get sumo dataset
    df_sumo_exit_code = dataset_sumo(v_sumo, output)

    # process sumo dataset to get future positions
    #v_sumo = output
    #output = out + str(i) + "_sumo_1" + outFile
    #print("STEP 2) sumo input : ", v_sumo)
    #print("STEP 2) sumo output : ", output)
    #df_sumo_exit_code = process_dataset_sumo_future(v_sumo, output)
    # check
    #print(df_sumo_exit_code)
    
    # process sumo dataset to get past positions
    #v_sumo = output
    #output = out + str(i) + "_sumo_2" + outFile
    #print("STEP 3) sumo input : ", v_sumo)
    #print("STEP 3) sumo output : ", output)
    #df_sumo_exit_code = process_dataset_sumo_past(v_sumo, output)
    # check
    #print(df_sumo_exit_code)

STEP 1) sumo input :  /home/jupyter/notebook/OMNET6.0//dataset_AI_input/fdc_signals_4928.xml
STEP 1) sumo output :  dataset_AI_output/4928_sumo_AI.csv

(1) unique: 887
             row     time  row_diff
0             38     0.00       NaN
1             41     0.01       3.0
2             44     0.02       3.0
3             47     0.03       3.0
4             50     0.04       3.0
...          ...      ...       ...
179995  10231732  1799.95      90.0
179996  10231822  1799.96      90.0
179997  10231912  1799.97      90.0
179998  10232002  1799.98      90.0
179999  10232092  1799.99      90.0

[180000 rows x 3 columns] (9872144, 10)
#          t        veh_id    x          y          angle    speed    pos    lane             slope    signals
0          0.0      0         -0.482437  38.344131  339.66   0.00     5.10   23036317#1_0     0.00     0
1          0.01     0         -0.482437  38.344131  339.66   0.03     5.10   23036317#1_0     0.00     8
2          0.02     0         -0.48243