In [1]:
#
# Copyright (C) 2021 by Sonja Filiposka <sonja.filiposka@finki.ukim.mk>
#
# This code is licensed under a Creative Commons Attribution 4.0 International License. (see LICENSE.txt for details)
#
# General Description - this notebook extracts the delay and the handover information from an OMNET output vector file
# It creates two types of output files: 
#    - mobile network communication delay as reported by OMNET
#      - a csv file with the delay for each communication exchange
#    - initial positioning and migration files that are used as input for CloudSim 
#       - initial positioning file defines the start and end time for each car/service and the initial community 
#         based on the location of the car and the location of the base stations (see comments in code)
#       - migration file defines the time stamp when a car moves from one community to another based on relative location to the nearest base station
#


def delay(vector, output):

    # parameters
    # 1 = $vector - input.file name
    # 2 = $output - output.file name

    
    # First prepare the output file from OMNET, vector-0.vec
    # new output file: table.txt

    # add only the lines that start with a number
    !grep "^[0-9]" $vector > table.txt
    
    #import vaex as vx
    #import numpy as np

    data = vx.read_csv("table.txt", sep='\t', header=None,
            names=["vector", "event", "time", "delay"], 
            usecols=["vector","time","delay"],
            convert=True,
            chunk_size=150_000_000)

    #clean up
    !rm table.txt

    # also need another file with vector -> car mapping

    !grep "].lteNic.rlc.um rlcDelayDl:vector" $vector > cars.vec

    !grep -Eo '[0-9]+([^0-9]+[0-9]+)' cars.vec > cars.v
    !sed -e "s/NRSeveralBSALC.car\[//g" < cars.v > cars.txt

    #clean up
    !rm cars.v*

    # this produces a two column file with vector car mapping information, to be used as dictionary

    #build the dictionary
    d = {}
    with open("cars.txt") as f:
        for line in f:
            (key, val) = line.split()
            d[int(key)] = int(val)

    #clean up
    !rm cars.txt

    # make the car vector mapping, if no entry in dictionary -1 will be put
    data['car'] = data['vector'].map(d, default_value = -1 )

    # need to drop vectors that are not in dictionary
    filtered = data[data['car']!=-1]

    #drop the vector column we don't need it any more
    filtered = filtered.drop('vector')

    # save to output file
    df = filtered.to_pandas_df()
    cols = ['car','time','delay']
    df = df[cols]
    df.to_csv(output, index=False)
    
    # now to handover

    # goal: to extract the handover information from the omnet output file and use it to create the input files for cloudsim

    # it is in a statistic named "servingCell" that can be collected in a vector fashion

    !grep servingCell:vector $vector > cell.txt
    
    # cell.txt contains rows in format vector XXX NRSeveralBSALC.car[YYY].lteNic.phy servingCell:vector ETV    
    # extract the numbers
    !grep -Eo '[0-9]+([^0-9]+[0-9]+)' cell.txt > cell.v
    !sed -e "s/NRSeveralBSALC.car\[//g" < cell.v > cell.txt

    #clean up
    !rm cell.v*

    # this produces a two column file with vector car mapping information, to be used as dictionary
    
    # build the handover dictionary
    hd = {}
    with open("cell.txt") as f:
        for line in f:
            (key, val) = line.split()
            hd[int(key)] = int(val)
        
    #clean up
    !rm cell.txt

    # make the car vector mapping, if no entry in dictionary -1 will be put
    data['car'] = data['vector'].map(hd, default_value = -1 )

    # need to drop vectors that are not in dictionary
    handovers = data[data['car']!=-1]
    # save memory
    del data
    !rm table.txt.hdf5

    #drop the vector column we don't need it any more
    handovers = handovers.drop('vector')

    #rename delay column to cell column
    handovers.rename('delay', 'cell')

    # base stations - communities mapping 
    # B1: Luceros - 0
    # B2: Gabriel Miró - 1
    # B3: Teatro Arniches - 2
    # B4: Plaza del Mercado - 3
    # B5: Paseo Canalejas - 4
    # B6: Parque de La Ereta - 5
    # B7: Castillo Santa Bárbara - 6
    # B8: Playa Postiguet - 7
    # B9: Zona Volvo - 8

    # 2 input files for CloudSim

    # initialPositioning.txt
    # example
    # # simulation time	 nodes	 communities
    # 9999 8692 9
    # # VM-CSid	 start_time	 end_time	 com_id
    # 1-0	1.00	36	1

    # VM == car

    # initialPositioning
    # for each car get first base station, first time stamp in handovers, last time stamp from dfH dataset
    min_df = handovers
    min_dfg = min_df.groupby("car").agg({'time': 'min'})
    min_df = min_df.join(min_dfg, on='car', rsuffix='_min')
    min_df = min_df[min_df['time'] == min_df['time_min']].drop(['time_min', 'car_min'])
    min_df = min_df.sort(by=['car'])
    del min_dfg
    
    # for max time must search the previous data frame with the delay logs and find the last occurences
    max_df = filtered
    max_dfg = filtered.groupby("car").agg({'time': 'max'})
    
    # save memory
    del filtered 

    
    max_df = max_df.join(max_dfg, on='car', rsuffix='_max')
    max_df = max_df[max_df['time'] == max_df['time_max']].drop(['time_max', 'car_max'])
    del max_dfg

    # turns out that there might be duplicates in the dataset (two packets received with the same max timestamp)
    # need to drop duplicate rows for cars
    # drop_duplicates works only for pandas df
    dfmax = max_df.to_pandas_df()
    dfmax = dfmax.drop_duplicates(subset=['car'])
    max_df = vx.from_pandas(dfmax)
    del dfmax

    max_df = max_df.sort(by=['car'])
    max_df.rename('time', 'end_time')
    max_df.rename('delay', 'end_delay')
    max_df.rename('car', 'end_car')

    #combine the two data sets
    # car start end cell

    initialPos = min_df.join(max_df, left_on='car', right_on='end_car')
    # save memory
    del max_df
    
    # there might be no delay vectors for a given car, in that case set end_time to maxTime
    initialPos = initialPos.fillna(value=maxTime, column_names=['end_time'])

    # drop
    initialPos = initialPos.drop('end_delay')
    initialPos = initialPos.drop('end_car')

    # community = BS - 1   #CS = car + 1
    initialPos['cell'] = initialPos.cell - 1
    initialPos['cell'] = initialPos.cell.astype('int')
    initialPos['car'] = initialPos.car + 1
    initialPos['car'] = initialPos.car.astype('str')
    initialPos['car'] = initialPos.car + '-0'

    initialPos.rename('car', 'VM-CSid')
    initialPos.rename('time', 'start_time')
    initialPos.rename('cell', 'com_id')


    # create initialPositioning file
    # save to output file
    dfIP = initialPos.to_pandas_df()
    #save memory 
    del initialPos

    # before printing to file check for concurrent events
    # cloudsim can not work if there are two inits at the same time
    # if such a thing occurs add +0.01 to the other event

    dupl = dfIP.groupby('start_time').cumcount()
#    print(dupl[dupl>0])
    dfIP['start_time'] = dfIP['start_time'].where(dupl.eq(0), dfIP['start_time'] + dupl*0.01)
    del dupl
    
    cols = ['VM_CSid','start_time','end_time','com_id']
    dfIP = dfIP[cols]
    dfIP.to_csv('IP.txt', index=False, sep='\t')

    # add two rows in the begining to conform to format
    # # simulation time	 nodes	 communities
    # 9999 8692 9

    IPFile = "initialPositioning-" + str(i) + '.txt'

    !echo "# simulation time	nodes	communities" > $IPFile
    maxCars = dfIP.__len__()
    !echo "$maxTime $maxCars $communities" >> $IPFile

    ! cat IP.txt >> $IPFile

    #cleanup
    !rm IP.txt
    del dfIP
    
    # migrations file

    # migrations.txt
    # example
    # # simulation time	 nodes	 communities
    # 9999 8692 9
    # # time stamp	 node id	 community id
    # 20	3-0	0

    # need to work with handovers data set but drop the first occurence because it is in initialPoisitioning
    migrations = pd.concat([handovers.to_pandas_df(), min_df.to_pandas_df(), min_df.to_pandas_df()]).drop_duplicates(keep=False)

    # save memory 
    del handovers
    del min_df
    
    # sort migrations by time
    migrations = migrations.sort_values(by=['time'])

    # create migrations file
    # save to output file
    migrations['car'] = migrations.car + 1
    migrations['car'] = migrations.car.astype('str')
    migrations['car'] = migrations.car + '-0'
    migrations['cell'] = migrations.cell - 1
    migrations['cell'] = migrations.cell.astype('int')
    migrations.rename(columns = {'cell': 'com_id'}, inplace = True)
    migrations.rename(columns = {'car': 'node_id'}, inplace = True)
    cols = ['time','node_id','com_id']
    migrations = migrations[cols]
    migrations.to_csv('M.txt', index=False, sep='\t')

    # add two rows in the begining to conform to format
    # # simulation time	 nodes	 communities
    # 9999 8692 9

    MFile = "migrations-" + str(i) + '.txt'

    !echo "# simulation time	nodes	communities" > $MFile
    !echo "$maxTime $maxCars $communities" >> $MFile

    ! cat M.txt >> $MFile

    #cleanup
    !rm M.txt    
    
    
    
    return df


In [2]:
# measuring execution time
%load_ext autotime

# extract the delay and the handover information from omnet output vector file
maxTime = 9999
communities = 9

#parametrised calls of notebook

# VideoDL-Urban-4949; VideoDL-Urban-4951; VideoDL-Urban-4955; 
# VideoDL-Urban-5712; VideoDL-Urban-5734; 
# VideoDL-Urban-6900; VideoDL-Urban-6908; VideoDL-Urban-6923;
# VideoDL-Urban-8589; VideoDL-Urban-8595; VideoDL-Urban-8630

import numpy as np
import vaex as vx
import pandas as pd

#text="/home/ubuntu/omnetpp-5.6.2/samples/UrbanALC/VideoDL-Urban-"
text="/mnt/data-disk/OMNET_logs/VideoDL-Urban-"
# cars = np.array([4928, 4951, 4955, 5712, 5734, 5749, 6900, 6908, 6923, 8589, 8595, 8630])
cars = np.array([6910])
outFile = ".csv"
# initialPositioning-xxxx.txt and migrations-xxxx.txt will also be created

for i in cars:
    path = text + str(i)
    vector = path + "/vector-0.vec"
    output = str(i) + outFile
    print(vector)
    df = delay(vector, output)
    #check
    print(df)


/mnt/data-disk/OMNET_logs/VideoDL-Urban-6910/vector-0.vec


INFO:MainThread:numexpr.utils:Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:MainThread:numexpr.utils:NumExpr defaulting to 8 threads.


           car      time     delay
0            0     6.147  0.001297
1            0     6.414  0.001269
2            0     6.821  0.001305
3            0     7.181  0.003149
4            0     7.415  0.003079
...        ...       ...       ...
30073185  6037  9999.961  0.001462
30073186  6037  9999.971  0.001235
30073187  6037  9999.982  0.001970
30073188  6037  9999.992  0.001960
30073189  6038  9999.852  0.001619

[30073190 rows x 3 columns]
