# Preprocessing

## Read CSV files

In [1]:
import pandas as pd
import numpy as np
import math
import datetime

###########################################
# Configuration
###########################################

# The lane id is assumed to be supplied by the preceding data processor.
lane_id_col = 'lane_id'

# Column layout of the per-frame tracks table.
# NOTE(review): 'yaw_rate ' is spelled with a trailing space; any code that
# reads or writes this column must use exactly this spelling - confirm the
# space is intended.
# NOTE(review): the schema names the lane column 'laneId' while the input
# column is `lane_id_col` ('lane_id') - confirm how the two are meant to map.
columns_tracks = [
    'frame', 'trackId', 'xCenter', 'yCenter', 'length', 'width',
    'xVelocity', 'yVelocity', 'xAcceleration', 'yAcceleration',
    'frontSightDistance', 'backSightDistance', 'dhw', 'thw', 'ttc',
    'precedingXVelocity', 'precedingId', 'followingId', 'leftPrecedingId',
    'leftAlongsideId', 'leftFollowingId', 'rightPrecedingId',
    'rightAlongsideId', 'rightFollowingId', 'laneId', 'angle',
    'orientation', 'yaw_rate ', 'ego_offset',
]

# Column layout of the per-track summary table.
columns_tracks_meta = [
    'trackId', 'length', 'width', 'initialFrame', 'finalFrame', 'numFrames',
    'class', 'drivingDirection', 'traveledDistance', 'minXVelocity',
    'maxXVelocity', 'meanXVelocity', 'minDHW', 'minTHW', 'minTTC',
    'numLaneChanges',
]

# Column layout of the per-recording summary table.
columns_recording_meta = [
    'recordingId', 'frameRate', 'locationId', 'speedLimit', 'month',
    'weekDay', 'startTime', 'duration', 'totalDrivenDistance',
    'totalDrivenTime', 'numVehicles', 'numCars', 'numTrucks', 'numBuses',
    'laneMarkings', 'scale',
]

# def parse_timestamp(timestamp_str):
#     """
#     Custom parser for nanosecond timestamps.
#     """
#
#     # convert to seconds
#     seconds_timestamp = np.int64(timestamp_str) / 10**9
#
#     # convert to a datetime object
#     dt = datetime.datetime.fromtimestamp(seconds_timestamp)
#
#     # format: year-month-day hour:minute:second.fraction
#     return pd.to_datetime(dt, format='%Y-%m-%d %H:%M:%S.%f')


# Read the input CSV files.
# Alternative that parses 'ts' while reading (kept for reference):
# df_ego = pd.read_csv("ego.csv", index_col='ts', parse_dates=['ts'], date_format=parse_timestamp)
# df_obj = pd.read_csv("obj.csv", index_col='ts', parse_dates=['ts'], date_format=parse_timestamp)
df_ego = pd.read_csv("ego.csv")
df_obj = pd.read_csv("obj.csv")
# ... other sensor files

# The ego vehicle gets the fixed obj_id 1.
ego_obj_id = 1
df_ego['obj_id'] = ego_obj_id

# NOTE: a lane id is assumed to be available; when an input lacks the
# column it is created and filled with 0.
for _sensor_df in (df_ego, df_obj):
    if lane_id_col not in _sensor_df.columns:
        _sensor_df[lane_id_col] = 0

In [2]:
# Sanity check: print True if any row of the frame has a missing obj_id
# (ego first, then obj).
print(df_ego['obj_id'].isnull().any())
print(df_obj['obj_id'].isnull().any())

False
False


 [Unused] Check obj_id of the merged DataFrame

In [3]:
# Processing nano seconds timestamp

# 合并DataFrame
# df_merged = pd.concat([df_ego, df_obj], axis=0, ignore_index=True)

# 处理缺失值（例如填充为0）
# df_merged.fillna(0, inplace=True)

# check obj_id
# print(df_merged['obj_id'].head)
# print(f"if any obj_id is NaN:{df_merged['obj_id'].isnull().any()}\n")
# print(f"Number of rows : {len(df_merged)}")
# print(f"Number of unique obj_id: {df_merged['obj_id'].nunique()}")

# print(f"Number of unique ts: {df_merged['ts'].nunique()}")

# print(f"Number of unique ts in obj: {df_obj['ts'].nunique()}")
# print(f"Number of unique ts in ego: {df_ego['ts'].nunique()}")

# 查看合并后的DataFrame

# 

# Data Interpolation in ego

## Data timestamp arrange 

Add the timestamps that appear only in obj to ego, then interpolate the ego state at those new timestamps from the existing ego samples.

The ego state is needed at every obj timestamp, so it has to be estimated wherever ego has no sample of its own.

In [None]:
# Distinct timestamps present in each source.
ts_obj = df_obj['ts'].unique()
ts_ego = df_ego['ts'].unique()

# Object timestamps that have no ego sample. np.isin performs the membership
# test in one vectorised pass instead of scanning ts_ego once per object
# timestamp; the order of ts_obj is preserved.
ts_obj_not_ego = ts_obj[~np.isin(ts_obj, ts_ego)]

print(f"unique ts in obj: {len(ts_obj)}")
print(f"unique ts in ego: {len(ts_ego)}")
print(f"unique ts in obj_not_ego: {len(ts_obj_not_ego)}")

# augment ego with the timestamps that only exist in obj; every other column
# of these new rows is missing until it is interpolated later
df_obj_ts = pd.DataFrame({'ts': ts_obj_not_ego})

# concat with df_ego
df_ego_augment = pd.concat([df_ego, df_obj_ts], axis=0, ignore_index=True)

# sort by ts and reindex
df_ego_augment.sort_values(by='ts', inplace=True)
df_ego_augment.reset_index(drop=True, inplace=True)

# update obj_id of new rows (all rows of this frame belong to the ego vehicle)
df_ego_augment['obj_id'] = ego_obj_id

# store middle data to csv
# df_ego_augment.to_csv('ego_augment.csv', index=False)


unique ts in obj: 299
unique ts in ego: 998
unique ts in obj_not_ego: 298


## Interpolation of ego in obj ts


In [5]:
# Interpolation for x, y values
# Fills the ego state on the rows that were added for object-only timestamps.

# 1 create new index with ts
# NOTE(review): pd.to_datetime is called without a unit, so integer 'ts'
# values are read with pandas' default unit - confirm this matches the unit
# of the recorded timestamps.
df_ego_augment['timestamp'] = pd.to_datetime(df_ego_augment['ts'])
df_ego_augment.set_index('timestamp', inplace=True)

# index=False: the 'timestamp' index is not written; the 'ts' column is.
df_ego_augment.to_csv('ego_augment.csv', index=False)
# df_ego_augment.to_csv('ego_augment.csv', index=True)

# 2 interpolate columns with time method by index
# NOTE(review): 'h' is interpolated like every other column; if it is a
# heading angle that can wrap around, values across the wrap would be wrong
# - confirm.
interpolate_columns = ['x','y','z','h','spd_mps', 'spd_kph', 'acc_lgt_mpss', 'acc_lat_mpss']

# create a new dataframe but share underlying values for memory optimization
df_ego_interpolated = df_ego_augment.copy(deep=False)

# Each column is replaced by its interpolated version, one column at a time.
for col in interpolate_columns:
    df_ego_interpolated[col] = df_ego_interpolated[col].interpolate(method='time')

df_ego_interpolated.to_csv('ego_augment_interpolated.csv', index=False)


## Concat df_ego_interpolated and df_obj to df_obj_augment
Here `df_obj_augment` contains every obj_id (objects and ego); at each object timestamp the ego row carries the position interpolated in the previous step.

In [6]:

# Stack the object rows and the interpolated ego rows into one table (needed
# later to transform object coordinates into the ground frame), ordered by
# the 'ts' timestamp value.
df_obj_augment = pd.concat([df_obj, df_ego_interpolated], axis=0).sort_values(by='ts')

# Write the sorted table to disk for review.
df_obj_augment.to_csv('obj_augment.csv', index=False)

# Obj Track Computation from ego data

In [7]:
# set ego vel_lgt_mps, vel_lat_mps
# Ego rows take their longitudinal velocity from 'spd_mps' and a lateral
# velocity of 0. The same row mask is used on both sides and the values are
# assigned by position, so the assignment does not rely on index-label
# alignment (which fails when index labels repeat).
is_ego = df_obj_augment['obj_id'] == ego_obj_id
df_obj_augment.loc[is_ego, 'vel_lgt_mps'] = df_obj_augment.loc[is_ego, 'spd_mps'].to_numpy()
df_obj_augment.loc[is_ego, 'vel_lat_mps'] = 0


## Traverse obj_augment for the axis transformation

Update each object's x and y position from the ego's x and y at the same timestamp.

In [8]:
def ensure_columns_exist_robust(df, column_names):
    """
    Make sure every requested column is present in a DataFrame.

    Missing columns are added in place and filled with ``pd.NA``; columns that
    are already present are left untouched. One message is printed per column,
    and for every rejected input.

    Args:
        df: The Pandas DataFrame to extend (modified in place).
        column_names: A list of column-name strings.

    Returns:
        The same DataFrame object, or None when ``df`` is not a DataFrame,
        when a column name is not a string, or when adding a column raises.
    """
    if not isinstance(df, pd.DataFrame):
        print("Error: Input 'df' must be a Pandas DataFrame.")
        return None

    # Nothing requested: hand the frame back unchanged.
    if not column_names:
        print("No column names provided. Nothing to do.")
        return df

    for candidate in column_names:
        if not isinstance(candidate, str):
            print("Error: All elements in 'column_names' must be strings.")
            return None

    try:
        for name in column_names:
            if name in df.columns:
                print(f"Column '{name}' already exists.")
            else:
                df[name] = pd.NA
                print(f"Column '{name}' created.")
    except Exception as err:  # report any failure raised while adding a column
        print(f"An error occurred: {err}")
        return None

    return df

In [9]:
def rotate_vector(x1, y1, angle_radians):
    """
    Rotates a 2-D vector counter-clockwise by an angle.

    Computes
        x2 = x1 * cos(angle) - y1 * sin(angle)
        y2 = x1 * sin(angle) + y1 * cos(angle)

    Args:
        x1: The x-component of the input vector.
        y1: The y-component of the input vector.
        angle_radians: The rotation angle in radians.

    Returns:
        A tuple (x2, y2) with the rotated components, or None (after printing
        a message) if any argument is not a real number. NumPy scalar types
        such as np.int64 and np.float32 are accepted, so values read from a
        DataFrame cell can be passed directly.
    """
    import numbers  # local import: only needed for the argument check

    # numbers.Real covers Python int/float and the NumPy integer and floating
    # scalar types; checking (int, float) alone rejects NumPy integers.
    if not all(isinstance(value, numbers.Real) for value in (x1, y1, angle_radians)):
        print("x1, y1 and angle must be numbers")
        return None

    cos_a = math.cos(angle_radians)
    sin_a = math.sin(angle_radians)

    x2 = x1 * cos_a - y1 * sin_a
    y2 = x1 * sin_a + y1 * cos_a
    return (x2, y2)

In [10]:
def create_tracks(df):
    """
    Assigns track IDs and per-track frame numbers to the rows of a DataFrame.

    The input is sorted by 'obj_id' and then 'ts' (the caller's frame is not
    modified). Each distinct 'obj_id' receives a track ID, numbered from 1 in
    sorted obj_id order, and the rows of each track are numbered 0, 1, 2, ...
    in timestamp order.

    Args:
        df: DataFrame with 'obj_id' and 'ts' columns.

    Returns:
        A sorted copy of ``df`` with 'trackId' and 'frame' columns set, or
        None (after printing a message) if the input is not a DataFrame or
        lacks the required columns. Rows whose 'obj_id' is missing get
        trackId 0.
    """
    _objectID = "obj_id"
    _timestamp = "ts"
    _track_ID = "trackId"
    _frame = "frame"

    if not isinstance(df, pd.DataFrame):
        print("Input must be a Pandas DataFrame.")
        return None

    if not {_objectID, _timestamp}.issubset(df.columns):
        print("DataFrame must contain '{}' and '{}' columns.".format(_objectID, _timestamp))
        return None

    df = df.sort_values(by=[_objectID, _timestamp])  # Important: Sort by objectID then timestamp

    # factorize numbers the distinct obj_id values 0, 1, 2, ... in order of
    # first appearance (sorted order here) and uses -1 for missing values.
    # Adding 1 makes tracks start at 1, leaving 0 to stand for "no object".
    # The assignment is positional, so repeated index labels cannot make one
    # row overwrite another.
    codes, _ = pd.factorize(df[_objectID])
    df[_track_ID] = codes + 1

    # Calculate frame numbers within each track
    df[_frame] = df.groupby(_track_ID).cumcount()
    return df

In [None]:
# sort by obj_id, make sure ego always comes first, then sort by ts stable
# The second sort is stable, so rows sharing a timestamp keep the obj_id
# order produced by the first sort.
# NOTE(review): ego only comes first within a timestamp if ego_obj_id is the
# smallest obj_id present - confirm against the object ids in the input.

df_obj_augment.sort_values(by='obj_id', inplace=True)
df_obj_augment.sort_values(by='ts', inplace=True, kind='stable')
# Give the frame a fresh 0..n-1 index.
df_obj_augment.reset_index(drop=True, inplace=True)

# First traverse to calculate the x, y, z, and speed, acceleration respectively.
# 'updated' starts as False on every row.
updated_col = 'updated'
df_obj_augment[updated_col] = False 

# No object or ego row has been selected yet.
curr_obj_idx, curr_ego_idx = None, None 

# print(df_obj_augment.head)
df_obj_augment.to_csv('obj_before_first_trav.csv')

# create all columns_tracks in df_obj_augment if needed
# (the helper returns None on failure, which would replace the frame here)
df_obj_augment = ensure_columns_exist_robust(df_obj_augment, columns_tracks)

Column 'frame' created.
Column 'trackId' created.
Column 'xCenter' created.
Column 'yCenter' created.
Column 'length' already exists.
Column 'width' already exists.
Column 'xVelocity' created.
Column 'yVelocity' created.
Column 'xAcceleration' created.
Column 'yAcceleration' created.
Column 'frontSightDistance' created.
Column 'backSightDistance' created.
Column 'dhw' created.
Column 'thw' created.
Column 'ttc' created.
Column 'precedingXVelocity' created.
Column 'precedingId' created.
Column 'followingId' created.
Column 'leftPrecedingId' created.
Column 'leftAlongsideId' created.
Column 'leftFollowingId' created.
Column 'rightPrecedingId' created.
Column 'rightAlongsideId' created.
Column 'rightFollowingId' created.
Column 'laneId' created.
Column 'angle' created.
Column 'orientation' created.
Column 'yaw_rate ' created.
Column 'ego_offset' created.


In [12]:
obj_id_col = 'obj_id'

# Write the yaw rate into the column that the tracks schema declares, so no
# second, differently spelled column is created next to it. The name is
# looked up ignoring surrounding whitespace; 'yaw_rate' is the fallback.
yaw_rate_col = next((c for c in columns_tracks if c.strip() == 'yaw_rate'), 'yaw_rate')

# Visit every row, including the last one. Rows are addressed by label with
# .loc, which relies on the frame having a fresh 0..n-1 index.
for idx in range(len(df_obj_augment)):

    # obj
    if int(df_obj_augment.loc[idx, obj_id_col]) != ego_obj_id:
        curr_obj_idx = idx

        # An object row is only usable after the ego row of the same
        # timestamp has been seen.
        assert curr_ego_idx is not None, "object {} appears before any ego row".format(curr_obj_idx)

        # check curr_obj_idx and curr_ego_idx timestamp matched
        assert df_obj_augment.loc[curr_obj_idx, 'ts'] == df_obj_augment.loc[curr_ego_idx, 'ts'], "object {} and ego {} timestamp not match".format(curr_obj_idx, curr_ego_idx)

        # NOTE x, y
        # NOTE(review): 'lat' is added to x and 'lgt' to y without rotating by
        # the ego heading, whereas the velocities below are rotated by
        # curr_ego_h - confirm the intended frame convention.
        df_obj_augment.loc[curr_obj_idx, 'x'] = df_obj_augment.loc[curr_ego_idx, 'x'] + df_obj_augment.loc[curr_obj_idx, 'lat']
        df_obj_augment.loc[curr_obj_idx, 'y'] = df_obj_augment.loc[curr_ego_idx, 'y'] + df_obj_augment.loc[curr_obj_idx, 'lgt']

        # flag column updated
        df_obj_augment.loc[curr_obj_idx, updated_col] = True

    # ego
    else:
        curr_ego_idx = idx
        curr_ego_h = df_obj_augment.loc[curr_ego_idx, 'h']

        # update ego frame vel_lgt_mps, vel_lat_mps for later computation
        df_obj_augment.loc[curr_ego_idx, 'vel_lgt_mps'] = df_obj_augment.loc[curr_ego_idx, 'spd_mps']
        df_obj_augment.loc[curr_ego_idx, 'vel_lat_mps'] = 0

    # NOTE xCenter, yCenter
    df_obj_augment.loc[idx, 'xCenter'] = df_obj_augment.loc[idx, 'x']
    df_obj_augment.loc[idx, 'yCenter'] = df_obj_augment.loc[idx, 'y']

    # NOTE xVelocity, yVelocity, xAcceleration, yAcceleration
    # update according to curr_ego_h
    # compute all xVelocity, yVelocity, xAcceleration, yAcceleration from ground-reference-frame
    # NOTE(review): rotate_vector treats curr_ego_h as radians - confirm the
    # unit of the 'h' column.
    _xVel, _yVel = rotate_vector(df_obj_augment.loc[idx, 'vel_lgt_mps'], df_obj_augment.loc[idx, 'vel_lat_mps'], curr_ego_h)
    df_obj_augment.loc[idx, 'xVelocity'] = _xVel
    df_obj_augment.loc[idx, 'yVelocity'] = _yVel

    _xAcc, _yAcc = rotate_vector(df_obj_augment.loc[idx, 'acc_lgt_mpss'], df_obj_augment.loc[idx, 'acc_lat_mpss'], curr_ego_h)
    df_obj_augment.loc[idx, 'xAcceleration'] = _xAcc
    df_obj_augment.loc[idx, 'yAcceleration'] = _yAcc

    # angle: cannot calculate w/out lane info
    df_obj_augment.loc[idx, 'angle'] = 0

    # orientation
    df_obj_augment.loc[idx, 'orientation'] = df_obj_augment.loc[idx, 'h']

    # yaw_rate ~= yAcc / xVel (0 when xVelocity is exactly 0)
    if df_obj_augment.loc[idx, 'xVelocity'] != 0:
        df_obj_augment.loc[idx, yaw_rate_col] = df_obj_augment.loc[idx, 'yAcceleration'] / df_obj_augment.loc[idx, 'xVelocity']
    else:
        df_obj_augment.loc[idx, yaw_rate_col] = 0

    # ego_offset: cannot calculate w/out lane width info, set to 0
    df_obj_augment.loc[idx, 'ego_offset'] = 0


## Create Tracks and Save

In [13]:

# Save the result of the first traversal, then build the track/frame
# numbering. create_tracks returns a new frame (df_tracks); df_obj_augment
# itself is not given trackId/frame values here.
df_obj_augment.to_csv('obj_first_trav.csv')
df_tracks = create_tracks(df_obj_augment)
df_tracks.to_csv('my_tracks.csv')

# Second traversal: compute the nine-box (surrounding-vehicle) detection filter

## 9-box Detector

In [14]:
def check_vehicle_sides(df, current_index, lookahead_factor=2.0, time_window=10**6):
    """
    Finds the nearest other vehicle in each of 8 directions around one row.

    The directions are front, back, left_preceding, left_alongside,
    left_following, right_preceding, right_alongside and right_following,
    built from the row's 'orientation'. Candidates are the rows whose 'ts'
    lies within ``time_window`` of the current row, whose 'obj_id' differs
    from the current row's, and whose distance is positive and at most
    ``speed * lookahead_factor``.

    NOTE(review): a candidate counts for every direction whose vector has a
    positive dot product with the offset to it, so one vehicle can be reported
    in several directions at once - confirm this is intended.

    Args:
        df: pandas DataFrame containing 'ts', 'x', 'y', 'orientation',
            'obj_id', 'xVelocity', 'yVelocity' columns.
        current_index: Index label (as used by ``df.loc``) of the current
            vehicle's row; must identify exactly one row.
        lookahead_factor: Factor multiplied by the current speed to obtain the
            search radius.
        time_window: Half-width of the accepted 'ts' interval around the
            current row, in the unit of the 'ts' column.

    Returns:
        A dictionary keyed by direction. 'front' maps to
        (obj_id, distance, xVelocity) of the nearest vehicle; every other
        direction maps to (obj_id, distance). When nothing is found the entry
        is (0, inf, 0) for 'front' and (0, inf) otherwise.
    """
    # Look the row up by label: the skip test below compares labels, and the
    # caller iterates over df.index.
    current_row = df.loc[current_index]

    x_velocity = current_row['xVelocity']
    y_velocity = current_row['yVelocity']
    current_x = current_row['x']
    current_y = current_row['y']
    current_timestamp = current_row['ts']
    current_obj_id = current_row['obj_id']

    # Calculate lookahead distance based on speed
    current_speed = np.sqrt(x_velocity**2 + y_velocity**2)
    lookahead_distance = current_speed * lookahead_factor

    # Heading components of the current vehicle.
    # NOTE(review): np.radians treats 'orientation' as degrees - confirm this
    # matches how the column is produced.
    heading_x = np.cos(np.radians(current_row['orientation']))
    heading_y = np.sin(np.radians(current_row['orientation']))

    # Direction vectors; the four diagonal ones are sums of two unit vectors
    # and are not normalised (only the sign of the dot product is used).
    directions = {
        'front': np.array([heading_x, heading_y]),
        'back': -np.array([heading_x, heading_y]),
        'left_alongside': np.array([-heading_y, heading_x]),
        'left_preceding': np.array([heading_x-heading_y, heading_x+heading_y]),   # 'front' + 'left_alongside'
        'left_following': np.array([-heading_x-heading_y, heading_x-heading_y]),  # 'back'  + 'left_alongside'
        'right_alongside': np.array([heading_y, -heading_x]),
        'right_preceding': np.array([heading_x+heading_y, -heading_x+heading_y]), # 'front' + 'right_alongside'
        'right_following': np.array([-heading_x+heading_y, -heading_x-heading_y]) # 'back'  + 'right_alongside'
    }

    # Initialize results: (obj_id, distance[, xVelocity]); obj_id 0 = none
    vehicle_ids = {
        'front':            (0, float('inf'), 0),  # might need precedingXVelocity
        'back':             (0, float('inf')),
        'left_preceding':   (0, float('inf')),
        'left_alongside':   (0, float('inf')),
        'left_following':   (0, float('inf')),
        'right_preceding':  (0, float('inf')),
        'right_alongside':  (0, float('inf')),
        'right_following':  (0, float('inf'))
    }

    # Keep only rows whose timestamp lies within the window around this row.
    in_window = (df['ts'] >= current_timestamp - time_window) & (df['ts'] <= current_timestamp + time_window)
    filtered_df = df[in_window]

    for index, row in filtered_df.iterrows():
        if index == current_index:
            continue  # Skip the current row itself

        # The same vehicle at another timestamp is not a neighbour.
        if row['obj_id'] == current_obj_id:
            continue

        # Calculate distance and direction vectors
        distance_vector = np.array([row['x'] - current_x, row['y'] - current_y])
        distance = np.linalg.norm(distance_vector)

        # Check if within lookahead distance
        if distance > lookahead_distance:
            continue

        # A zero (or undefined) distance has no direction; skipping it also
        # avoids dividing by zero below.
        if not distance > 0:
            continue

        # Normalize direction vector
        direction_vector = distance_vector / distance

        for direction, direction_unit_vector in directions.items():
            dot_product = np.dot(direction_vector, direction_unit_vector)

            if dot_product > 0:
                # candidate lies on the positive side of this direction

                if not vehicle_ids[direction][0] or distance < vehicle_ids[direction][1]:
                    # nothing stored yet, or closer than the stored vehicle

                    # front additionally records the candidate's xVelocity
                    if direction == 'front':
                        vehicle_ids[direction] = (row['obj_id'], distance, row['xVelocity'])
                        continue

                    vehicle_ids[direction] = (row['obj_id'], distance)

    return vehicle_ids
  

In [15]:
def check_surrounding_objects(df):
    """
    Fill the surrounding-vehicle columns of every row of ``df`` in place.

    For each index label the nearest vehicles are looked up with
    check_vehicle_sides; the identifiers it returns are written to the eight
    ``*Id`` columns and the front vehicle's x velocity to
    'precedingXVelocity'.

    Args:
        df: DataFrame with 'ts', 'x', 'y', 'orientation', 'obj_id',
            'xVelocity', 'yVelocity' columns. Modified in place.

    Returns:
        None.
    """
    # Output column -> key of the dictionary returned by check_vehicle_sides.
    id_column_to_side = {
        'precedingId': 'front',
        'followingId': 'back',
        'leftPrecedingId': 'left_preceding',
        'leftAlongsideId': 'left_alongside',
        'leftFollowingId': 'left_following',
        'rightPrecedingId': 'right_preceding',
        'rightAlongsideId': 'right_alongside',
        'rightFollowingId': 'right_following',
    }

    for row_label in df.index:
        neighbours = check_vehicle_sides(df, row_label)

        # The 'front' entry carries the x velocity as its third element.
        df.loc[row_label, 'precedingXVelocity'] = neighbours['front'][2]

        # The first element of every entry is the neighbour's identifier.
        for id_column, side in id_column_to_side.items():
            df.loc[row_label, id_column] = neighbours[side][0]

In [16]:
# Second Traverse to Calculate 9-box detector
# Runs on df_obj_augment (modified in place), not on df_tracks, so the
# neighbour columns are filled on the frame without trackId/frame values.
# NOTE(review): the neighbour id columns receive obj_id values, not trackId
# values - confirm which identifier downstream consumers expect.
# check_surrounding_objects(df_tracks)
check_surrounding_objects(df_obj_augment)

In [17]:
# Save the frame with the surrounding-vehicle columns filled in.
df_obj_augment.to_csv("nine_box_tracks.csv")

# Construct the three output dataframes

In [18]:
# pd_tracks = pd.read_csv('01_tracksMeta.csv')
# print(pd_tracks.columns)

# Construct 3 output dataframes
# Each frame is created with its column headers only and contains no rows.
# NOTE(review): no visible cell copies the computed track data into
# pd_tracks - confirm whether it should be filled before it is written out.
pd_recording_meta = pd.DataFrame(columns=columns_recording_meta)
pd_tracks_meta = pd.DataFrame(columns=columns_tracks_meta)
pd_tracks = pd.DataFrame(columns=columns_tracks)


## Construct pd_tracks values

In [19]:
# pd_tracks['xCenter'] = df_obj_augment['x']
# pd_tracks['yCenter'] = df_obj_augment['y']
# pd_tracks['length'] = df_obj_augment['length']
# pd_tracks['width'] = df_obj_augment['width']
# pd_tracks['xVelocity']          = pd.NA 
# pd_tracks['yVelocity']          = pd.NA 
# pd_tracks['xAcceleration']      = pd.NA 
# pd_tracks['yAcceleration']      = pd.NA 
# pd_tracks['frontSightDistance'] = pd.NA 
# pd_tracks['backSightDistance']  = pd.NA 
# pd_tracks['dhw']                = pd.NA 
# pd_tracks['thw']                = pd.NA 
# pd_tracks['ttc']                = pd.NA 

# # Nine-box data
# # NOTE: initial to -1 indicate invalid value
# pd_tracks['precedingXVelocity'] = pd.NA 
# pd_tracks['precedingId']        = pd.NA 
# pd_tracks['followingId']        = pd.NA 
# pd_tracks['leftPrecedingId']    = pd.NA 
# pd_tracks['leftAlongsideId']    = pd.NA 
# pd_tracks['leftFollowingId']    = pd.NA 
# pd_tracks['rightPrecedingId']   = pd.NA 
# pd_tracks['rightAlongsideId']   = pd.NA 
# pd_tracks['rightFollowingId']   = pd.NA 

# pd_tracks['laneId']      = df_obj_augment['lane_id']
# pd_tracks['angle']       = df_obj_augment['h']
# pd_tracks['orientation'] = pd.NA 
# pd_tracks['yaw_rate ']   = pd.NA 
# pd_tracks['ego_offset']  = pd.NA 

# # Extra columns
# pd_tracks['roadID'] = df_obj_augment['road_id']

## Construct pd_tracks_meta

In [20]:
# Display the tracks-meta frame (only its column headers exist so far).
pd_tracks_meta

Unnamed: 0,trackId,length,width,initialFrame,finalFrame,numFrames,class,drivingDirection,traveledDistance,minXVelocity,maxXVelocity,meanXVelocity,minDHW,minTHW,minTTC,numLaneChanges


## Construct pd_recording_meta

In [21]:

# Display the recording-meta frame (only its column headers exist so far).
pd_recording_meta

Unnamed: 0,recordingId,frameRate,locationId,speedLimit,month,weekDay,startTime,duration,totalDrivenDistance,totalDrivenTime,numVehicles,numCars,numTrucks,numBuses,laneMarkings,scale


## Store All three outputs to csv files

In [22]:
# Write the three output frames to CSV; the index is written as the first
# column because to_csv is called with its default settings.
pd_tracks.to_csv('tracks_result.csv')
pd_tracks_meta.to_csv('tracks_meta_result.csv')
pd_recording_meta.to_csv('recording_result.csv')