In [1]:
import concurrent
import concurrent.futures
import json
import logging
import os
import sys
import threading
from math import ceil
from time import sleep
from typing import List, Union

import gpytorch
import matplotlib as mpl
import numpy as np
import pandas as pd
import torch
import tqdm
import tqdm.notebook
from matplotlib import pyplot as plt
from torch.optim.lr_scheduler import StepLR
from torch.utils.tensorboard import SummaryWriter


%matplotlib inline
%load_ext autoreload
%autoreload 2


In [2]:
# Run on the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
def flatten_dict(x: dict, prefix=""):
    """Flatten a nested dict into a single-level dict with "_"-joined keys.

    Args:
        x: Possibly nested mapping with string keys.
        prefix: String prepended to every key produced from ``x``.

    Returns:
        A new dict in which each leaf value is stored under the keys along
        its path joined by "_" (e.g. ``{"a": {"b": 1}}`` -> ``{"a_b": 1}``).
    """
    flat = {}
    for key, value in x.items():
        if isinstance(value, dict):
            # Carry the accumulated prefix down; passing only `key` would drop
            # the outer levels for dicts nested three or more levels deep and
            # let unrelated leaves collide on the same flattened key.
            flat.update(flatten_dict(value, prefix + key + "_"))
        else:
            flat[prefix + key] = value
    return flat

[raw data shape] (14980, 15)
[data shape after thinning] torch.Size([749, 14])


In [14]:
class ReturnValueThread(threading.Thread):
    """Thread whose ``join()`` returns the target's return value.

    Attributes set by this class:
        result: Return value of the target, or None if the target has not
            finished, was not given, or raised.
        exception: The exception raised by the target, or None.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.result = None
        # Kept so callers can tell "target returned None" from "target failed".
        self.exception = None

    def run(self):
        """Run the target, storing its return value or the exception it raised."""
        if self._target is None:
            return  # could alternatively raise an exception, depends on the use case
        try:
            self.result = self._target(*self._args, **self._kwargs)
        except Exception as exc:
            self.exception = exc
            print(f'{type(exc).__name__}: {exc}', file=sys.stderr)

    def join(self, *args, **kwargs):
        """Wait for the thread as ``Thread.join`` does, then return ``self.result``."""
        super().join(*args, **kwargs)
        return self.result
    
def preprocess(data_paths, include_dirs, max_workers: int = 64):
    """Convert the raw logs of the selected folders to CSV using a thread pool.

    Every sub-folder of every path in ``data_paths`` whose name appears in
    ``include_dirs`` is handed to ``subprocess(path, folder)``. Failures of
    individual folders are logged and do not stop the remaining work.

    Args:
        data_paths: Iterable of directories whose sub-folders are processed.
        include_dirs: Folder ids (str or int) to process; ``None`` submits
            every sub-folder.
        max_workers: Size of the thread pool.

    Returns:
        None.
    """
    # Folder names from os.listdir are strings, so compare on the string form.
    allowed = None if include_dirs is None else {str(entry) for entry in include_dirs}
    submitted = {}

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        for path in data_paths:
            for folder in os.listdir(path):
                if allowed is not None and folder not in allowed:
                    continue
                submitted[executor.submit(subprocess, path, folder)] = (path, folder)

        # Wait once, after everything is queued. Waiting inside the path loop
        # re-visited (and re-logged) the futures of earlier paths.
        progress = tqdm.tqdm(concurrent.futures.as_completed(submitted), total=len(submitted))
        for future in progress:
            try:
                future.result()
            except Exception as e:
                task_path, task_folder = submitted[future]
                logging.error("%s: %s", os.path.join(task_path, task_folder), e)

def subprocess(path, folder):
    """Convert one folder's chassis/control/localization logs to CSV files.

    The folder is processed only if its numeric name is listed in the
    module-level ``include_dirs`` and its three log files exist, are non-empty
    and have the same number of lines. The CSVs are written to
    ``<path>/<folder>/preprocessed/``.

    NOTE: this function's name shadows the standard-library ``subprocess``
    module if that module is ever imported in this notebook.

    Args:
        path: Directory that contains ``folder``.
        folder: Name of the sub-folder; expected to be a numeric id.

    Returns:
        None.
    """
    try:
        folder_id = int(folder)
    except (TypeError, ValueError):
        # Not a numeric id, so it cannot be one of the selected folders.
        return

    # include_dirs may hold ints or strings depending on which cells have run;
    # comparing on the string form matches either representation.
    allowed = {str(entry) for entry in include_dirs}
    if str(folder_id) not in allowed and str(folder) not in allowed:
        return

    source_dir = os.path.join(path, folder, "env_learning_zip")
    target_dir = os.path.join(path, folder, "preprocessed")
    names = ("chassis", "control", "localization")
    sources = [os.path.join(source_dir, name + ".txt") for name in names]

    # num_lines returns 0 for a missing file, so this also rejects missing logs.
    line_counts = {num_lines(source) for source in sources}
    if len(line_counts) != 1 or 0 in line_counts:
        return

    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs(target_dir, exist_ok=True)
    for name, source in zip(names, sources):
        to_csv(source, os.path.join(target_dir, name + ".csv"))


    
def to_csv(file, dest):
    """Convert a JSON-lines file into a CSV with one row per input line.

    Each line is parsed as JSON and flattened with ``flatten_dict``; the
    flattened keys become the CSV columns.

    Args:
        file: Path of the JSON-lines input file.
        dest: Path of the CSV file to write (parent directory must exist).
    """
    with open(file) as source:
        # Build all records first and create the frame once: concatenating a
        # one-row frame per line is quadratic, and DataFrame.from_dict rejects
        # a dict of plain scalars ("must pass an index").
        records = [flatten_dict(json.loads(line)) for line in source]
    pd.DataFrame(records).to_csv(dest)

def num_lines(abs_path):
    """Return the number of lines in the file at ``abs_path``.

    A path that does not exist counts as 0 lines.
    """
    if not os.path.exists(abs_path):
        return 0

    with open(abs_path) as handle:
        return sum(1 for _ in handle)

    

tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.], dtype=torch.float64)


In [None]:
class DataLoader():
    """Loads chassis/control/localization JSON-lines logs into DataFrames.

    For every selected folder under ``data_paths`` the three files
    ``env_learning_zip/{chassis,control,localization}.txt`` are read, each line
    is flattened with ``flatten_dict``, and every row is tagged with a
    ``batch`` id. A folder's rows are split into consecutive batches of at most
    ``batch_size`` rows; batch ids increase across folders.
    """

    def __init__(
        self,
        data_paths,
        include_dirs=None,
        batch_size=1000
    ):
        """
        Args:
            data_paths: Iterable of directories whose sub-folders hold the logs.
            include_dirs: Folder ids (str or int) to load; ``None`` loads every
                sub-folder.
            batch_size: Maximum number of rows per batch id.
        """
        self.data_paths = data_paths
        self.chassis_df = pd.DataFrame()
        self.control_df = pd.DataFrame()
        self.localization_df = pd.DataFrame()
        self.include_dirs = include_dirs
        self.batch_size=batch_size

    @staticmethod
    def _read_records(path):
        """Parse a JSON-lines file into a list of flattened dicts."""
        with open(path) as file:
            return [flatten_dict(json.loads(line)) for line in file]

    def load_data(self):
        """Read all selected folders and return the three tables.

        Folders are skipped when any of the three files is missing or empty, or
        when the files differ in line count.

        Returns:
            Tuple ``(chassis_df, control_df, localization_df)``; each frame has
            the flattened log columns plus a ``batch`` column and a fresh
            0..n-1 index. The frames are also stored on the instance.
        """
        # os.listdir yields strings; normalising lets int ids work as well.
        allowed = None
        if self.include_dirs is not None:
            allowed = {str(entry) for entry in self.include_dirs}

        chassis_rows = []
        control_rows = []
        localization_rows = []
        batch_ids = []
        batch_index = 0

        total = None if allowed is None else len(allowed)
        with tqdm.notebook.tqdm(total=total) as pbar:
            pbar.set_description('Processing:')
            for p in self.data_paths:
                for folder in os.listdir(p):
                    if allowed is not None and folder not in allowed:
                        continue
                    # Tick only for selected folders so the bar matches `total`.
                    pbar.update(1)

                    chassis_file = os.path.join(p, folder, "env_learning_zip/chassis.txt")
                    control_file = os.path.join(p, folder, "env_learning_zip/control.txt")
                    localization_file = os.path.join(p, folder, "env_learning_zip/localization.txt")

                    # num_lines returns 0 for a missing file, so this check
                    # also covers files that do not exist.
                    a = num_lines(chassis_file)
                    b = num_lines(control_file)
                    c = num_lines(localization_file)

                    if a != b or b != c or a == 0:
                        continue

                    chassis_rows.extend(self._read_records(chassis_file))
                    control_rows.extend(self._read_records(control_file))
                    localization_rows.extend(self._read_records(localization_file))

                    # Consecutive chunks of at most batch_size rows; the last
                    # chunk of a folder holds the remainder.
                    for start in range(0, a, self.batch_size):
                        size = min(self.batch_size, a - start)
                        batch_ids.append(np.full(size, batch_index, dtype=float))
                        batch_index += 1

        # np.concatenate rejects an empty list, which happens when no folder
        # qualified.
        batch_column = np.concatenate(batch_ids, axis=0) if batch_ids else np.empty(0)
        batch_frame = pd.DataFrame(batch_column, columns=["batch"])

        self.chassis_df = pd.concat([pd.DataFrame(chassis_rows), batch_frame], axis=1)
        self.control_df = pd.concat([pd.DataFrame(control_rows), batch_frame], axis=1)
        self.localization_df = pd.concat([pd.DataFrame(localization_rows), batch_frame], axis=1)
        return self.chassis_df.reset_index(drop=True), self.control_df.reset_index(drop=True), self.localization_df.reset_index(drop=True)

In [4]:
def RemoveUnwantedData(x: pd.DataFrame):
    """Keep only the rows whose ``drivingMode`` is ``"COMPLETE_AUTO_DRIVE"``.

    Returns:
        Tuple ``(filtered, mask)``: the selected rows (original index kept) and
        the boolean Series used to select them, aligned with ``x``.
    """
    auto_drive_mask = x["drivingMode"].eq("COMPLETE_AUTO_DRIVE")
    return x[auto_drive_mask], auto_drive_mask

def AddRelativeTime(x: pd.DataFrame):
    """Add a ``dt`` column holding the time to the next row.

    ``dt[i] = timestamp[i + 1] - timestamp[i]``; the last row gets ``0.0``
    because it has no successor. The column is added to ``x`` in place, and
    ``x`` is returned. A frame that already has ``dt`` is returned untouched.

    Args:
        x: Frame with a ``timestamp`` column (or an existing ``dt`` column).

    Returns:
        The same DataFrame object, now containing ``dt``.

    Raises:
        Exception: If ``x`` has neither ``dt`` nor ``timestamp``.
    """
    if "dt" in x.columns:
        return x

    if "timestamp" not in x.columns:
        raise Exception("Not timestamp")

    timestamps = x["timestamp"].to_numpy()

    if timestamps.size == 0:
        # No rows: appending the trailing 0.0 would give one value for zero
        # rows and make the column assignment fail.
        x['dt'] = np.zeros(0)
        return x

    x['dt'] = np.append(timestamps[1:] - timestamps[:-1], 0.0)
    return x

def TimeSeriesData(raw_data: pd.DataFrame, state_columns: List[str], observ_columns: List[str]):
    """Build (state at t, observation at t+1) pairs from a time-ordered frame.

    A row ``i`` is used as input when ``0.0 < dt[i] < 0.5``; its target is row
    ``i + 1``. Since ``dt[i]`` is the time from row ``i`` to row ``i + 1``,
    this pairs each state with the observation that follows it.

    Args:
        raw_data: Frame with a ``dt`` column, or a ``timestamp`` column from
            which ``AddRelativeTime`` derives ``dt`` (added in place).
        state_columns: Columns forming the input; ``dt`` is appended to them.
        observ_columns: Columns forming the target.

    Returns:
        Tuple ``(X, Y, x_index, y_index)`` where ``X``/``Y`` are the selected
        rows and ``x_index``/``y_index`` are the boolean numpy masks (one entry
        per row of the data) that selected them.

    Raises:
        Exception: ``"Unknown state"`` / ``"Unknown observation"`` if a
            requested column is missing.
    """
    data = raw_data if "dt" in raw_data.columns else AddRelativeTime(raw_data)

    if any(entity not in data.columns for entity in state_columns):
        raise Exception("Unknown state")

    if any(entity not in data.columns for entity in observ_columns):
        raise Exception("Unknown observation")

    dt = data["dt"]
    x_index = ((dt > 0.0) & (dt < 0.5)).to_numpy(dtype=bool)

    # The target of input row i is row i + 1, so the target mask is the input
    # mask shifted forward by one row. Shifting it the other way selects the
    # row before each input and yields X and Y of different lengths.
    # NOTE: if the last row has a valid dt (possible only when dt is supplied
    # by the caller) it has no successor and X has one row more than Y.
    y_index = np.zeros_like(x_index)
    y_index[1:] = x_index[:-1]

    X = data[x_index][state_columns + ["dt"]]
    Y = data[y_index][observ_columns]

    return X, Y, x_index, y_index

def NormalizeData(x: pd.DataFrame, shifter: Union[None, np.array] = None, normalizer: Union[None, np.array] = None, has_batch=True):
    """Scale the feature columns to [-1, 1] via ``2 * (x - shifter) / normalizer - 1``.

    Args:
        x: Data to scale. With ``has_batch`` a ``batch`` column is expected; it
            is left unscaled and appended as the last column of the result.
        shifter: Per-column offset; defaults to the column minima.
        normalizer: Per-column range; defaults to the maxima of the shifted data.
        has_batch: Whether ``x`` carries a ``batch`` column.

    Returns:
        Tuple ``(scaled, shifter, normalizer)``; scaled values are clipped to
        [-1, 1], and the shifter/normalizer actually used are returned.
    """
    if has_batch:
        feature_columns = x.columns[x.columns != "batch"]
        values = x[feature_columns]
        batch = x["batch"]
    else:
        values = x

    shifter = values.min() if shifter is None else shifter
    shifted = values - shifter

    normalizer = shifted.max() if normalizer is None else normalizer
    scaled = (2 * (shifted / normalizer) - 1).clip(-1, 1)

    if not has_batch:
        return scaled, shifter, normalizer
    return pd.concat([scaled, batch], axis=1), shifter, normalizer

def DeNormalizeData(x: np.array, shifter: np.array, normalizer: np.array):
    """Invert the ``2 * (v - shifter) / normalizer - 1`` scaling applied by NormalizeData."""
    unit_interval = (x + 1.0) / 2.0
    return unit_interval * normalizer + shifter

In [5]:
# USERNAME is not set in every Linux session; fall back to USER so the path
# below does not silently become "/home/None/...".
username = os.getenv("USERNAME") or os.getenv("USER")

# Id of the downloaded dataset task; it selects the dataset directory below.
task = 1876652

data_paths= [f"/home/{username}/workspace/dataset_downloader/{task}/processed"]

# Numeric names of the folders under data_paths that are loaded.
include_dirs=[
5798514803372,  5798515131052,  5798515442348,  5798515778220,  5798516081324,  5798516392620,
5798514811564,  5798515139244,  5798515450540,  5798515786412,  5798516089516,  5798516400812,
5798514827948,  5798515147436,  5798515458732,  5798515794604,  5798516097708,  5798516409004,
5798514836140,  5798515155628,  5798515466924,  5798515802796,  5798516105900,  5798516417196,
5798514844332,  5798515172012,  5798515475116,  5798515810988,  5798516114092,  5798516425388,
5798514852524,  5798515180204,  5798515483308,  5798515819180,  5798516122284,  5798516433580,
5798514860716,  5798515188396,  5798515491500,  5798515827372,  5798516130476,  5798516441772,
5798514877100,  5798515196588,  5798515499692,  5798515835564,  5798516138668,  5798516449964,
5798514885292,  5798515204780,  5798515507884,  5798515843756,  5798516146860,  5798516458156,
5798514893484,  5798515212972,  5798515524268,  5798515851948,  5798516155052,  5798516466348,
5798514909868,  5798515221164,  5798515532460,  5798515860140,  5798516163244,  5798516474540,
5798514918060,  5798515229356,  5798515540652,  5798515868332,  5798516171436,  5798516482732,
5798514926252,  5798515237548,  5798515548844,  5798515876524,  5798516179628,  5798694355628,
]

# Further folder ids; not referenced anywhere else in the visible part of this
# notebook (presumably a reserve set to swap into include_dirs — TODO confirm).
backup_dirs =[
5798514934444,  5798515245740,  5798515557036,  5798515884716,  5798516187820,  5798694363820,
5798514950828,  5798515253932,  5798515565228,  5798515892908,  5798516196012,  5798694478508,
5798514959020,  5798515262124,  5798515581612,  5798515901100,  5798516204204,  5798694486700,
5798514967212,  5798515270316,  5798515589804,  5798515909292,  5798516212396,  5798694494892,
5798514975404,  5798515278508,  5798515597996,  5798515917484,  5798516220588,  5798694503084,
5798514983596,  5798515286700,  5798515606188,  5798515925676,  5798516228780,  5798694511276,
5798514999980,  5798515294892,  5798515614380,  5798515933868,  5798516236972,  5798694519468,
5798515008172,  5798515303084,  5798515638956,  5798515942060,  5798516245164,  5798694527660,
5798515016364,  5798515311276,  5798515655340,  5798515950252,  5798516261548,  5798694535852,
5798515024556,  5798515319468,  5798515663532,  5798515958444,  5798516269740,  5798694544044,
5798515032748,  5798515327660,  5798515671724,  5798515966636,  5798516277932,  5798694552236,
5798515040940,  5798515335852,  5798515688108,  5798515974828,  5798516286124,  5798694560428,
5798515049132,  5798515344044,  5798515696300,  5798515983020,  5798516302508,  5798694568620,
5798515057324,  5798515352236,  5798515704492,  5798515991212,  5798516310700,  5798694576812,
5798515065516,  5798515360428,  5798515712684,  5798515999404,  5798516318892,  5798694585004,
5798515073708,  5798515368620,  5798515720876,  5798516007596,  5798516327084,  5798694617772,
5798515081900,  5798515385004,  5798515729068,  5798516015788,  5798516335276,  5798694625964,
5798515090092,  5798515393196,  5798515737260,  5798516023980,  5798516343468,  5798694634156,
5798515098284,  5798515401388,  5798515745452,  5798516032172,  5798516351660,  5798694642348,
5798515106476,  5798515409580,  5798515753644,  5798516048556,  5798516368044,
5798515114668,  5798515417772,  5798515761836,  5798516056748,  5798516376236,
5798515122860,  5798515434156,  5798515770028,  5798516064940,  5798516384428    
]

# Drop duplicate ids and convert them to strings so they can be compared with
# the folder names returned by os.listdir.
include_dirs = [str(dir_id) for dir_id in set(include_dirs)]

LSTMEncoderDecoder(
  (lstm_encoder): LSTMFeatureExtractor(
    (rnn): LSTM(14, 64, num_layers=2, dropout=0.2, bidirectional=True)
  )
  (lstm_decoder): LSTMFeatureExtractor(
    (rnn): LSTM(128, 64, proj_size=14, num_layers=2, dropout=0.2)
  )
)


In [6]:
# Loader for the selected folders; batch_size keeps its default.
dl = DataLoader(data_paths=data_paths, include_dirs=include_dirs)

In [7]:
# Load the three training tables from the CSV cache when all of them exist;
# otherwise parse the raw logs and write the cache for the next run.
preprocessed_dir = f'/home/{username}/workspace/dataset_downloader/{task}/processed/preprocessed/'
cache_files = {
    name: os.path.join(preprocessed_dir, f'{name}_training.csv')
    for name in ('chassis', 'control', 'localization')
}

if all(os.path.exists(cache_file) for cache_file in cache_files.values()):
    # The cache is written with the row index as its first column. Reading it
    # back as the index keeps the cached frames identical in columns to freshly
    # loaded ones (no extra "Unnamed: 0" column).
    chassis_df = pd.read_csv(cache_files['chassis'], index_col=0)
    control_df = pd.read_csv(cache_files['control'], index_col=0)
    localization_df = pd.read_csv(cache_files['localization'], index_col=0)

else:
    chassis_df, control_df, localization_df = dl.load_data()

    os.makedirs(preprocessed_dir, exist_ok=True)
    chassis_df.to_csv(cache_files['chassis'])
    control_df.to_csv(cache_files['control'])
    localization_df.to_csv(cache_files['localization'])

Epoch:   0%|          | 0/1000 [00:00<?, ?it/s]

In [12]:
# Display the chassis table as this cell's output.
chassis_df

tensor([[-0.2235, -0.0534, -0.0919, -0.2298, -0.0076, -0.1813, -0.5084, -0.0755,
         -0.1404, -0.2393, -0.0631, -0.3437, -0.1596, -0.1141],
        [-0.2046, -0.0358, -0.0195, -0.2164,  0.1551, -0.1185, -0.5436, -0.0204,
         -0.1387, -0.2142, -0.0543, -0.3537, -0.1565, -0.0977],
        [-0.1383,  0.0428,  0.0341, -0.0823,  0.2592,  0.0322, -0.4231,  0.0606,
         -0.0928, -0.1328, -0.0286, -0.3320, -0.0992, -0.0806],
        [-0.2175, -0.0401, -0.0439, -0.2083,  0.0901, -0.0782, -0.5182,  0.0093,
         -0.0894, -0.1775, -0.0429, -0.3433, -0.1478, -0.0808],
        [-0.2256, -0.0243, -0.0279, -0.2059,  0.1067, -0.1033, -0.5236,  0.0020,
         -0.0871, -0.2036, -0.0162, -0.3664, -0.1454, -0.1068]],
       device='cuda:0', grad_fn=<SqueezeBackward1>)
tensor([[ 4.2850e-01,  4.5847e-01,  3.2603e-01,  3.8882e-01, -4.7573e-01,
         -6.6099e-01, -8.7429e-02, -1.0739e+00, -3.5041e-01,  1.5334e-01,
         -1.6457e+00,  3.3230e-01,  2.2536e-01,  2.9109e-01],
        [-2.

In [None]:
# Column names selected from each of the three tables.
chassis_features = ['engineRpm', 'speedMps', 'driveMotorTorqueNm']
control_features = ['throttle', 'brake']
localization_features = ["linearVelocityVrf_y", "linearAccelerationVrf_y"]

In [None]:
# Give all three tables a fresh 0..n-1 index so their rows line up by position.
chassis_df, control_df, localization_df = (
    frame.reset_index(drop=True)
    for frame in (chassis_df, control_df, localization_df)
)

In [None]:
# Print the (rows, columns) shape of each table, one per line.
print(chassis_df.shape, control_df.shape, localization_df.shape, sep="\n")

In [None]:
# Keep only the fully autonomous rows; valid_index is the boolean row mask.
chassis_df_v, valid_index = RemoveUnwantedData(x=chassis_df)

In [None]:
# valid_index is the boolean row mask returned by RemoveUnwantedData.
print(valid_index.shape)

In [None]:
# Apply the same row mask to all three tables. Each filtered frame is derived
# from its unfiltered source, so re-running the cell gives the same result
# (re-assigning chassis_df_v from itself added another index column per run).
# reset_index() keeps the original row number in an "index" column.
chassis_df_v = chassis_df[valid_index].reset_index()
control_df_v  = control_df[valid_index].reset_index()
localization_df_v = localization_df[valid_index].reset_index()

In [None]:
# Empty accumulators for the training inputs and targets.
train_x_batch, train_y_batch = pd.DataFrame(), pd.DataFrame()

In [None]:
# Number of distinct batch ids in the unfiltered chassis table.
num_batches = len(chassis_df["batch"].unique())

In [None]:
# Build the training set batch by batch. Inputs of sample i are the chassis
# and localization features at row i plus the localization features at row
# i + 1; the target is throttle minus brake at row i.
# The per-batch frames are collected in lists and concatenated once, so
# re-running the cell rebuilds the result instead of appending to it.
train_x_parts = []
train_y_parts = []

for b in range(int(num_batches)):
    chassis_b = chassis_df_v[chassis_df_v["batch"] == b].reset_index(drop=True)
    localization_b = localization_df_v[localization_df_v["batch"] == b].reset_index(drop=True)
    control_b = control_df_v[control_df_v["batch"] == b].reset_index(drop=True)

    # Each sample needs a following row, hence one fewer sample than rows.
    num_samples = chassis_b.shape[0] - 1
    if num_samples < 1:
        # Batch is empty or has a single row after filtering: no pair to build.
        continue

    # .loc label slices include both ends: rows 0..num_samples-1 are "current",
    # rows 1..num_samples are "next".
    train_x = pd.concat(
        [chassis_b.loc[0:num_samples-1, chassis_features].reset_index(drop=True),
         localization_b.loc[0:num_samples-1, localization_features].reset_index(drop=True),
         localization_b.loc[1:num_samples, localization_features].reset_index(drop=True)],
        ignore_index=True,
        axis=1)

    train_x["batch"] = b

    train_y = pd.DataFrame(control_b.loc[0:num_samples-1, control_features[0]] - control_b.loc[0:num_samples-1, control_features[1]])

    train_y["batch"] = b

    train_x_parts.append(train_x)
    train_y_parts.append(train_y)

# pd.concat rejects an empty list, so fall back to empty frames.
train_x_batch = pd.concat(train_x_parts, axis=0, ignore_index=True) if train_x_parts else pd.DataFrame()
train_y_batch = pd.concat(train_y_parts, axis=0, ignore_index=True) if train_y_parts else pd.DataFrame()

In [None]:
# Display the assembled training inputs as this cell's output.
train_x_batch

In [None]:
# Fixed offsets and ranges passed to NormalizeData, one entry per input column.
# NOTE(review): presumably ordered like the train_x columns (chassis features,
# localization features at t, localization features at t+1) — confirm.
x_shifter = np.array([0.0, 0.0, -70.0, -0.3, -5.0, -0.3, -5.0])
x_normalizer = np.array([4300.0, 24.0, 255.0, 22.5, 10.0, 22.5, 10.0])

# With these values a target in [-60, 60] is scaled to [-1, 1].
y_shifter = np.array([-60])
y_normalizer = np.array([120])

In [None]:
# Scale inputs and targets to [-1, 1] with the fixed constants; the batch
# column is carried through unscaled.
train_x_n, x_shifter, x_normalizer = NormalizeData(train_x_batch, shifter=x_shifter, normalizer=x_normalizer)
train_y_n, y_shifter, y_normalizer = NormalizeData(train_y_batch, shifter=y_shifter, normalizer=y_normalizer)

In [None]:
# Make sure the targets are two-dimensional (one column per output).
# pandas objects have no .reshape, so a Series is promoted with to_frame();
# any other 1-D input is reshaped as a numpy array.
if len(train_y_n.shape) == 1:
    if isinstance(train_y_n, pd.Series):
        train_y_n = train_y_n.to_frame()
    else:
        train_y_n = np.asarray(train_y_n).reshape(-1, 1)