# Automotive Data analyser

- Phase 1: CSV files with the format `ABSOLUTE_TIME, ID, BINARY`
- Phase 2: CSV files with the format `ABSOLUTE_TIME, can_id_var, can_id_var, ...`, where `can` is the CAN line (`0` or `1`), `id` is the unique identifier of the source, and `var` is the identifier of the decoded field.

## Contents

- [Initial setup](#Initial-Setup)
- [Export figures](#Export-figures)
  - [Distributions chart](#Distributions-chart) 

## Initial Setup

In [None]:
from IPython.display import display
from fastprogress import master_bar, progress_bar
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import os
from datetime import datetime
import numpy as np
import math

# If true exports vectorial PDFs instead of JPG.
VECTORIAL_FIGURES = False
FIG_EXTENSION = "pdf" if VECTORIAL_FIGURES else "jpg"

# Placeholders: set ROOT_DIR to the project root folder and VEHICLE to the
# vehicle/experiment sub-folder before running the notebook.
ROOT_DIR = "absolute-path-to-project-root-folder"
VEHICLE = "vechile/experiment"

# os.path.join inserts the separator between ROOT_DIR and the sub-folders, so
# ROOT_DIR works with or without a trailing slash. The final "" component keeps
# a trailing separator, because later cells build file names by plain string
# concatenation (e.g. DATA_DIR + "unified.csv").
DATA_DIR = os.path.join(ROOT_DIR, "Data", VEHICLE, "")
GRAPHICS_DIR = os.path.join(ROOT_DIR, "Graphics", VEHICLE, FIG_EXTENSION, "")

# makedirs creates every missing parent, so this single call creates both
# GRAPHICS_DIR and its "distributions" sub-folder; exist_ok=True makes
# re-running the cell a no-op when the folders already exist.
os.makedirs(os.path.join(GRAPHICS_DIR, "distributions"), exist_ok=True)

import pandas as pd

# Global seaborn look for every figure: "whitegrid" style ...
sns.set_style("whitegrid")
# ... with a monospace font family.
sns.set_style({'font.family':'monospace'})

# Register pandas' converters with matplotlib.
# NOTE(review): presumably needed so the datetime `ms` column can be used as the
# x axis of the charts without warnings - confirm for the installed versions.
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

Load the data from the CSV file (the cell below) or, alternatively, from the pickle file (the cell after it, commented out by default).

In [None]:
%%time
# Load the CSV: the first five columns are read as strings, the sixth as int.
df = pd.read_csv(DATA_DIR + "unified.csv", 
                 #sep= r' +|,|\t', # Files may have one or more spaces as separator (also include commas and tabs).
                 sep=",",
                 dtype={0:object, 1:object, 2:object, 3:object, 4:object, 5:int}, 
                 #nrows=1000,
                 #header=None
                )

# Parse the timestamp column in place. The values are stored in a regular
# column (not as the index), so there is no need to build a DatetimeIndex,
# infer its frequency or deep-copy it first.
df['time'] = pd.to_datetime(df['time'])

# Convert CAN line and ID columns to categorical data
df['can'] = df['can'].astype('category')
df['id'] = df['id'].astype('category')

# Timestamps floored to the millisecond; used as the x axis of the charts.
df['ms'] = df['time'].dt.floor('ms')

# Time span of the whole data set, shared by all charts as x-axis limits.
xlim = (df['ms'].min(), df['ms'].max())
#

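Alternatively, uncomment the cell below to load the data from the pickle file instead of the CSV. Only load pickle files you created yourself: unpickling a file can execute arbitrary code.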

In [None]:
%%time
#df = pd.read_pickle(DATA_DIR + "unified.pkl")
#df['ms'] = df.time.dt.floor('ms')

Show some properties of the loaded data.

In [None]:
#df = df.sample(10000)
# Time span of the whole data set; used to fix the limits of the x axis.
xlim = (df.ms.min(), df.ms.max())
# DataFrame.info() prints its report and returns None, so it is called directly
# instead of being wrapped in display(), which would render a stray "None".
df.info()
display(df.can.unique())
display(xlim)
display(df.head())

Show the variables of a single ID/CAN-line pair and the summary statistics of their values.

In [None]:
# `id` and `can` are categorical: observed=True restricts the grouping to the
# ID/CAN-line combinations that actually occur in the data.
grps = df.groupby(by=["id","can"], observed=True)
# Example pair to inspect; use the commented line to pick the first pair instead.
#uid, can = next(iter(grps.groups))
uid = '416'
can = 'can0'

g = grps.get_group((uid, can))
# info() prints its report and returns None, so it is not wrapped in display().
g.info()
display(g.head())

# One report per decoded variable of the selected ID.
for n, gr in g.groupby('variable'):
    print("-----------------------\n",n)
    gr.info()
    display(gr.describe())

## Export figures

### Distributions chart

In [None]:
%%time
def formatter(value, tick_number):
    """Return ``value`` with two decimals, right-aligned in a field of 8 characters.

    ``tick_number`` is accepted only so the function matches the
    ``(value, position)`` call signature of ``FuncFormatter``; it is not used.
    """
    label = format(value, ".2f")
    return label.rjust(8)

def make_distribution_charts(uid, 
                             can, 
                             group, 
                             xlim_min, 
                             xlim_max, 
                             SAMPLE_FRACTIONS=0.05,
                             SERIES_PER_SUBPLOT=4,
                             RATIOS=(10, 2),
                             SUBPLOT_HEIGHT=2,
                             save=False,
                             show=True,
                             showfliers=False,
                             random_state=None
                            ):
    """Draw the sampled time series and the value distributions of one ID.

    The rows of ``group`` are split by ``datatype``; the variables of each
    data type are then laid out ``SERIES_PER_SUBPLOT`` at a time, one figure
    row per chunk. Every row has a line plot of ``value`` over ``ms`` on the
    left and a boxen plot of ``value`` per ``variable`` on the right.

    Parameters
    ----------
    uid, can : str
        Identifier and CAN line; used in the figure title and the file name.
    group : pandas.DataFrame
        Rows to plot. Must provide the columns ``datatype``, ``variable``,
        ``ms`` and ``value``.
    xlim_min, xlim_max
        Limits applied to the x axis of every line plot.
    SAMPLE_FRACTIONS : float
        Fraction of the rows of each subplot that is drawn
        (passed to ``DataFrame.sample(frac=...)``).
    SERIES_PER_SUBPLOT : int
        Maximum number of variables per figure row.
    RATIOS : sequence of two numbers
        Width ratios of the left (line plot) and right (boxen plot) columns.
    SUBPLOT_HEIGHT : number
        Height of one figure row; the figure width is fixed to 15.
    save : bool
        If true, write the figure to
        ``GRAPHICS_DIR + "distributions/<can>-<uid>.sampled.<pct>perc.<FIG_EXTENSION>"``.
    show : bool
        If true, call ``plt.show()``.
    showfliers : bool
        Currently unused: the matching ``boxenplot`` argument is commented out.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the sampling can be made
        reproducible. The default ``None`` keeps the sampling random.

    Returns
    -------
    None
        Nothing is drawn (and no file is written) when ``group`` has no
        variables to plot.
    """
    # Imported locally because the notebook's import cell does not provide it.
    import matplotlib.gridspec as gridspec

    def init_figure(n_subplots):
        """Create the figure and its grid: one row per subplot, two columns."""
        fig = plt.figure(figsize=(15, SUBPLOT_HEIGHT * n_subplots),
                         constrained_layout=True,
                        )
        spec = gridspec.GridSpec(ncols=2, 
                                 nrows=n_subplots, 
                                 figure=fig,
                                 width_ratios=RATIOS,
                                 hspace=1,
                                 wspace=0
                                )
        fig.suptitle("Identifier: " + uid + " (" + can + ")", 
                     y=1, x=0, 
                     ha='left', 
                     fontsize='large', 
                     fontweight='bold')
        return fig, spec

    def make_palette(type_group):
        """Map every variable of a data-type group to a fixed colour.

        The same mapping is used by all subplots of the data type, so a
        variable keeps its colour in the line plot and in the boxen plot.
        """
        variables = type_group.variable.unique()
        # Sorted so the colour assignment does not depend on the row order.
        variables.sort()
        colors = ["#C44E52", #red
                  "#55A868", #green
                  "#FFC400", #yellow
                  "#4C72B0", #blue
                  "#DD8452", #orange
                  "#8172B3", #purple
                  "#64B5CD", #cyan
                  "#937860", #brown
                  "#8C8C8C", #gray
                 ]
        bases = sns.color_palette(colors, n_colors=len(variables))
        # Colours are handed out from the end of the palette, as before.
        return dict(zip(variables, reversed(bases)))

    def make_subplot(fig, spec, row, datatype, palette, sampled, unsampled):
        """Fill one figure row from ``sampled``; ``unsampled`` is only counted."""
        plotted_vars = sampled.variable.unique()
        plotted_vars.sort()

        # Left column: values over time.
        lax = fig.add_subplot(spec[row, 0])
        lg = sns.lineplot(data=sampled, 
                          x=sampled.ms, 
                          y=sampled.value, 
                          hue=sampled.variable,
                          hue_order=plotted_vars,
                          palette=palette,
                          legend='full',
                          sort=True,
                          ax=lax)
        lg.set_xlim(left=xlim_min, right=xlim_max)

        # Pad the y axis by half a standard deviation on both sides. The
        # standard deviation is NaN for a single sampled row; the automatic
        # limits are kept in that case instead of passing NaN to set_ylim.
        margin = 0.5 * sampled.value.std()
        if math.isfinite(margin):
            ylim_min, ylim_max = lg.get_ylim()
            lg.set_ylim(bottom=ylim_min - margin, top=ylim_max + margin)

        handles, labels = lg.get_legend_handles_labels()
        # The first legend entry used to be dropped unconditionally. That is
        # only right when seaborn prepends a title pseudo-entry labelled with
        # the hue name; on versions that do not, it removed a real variable.
        # NOTE(review): confirm against the installed seaborn version.
        if labels and labels[0] == sampled.variable.name:
            handles, labels = handles[1:], labels[1:]

        lg.set(xlabel="Time", ylabel="Value")
        # Average rows per variable: drawn vs. available for THIS subplot.
        # The second figure is computed from the unsampled rows of the same
        # variables, not from the whole data-type group.
        lg.set_title("Values over time - Sampled " + 
                     '{:.0f}'.format(100*SAMPLE_FRACTIONS) + "% (" + 
                     '{:,.0f}'.format(len(sampled) / len(plotted_vars)) + 
                     " of " + 
                     '{:,.0f}'.format(len(unsampled) / len(unsampled.variable.unique())) + 
                     ")", 
                     fontdict= {'verticalalignment': 'baseline', 'horizontalalignment': 'center'})
        lg.legend(ncol=1, 
                  loc='upper right', 
                  bbox_to_anchor=(1.11, 1.07),
                  handles=handles,
                  labels=labels,
                  title="Var. Type\n" + str(datatype)[:10].ljust(10).capitalize()
                 )
        lg.yaxis.set_major_formatter(plt.FuncFormatter(formatter))

        # Right column: distribution of the values, sharing the y axis.
        rax = fig.add_subplot(spec[row, 1], sharey=lax)
        rg = sns.boxenplot(data=sampled, 
                           y=sampled.value, 
                           x=sampled.variable,
                           palette=palette,
                           #showfliers=showfliers,
                           order=plotted_vars,
                           ax=rax)
        rg.set(xlabel="Variables", ylabel=None, title="Boxplots")
        rg.yaxis.set_major_formatter(plt.FuncFormatter(formatter))

    type_groups = group.groupby('datatype')
    # One figure row per chunk of SERIES_PER_SUBPLOT variables of a data type.
    n_subplots = sum(math.ceil(n_vars / SERIES_PER_SUBPLOT)
                     for n_vars in type_groups.variable.nunique())
    if n_subplots == 0:
        # Nothing to plot; a zero-row figure/grid cannot be created.
        return

    fig, spec = init_figure(n_subplots)
    try:
        row = 0
        for datatype, type_group in type_groups:
            palette = make_palette(type_group)
            variables = type_group.variable.unique()
            for start in range(0, len(variables), SERIES_PER_SUBPLOT):
                chunk = variables[start:start + SERIES_PER_SUBPLOT]
                unsampled = type_group[type_group['variable'].isin(chunk)]
                sampled = unsampled.sample(frac=SAMPLE_FRACTIONS,
                                           random_state=random_state)
                # A small chunk can yield an empty sample; its row is left
                # blank instead of aborting the whole figure.
                if not sampled.empty:
                    make_subplot(fig, spec, row, datatype, palette,
                                 sampled, unsampled)
                row += 1

        if save:
            fig.savefig(GRAPHICS_DIR + "distributions/" + can + "-" + uid + ".sampled." + 
                        '{:.0f}'.format(100*SAMPLE_FRACTIONS) + "perc." + FIG_EXTENSION)
        if show:
            plt.show()
    finally:
        # Always release the figure, also when plotting fails, so batch runs
        # over many IDs do not accumulate open figures.
        plt.close(fig)
    return

#uid = next(iter(df.id.unique()))
#can = df[df['id']==uid].can.unique()[0]
#make_distribution_charts(uid, 
#                         can, 
#                         df.groupby(['id','can']).get_group((uid,can)), 
#                         xlim_min=xlim[0], 
#                         xlim_max=xlim[1],
#                         SAMPLE_FRACTIONS=0.05,
#                         RATIOS=[10,2],
#                         SUBPLOT_HEIGHT=2,
#                         save=False,
#                         show=True
#                        )

Generate and save the distribution charts for each ID/CAN-line pair, once for each sampling fraction.

In [None]:
%%time

# Figures are only written to disk in the batch run, not displayed.
show = False

# Sampling fractions to export; extend the list to produce more variants.
FRACTIONS = [0.05]#, 0.1, 0.5, 0.75, 1.0]
# `id` and `can` are categorical: observed=True iterates only over the
# ID/CAN-line combinations that actually have rows to plot.
mb = master_bar(df.groupby(['id','can'], observed=True))
for (uid, can), group in mb:
    pb = progress_bar(FRACTIONS, parent=mb)
    for f in pb:
        mb.comment = 'CAN:' + can + ' - UID: ' + uid
        pb.comment = 'SAMPLING ' + '{:.0f}'.format(100*f) + "%"
        try:
            make_distribution_charts(uid, 
                                     can, 
                                     group, 
                                     xlim_min=xlim[0], 
                                     xlim_max=xlim[1],
                                     SAMPLE_FRACTIONS=f,
                                     save=True,
                                     show=show
                                    )
        except Exception as ex:
            # Best effort: report the failure (repr keeps the exception type),
            # skip the remaining fractions of this ID and go on with the next.
            print(uid, can, f, repr(ex))
            break
    