# Sustainable Mobility: classification of electric scooter rides

## Get Data

In [1]:
!git clone https://github.com/DPaletti/mida_acv
!mv mida_acv/data .
!yes|rm -r mida_acv

Cloning into 'mida_acv'...
remote: Enumerating objects: 249, done.[K
remote: Counting objects: 100% (54/54), done.[K
remote: Compressing objects: 100% (49/49), done.[K
remote: Total 249 (delta 6), reused 46 (delta 2), pack-reused 195[K
Receiving objects: 100% (249/249), 250.21 MiB | 26.30 MiB/s, done.
Resolving deltas: 100% (23/23), done.
Checking out files: 100% (162/162), done.


## Dependencies

In [2]:
!pip install tsfresh
!pip install rdp
!pip install plotly
!pip install joblib
!pip install sktime

Collecting tsfresh
[?25l  Downloading https://files.pythonhosted.org/packages/22/7f/53e845c3e19078d15e228db642ad06d5a91207a66115cb4f30a2eca28f17/tsfresh-0.18.0-py2.py3-none-any.whl (94kB)
[K     |████████████████████████████████| 102kB 4.5MB/s 
Collecting distributed>=2.11.0
[?25l  Downloading https://files.pythonhosted.org/packages/0c/b0/3454dc44239c526f9c9e4cf04f62823776b71f927db74302986d56e7a9a1/distributed-2021.4.0-py3-none-any.whl (684kB)
[K     |████████████████████████████████| 686kB 11.0MB/s 
[?25hCollecting stumpy>=1.7.2
[?25l  Downloading https://files.pythonhosted.org/packages/4c/da/8d372a1af518930ecb3ad9acc627115450149b613ba1b9b51b4d3721218e/stumpy-1.8.0-py3-none-any.whl (94kB)
[K     |████████████████████████████████| 102kB 9.3MB/s 
Collecting matrixprofile>=1.1.10<2.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/b1/c3/43d282f0e5299f977b62e53e4dde22ddb14c90877af5b62af225fa783d8e/matrixprofile-1.1.10-cp37-cp37m-manylinux2010_x86_64.whl (1.1MB)
[K   

Collecting rdp
  Downloading https://files.pythonhosted.org/packages/67/42/80a54cc4387256335c32b48bd42db80967ab5f40d6ffcd8167b3dd988c11/rdp-0.8.tar.gz
Building wheels for collected packages: rdp
  Building wheel for rdp (setup.py) ... [?25l[?25hdone
  Created wheel for rdp: filename=rdp-0.8-cp37-none-any.whl size=4569 sha256=d7a5b95ca54277be3d2d3f893c75942f024e2b3a3d55949cfb572c81dcfcbd8d
  Stored in directory: /root/.cache/pip/wheels/76/e4/02/c738593caece49c63180d093651bec3cd3b02ea3248f076f07
Successfully built rdp
Installing collected packages: rdp
Successfully installed rdp-0.8
Collecting sktime
[?25l  Downloading https://files.pythonhosted.org/packages/ed/0b/ee4c2a9f2ef22eea4e202c4740142f3dfb8a3e5f9f1b36731b39b58ca432/sktime-0.6.0-cp37-cp37m-manylinux2014_x86_64.whl (5.7MB)
[K     |████████████████████████████████| 5.7MB 4.5MB/s 
Collecting statsmodels>=0.12.1
[?25l  Downloading https://files.pythonhosted.org/packages/da/69/8eef30a6237c54f3c0b524140e2975f4b1eea3489b45eb3339574

In [3]:
# After installing tsfresh the runtime needs to be restarted before the
# package can be imported; exit() ends the current session. Continue from
# the Imports cell once the runtime is back.
exit()

## Imports

In [1]:
import multiprocessing
from itertools import repeat
from pathlib import Path
from tempfile import mkdtemp
from typing import Tuple, List, Dict, Optional

import joblib
import numpy as np
import pandas as pd
import plotly as plt
import plotly.graph_objects as go
import rdp
import scipy as sp
import scipy.signal
import sklearn as sk
import sklearn.decomposition
import sklearn.linear_model
import sklearn.model_selection
import sklearn.pipeline
import sklearn.preprocessing
#import sktime as skt
#import sktime.forecasting.model_selection
import tsfresh as ts
import tsfresh.feature_extraction
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
plt.io.renderers.default = 'iframe'

## Definitions

### Read data

In [7]:
def read_data(data_path: str):
  """Load every ride CSV under ``data_path`` and build feature frames and labels.

  Files are read from ``<data_path>/<placement>/<driver_number>/`` for
  placement in ("deck", "stem") and driver_number in ("single", "double").
  Directories and files are visited in a fixed, sorted order so that the
  sequential ride ``id`` assigned to each file is the same on every run.

  Returns a 9-tuple:
    X                  all rides stacked in long format (NaN replaced by 0) with
                       the added columns ``id``, ``full_id``
                       ("<placement>-<driver_number>-<Driver>") and ``IsDeck``
                       (1 for deck rides, 0 for stem rides)
    X_deck, X_stem     rows of X for one placement, without ``IsDeck``
    y_weight           weight class per ride id
                       (0: < 70, 1: [70, 90), 2: [90, 110), 3: >= 110)
    y_passengers       0 for "single" rides, 1 for "double" rides
    y_weight_deck, y_passengers_deck, y_weight_stem, y_passengers_stem
                       the same labels restricted to one placement

  Raises:
    ValueError: if no ride file is found under ``data_path``.
  """
  def to_weight_class(weight: float) -> int:
    # Bin the first-row "Weight" value of a ride into four ordinal classes.
    if weight < 70:
      return 0
    if weight < 90:
      return 1
    if weight < 110:
      return 2
    return 3

  frames = []
  weights = {}
  passengers = {}
  placements = {}
  ride_id = 0
  # Tuples (not sets) and sorted() keep the id assignment deterministic.
  for placement in ("deck", "stem"):
    for driver_number in ("single", "double"):
      ride_dir = Path(data_path).joinpath(placement, driver_number)
      for ds in sorted(ride_dir.iterdir()):
        ride = pd.read_csv(str(ds))
        driver = str(ride["Driver"].iloc[0])
        weights[ride_id] = ride["Weight"].iloc[0]
        passengers[ride_id] = 0 if driver_number == "single" else 1
        placements[ride_id] = placement
        ride = ride.assign(
            id=ride_id,
            full_id=placement + "-" + driver_number + "-" + driver,
            IsDeck=1 if placement == "deck" else 0,
        )
        frames.append(
            ride.drop(["Unnamed: 0", "Driver", "Placement", "Weight"], axis=1)
        )
        ride_id += 1

  if not frames:
    raise ValueError("no ride files found under " + repr(data_path))

  # Each ride keeps its own 0..n-1 row index, exactly as it was read.
  X = pd.concat(frames).fillna(0)

  y_weight = pd.Series(weights, dtype=np.float64).fillna(0).map(to_weight_class)
  y_passengers = pd.Series(passengers, dtype=np.int64)

  deck_ids = [i for i, p in placements.items() if p == "deck"]
  stem_ids = [i for i, p in placements.items() if p == "stem"]
  # Plain boolean arrays avoid index alignment on the repeated row labels.
  deck_rows = X["id"].isin(deck_ids).to_numpy()
  stem_rows = X["id"].isin(stem_ids).to_numpy()

  return (X,
          X[deck_rows].drop("IsDeck", axis=1),
          X[stem_rows].drop("IsDeck", axis=1),
          y_weight,
          y_passengers,
          y_weight.loc[deck_ids],
          y_passengers.loc[deck_ids],
          y_weight.loc[stem_ids],
          y_passengers.loc[stem_ids])


### Align signals

In [5]:
def align_signal(signal_1: np.array, signal_2: np.array, col_name:str):
  """Z-score both signals and blank the leading part of ``signal_2``.

  The offset is the argmax of the 'valid' convolution of the reversed,
  normalised ``signal_1`` with the normalised ``signal_2``.  The first
  ``offset`` samples of the normalised ``signal_2`` are replaced by zeros,
  so the result has the same length as ``signal_2``.

  Returns:
    (aligned normalised ``signal_2``, ``col_name``) -- the column name is
    passed through unchanged so results can be matched back to columns.
  """
  reference = (signal_1 - np.mean(signal_1)) / np.std(signal_1)
  candidate = (signal_2 - np.mean(signal_2)) / np.std(signal_2)

  scores = np.convolve(reference[::-1], candidate, mode='valid')
  offset = np.argmax(scores)

  leading_zeros = [0] * offset
  aligned = np.append(leading_zeros, candidate[offset:])
  return aligned, col_name

def align(X: pd.DataFrame):
  """Align every stem ride to the deck ride with the same driver setup.

  Rides are paired when everything after the placement prefix of
  ``full_id`` matches.  For each pair, every signal column (all columns
  except the ignored bookkeeping/GPS ones) of the stem ride is passed
  through ``align_signal`` against the same-position column of the deck
  ride.  The output stacks, for each pair, the unchanged deck ride followed
  by the aligned stem ride.

  Note that ``align_signal`` returns a z-scored signal, so aligned stem
  columns are normalised while deck columns keep their raw values.

  Returns:
    A DataFrame with the columns of ``X`` (empty if no pair matches).
    Earlier revisions returned a 1-tuple because of a stray trailing comma.
  """
  columns_to_ignore = ["full_id", "IsDeck", "Timestamp", "id", "Latitude", "Longitude", "Speed", "Confidence"]

  # Split rides by placement once instead of re-filtering X for every deck ride.
  placement = X["full_id"].str.split("-").str[0].to_numpy()
  deck_rides = [ride for _, ride in X[placement == "deck"].groupby("id")]
  stem_rides = [ride for _, ride in X[placement == "stem"].groupby("id")]

  pieces = []
  # One worker pool is shared by all ride pairs.
  with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
    for deck_ride in deck_rides:
      deck_key = deck_ride["full_id"].iloc[0].split("-")[1:]
      deck_signals = deck_ride.drop(columns_to_ignore, axis=1)
      for stem_ride in stem_rides:
        if stem_ride["full_id"].iloc[0].split("-")[1:] != deck_key:
          continue
        stem_signals = stem_ride.drop(columns_to_ignore, axis=1)
        aligned_signals = pool.starmap(
            align_signal,
            [(deck_item[1], stem_item[1], stem_item[0])
             for deck_item, stem_item
             in zip(deck_signals.items(), stem_signals.items())])

        # Keep the ignored columns as they are and overwrite the signals.
        aligned_ride = stem_ride.copy()
        for aligned_signal, col in aligned_signals:
          aligned_ride[col] = aligned_signal
        pieces.append(deck_ride)
        pieces.append(aligned_ride)

  if not pieces:
    return pd.DataFrame(columns=X.columns)
  return pd.concat(pieces)


### Path simplification

In [6]:
def get_path(X: pd.DataFrame) -> np.array:
    """Return the GPS track of ``X`` as an ``(n_rows, 2)`` array.

    Column 0 holds ``Latitude`` and column 1 holds ``Longitude``, in row
    order.  An empty frame yields an array of shape ``(0, 2)``.
    """
    # Column selection is vectorised; no need to walk the frame row by row.
    return X[["Latitude", "Longitude"]].to_numpy()

def simplify_path(X: pd.DataFrame,epsilon: float = 1e-6):
    """Drop the rows of each ride that ``rdp`` marks as redundant on its GPS track.

    Args:
      X: long-format rides with ``id``, ``full_id``, ``Latitude`` and
        ``Longitude`` columns.
      epsilon: tolerance forwarded to ``rdp.rdp``; values ``<= 0`` disable
        simplification and ``X`` is returned as is.

    Returns:
      The kept rows of every ride, stacked in ``id`` order.  When ``X``
      contains no ride the result is an empty frame with the columns of ``X``.
    """
    if epsilon <= 0:
        return X

    simplified = []
    for _, ride in X.groupby("id"):
        print("Simplifying: " + ride["full_id"].iloc[0])
        # return_mask=True yields one boolean per row: True means "keep".
        keep = rdp.rdp(get_path(ride), epsilon=epsilon, return_mask=True)
        simplified.append(ride[np.asarray(keep, dtype=bool)])

    if not simplified:
        return pd.DataFrame(columns=X.columns)
    # A single concat replaces the row-by-row DataFrame.append accumulation.
    return pd.concat(simplified)

### Windowing

In [12]:
def window(X: pd.DataFrame,
           y_weight: pd.Series,
           y_passengers: pd.Series,
           rolling_direction: int=1,
           min_timeshift: int=0,
           max_timeshift: Optional[int] = None):
    """Cut every ride into rolling windows and repeat its labels per window.

    Args:
      X: long-format rides with ``id`` and ``Timestamp`` columns.
      y_weight: weight label per original ride id.
      y_passengers: passenger label per original ride id.
      rolling_direction, min_timeshift, max_timeshift: forwarded to
        tsfresh's ``roll_time_series``.

    Returns:
      ``(X_rolled, y_weight_out, y_passengers_out)`` where the two label
      arrays are float64 and hold one entry per window, in the order in
      which ``X_rolled.groupby("id")`` yields the windows.
    """
    X_rolled = ts.utilities.dataframe_functions.roll_time_series(
            X,
            column_id="id",
            column_sort="Timestamp",
            column_kind=None,
            min_timeshift=min_timeshift,
            max_timeshift=max_timeshift,
            rolling_direction=rolling_direction,
            n_jobs=multiprocessing.cpu_count(),
        )

    # After rolling, each window's "id" is a sequence whose first element is
    # the original ride id; that element is the key into the label series.
    ride_ids = [
        rolled["id"].values[0][0] for _, rolled in X_rolled.groupby("id")
    ]

    y_weight_out = np.array([y_weight[i] for i in ride_ids], dtype=np.float64)
    # Use the y_passengers argument here, not a same-named notebook global.
    y_passengers_out = np.array(
        [y_passengers[i] for i in ride_ids], dtype=np.float64
    )
    return X_rolled, y_weight_out, y_passengers_out




### Feature Extraction

In [16]:
def extract_features(X: pd.DataFrame):
  """Run tsfresh feature extraction on ``X`` with the minimal feature set.

  Series are identified by the ``id`` column and ordered by ``Timestamp``;
  the work is spread over one job per CPU reported by ``multiprocessing``.
  """
  workers = multiprocessing.cpu_count()
  feature_settings = ts.feature_extraction.MinimalFCParameters()
  features = ts.extract_features(
      X,
      column_id="id",
      column_sort="Timestamp",
      n_jobs=workers,
      default_fc_parameters=feature_settings,
  )
  return features

### Feature Selection

In [17]:
def select_features(X: pd.DataFrame, y: np.array):
  """Return the result of tsfresh's ``select_features`` for features ``X`` and target ``y``."""
  selected = ts.select_features(X, y)
  return selected


### Visualization

In [10]:
def visualize_signal(
    df: pd.DataFrame, signal: str, unit: str, label, *args: Tuple[pd.DataFrame, str]
):
    """Plot one signal over ``Timestamp`` for one or more frames.

    Args:
      df: frame drawn as the first trace.
      signal: column to plot.  For the prefixes "A", "G" and "Jerk_" the
        column is ``signal + axis``: ``df`` is drawn with axis "x" and the
        first two extra frames with axes "y" and "z" respectively (any
        further extra frame is not drawn).  For any other value every frame
        is drawn with the column ``signal``.
      unit: unit shown in the y-axis title.
      label: legend name of the first trace.
      *args: ``(frame, legend name)`` pairs for additional traces.

    Returns:
      The plotly ``Figure``.
    """
    is_triaxial = signal in {"A", "G", "Jerk_"}

    def make_trace(frame, column, name):
        # All traces share the same look; only data and legend name vary.
        return go.Scatter(
            mode="markers+lines",
            x=frame["Timestamp"],
            y=frame[column],
            marker={"size": 3},
            name=name,
        )

    fig = go.Figure(make_trace(df, signal + "x" if is_triaxial else signal, label))

    fig.update_layout(
        xaxis={"title": "Time (s)"},
        yaxis={"title": signal.replace("_", "") + " (" + unit + ")"},
        width=1920,
        height=1080,
        font=dict(size=18),
        template="plotly_white",
    )

    if is_triaxial:
        # zip stops after two entries: one extra frame per remaining axis.
        for entry, axis in zip(args, ["y", "z"]):
            fig.add_trace(make_trace(entry[0], signal + axis, entry[1]))
    else:
        for _df, _label in args:
            fig.add_trace(make_trace(_df, signal, _label))
    return fig

## Analysis

### Data loading

In [8]:
# Load all rides once; the deck/stem variants are subsets of the full set.
(
    X,
    X_deck,
    X_stem,
    y_weight,
    y_passenger,
    y_weight_deck,
    y_passengers_deck,
    y_weight_stem,
    y_passengers_stem,
) = read_data("./data/simplified_datasets_1e-06")
# Alignment step is currently disabled:
#X_aligned = align(X)

In [9]:
# Weight class per ride id as produced by read_data:
# 0 -> weight < 70, 1 -> [70, 90), 2 -> [90, 110), 3 -> >= 110.
print(y_weight)

0     1
1     3
2     2
3     1
4     0
5     1
6     0
7     1
8     1
9     1
10    1
11    0
12    1
13    2
14    3
15    1
16    1
17    0
18    2
19    1
20    0
21    0
22    3
23    1
24    2
25    0
26    1
27    0
28    1
29    0
30    2
31    1
32    1
33    1
34    0
35    1
36    1
37    3
38    1
39    0
40    2
41    1
42    1
43    0
dtype: int64


### Path simplification

In [10]:
# Requires the "Path simplification" definitions cell (get_path and
# simplify_path) to have been executed in this kernel; otherwise this
# raises NameError.
X_deck_simplified = simplify_path(X_deck)

NameError: name 'simplify_path' is not defined

### Windowing

In [21]:
# Same fixed-length rolling window for both placements.
window_kwargs = dict(rolling_direction=1, min_timeshift=20, max_timeshift=20)

(
    X_deck_windowed,
    y_weight_deck_windowed,
    y_passengers_deck_windowed,
) = window(X_deck, y_weight_deck, y_passengers_deck, **window_kwargs)

(
    X_stem_windowed,
    y_weight_stem_windowed,
    y_passengers_stem_windowed,
) = window(X_stem, y_weight_stem, y_passengers_stem, **window_kwargs)


Your time stamps are not uniformly sampled, which makes rolling nonsensical in some domains.

Rolling: 100%|██████████| 38/38 [00:01<00:00, 19.05it/s]
Rolling: 100%|██████████| 39/39 [00:02<00:00, 17.70it/s]


### Feature Extraction

In [22]:
# full_id is a text label, not a signal, so it is removed before extraction.
deck_signals = X_deck_windowed.drop(columns="full_id")
stem_signals = X_stem_windowed.drop(columns="full_id")

deck_features = extract_features(deck_signals)
stem_features = extract_features(stem_signals)

# Persist the extracted features for later reuse.
deck_features.to_csv("features_deck_minimal.csv")
stem_features.to_csv("features_stem_minimal.csv")

Feature Extraction: 100%|██████████| 40/40 [00:11<00:00,  3.40it/s]
Feature Extraction: 100%|██████████| 40/40 [00:13<00:00,  3.07it/s]


7185

In [25]:
# One selection per (placement, target) pair.
weight_stem_features = select_features(stem_features, y_weight_stem_windowed)
passengers_stem_features = select_features(stem_features, y_passengers_stem_windowed)
weight_deck_features = select_features(deck_features, y_weight_deck_windowed)
passengers_deck_features = select_features(deck_features, y_passengers_deck_windowed)

# Save every selection under its own file name.
for csv_name, selected in (
    ("selected_features_stem_windowed_20_weight.csv", weight_stem_features),
    ("selected_features_stem_windowed_20_passengers.csv", passengers_stem_features),
    ("selected_features_deck_windowed_20_weight.csv", weight_deck_features),
    ("selected_features_deck_windowed_20_passengers.csv", passengers_deck_features),
):
    selected.to_csv(csv_name)

### Pipeline

In [None]:
# Transformer outputs are cached in a throw-away temporary directory.
cachedir = mkdtemp()
memory = joblib.Memory(location=cachedir, verbose=10)

# End-to-end pipeline: windowing -> path simplification -> feature
# extraction -> PCA -> logistic regression (align and select_features
# steps are commented out).
# NOTE(review): `skt` is used below, but `import sktime as skt` is commented
# out in the Imports cell, so this cell raises NameError on a fresh kernel.
# NOTE(review): the "windowing" step is given the SlidingWindowSplitter
# class itself rather than an instance, and it is a splitter from sktime's
# model_selection module -- presumably not usable as a Pipeline step;
# confirm and replace with a transformer.
pipeline = sklearn.pipeline.Pipeline([#("align", sk.preprocessing.FunctionTransformer(align))
                                 ("windowing", skt.forecasting.model_selection.SlidingWindowSplitter),
                                 ("simplify_paths", sk.preprocessing.FunctionTransformer(simplify_path)),
                                 ("extract_features", sk.preprocessing.FunctionTransformer(extract_features)),
                                 ("pca", sk.decomposition.PCA()),
                                 #("select_features", sk.preprocessing.FunctionTransformer(select_features)),
                                 ("logistic_regression", sk.linear_model.LogisticRegression(max_iter=10000, tol=0.1))],
                                memory=memory)

### Cross Validation

In [None]:
# Grid keys follow scikit-learn's "<step name>__<parameter>" convention and
# must use the step names declared in `pipeline` ("windowing",
# "simplify_paths", "pca", "logistic_regression").
param_grid = {
    # NOTE(review): these two entries only work once the "windowing" step is
    # an estimator that exposes min_timeshift / max_timeshift -- confirm.
    "windowing__min_timeshift": [0, 10, 100, 1000, 10000],
    "windowing__max_timeshift": [None, 10, 100, 1000, 10000],
    # simplify_path is wrapped in a FunctionTransformer, so epsilon has to be
    # passed through its kw_args instead of replacing the step with a number.
    "simplify_paths__kw_args": [
        {"epsilon": epsilon} for epsilon in (0, 1e-6, 1e-9, 1e-12, 1e-15)
    ],
    "pca__n_components": [5, 15, 30, 45, 64],
    "logistic_regression__C": np.logspace(-4, 4, 4),
}
search = sk.model_selection.GridSearchCV(pipeline, param_grid, n_jobs=-1)
# Fit on the loaded data X; passing the pd.DataFrame class itself raised the
# TypeError recorded for this cell.
search.fit(X, y_passenger)


TypeError: ignored

### Fit

#### Weight Fit

In [None]:
# Run the grid search with the per-ride weight class as target.
search.fit(X, y_weight)

#### Passenger Fit

In [None]:
# Run the grid search with the passenger label (0 = single, 1 = double) as target.
search.fit(X, y_passenger)