In [1]:
import pandas as pd
import keepsake
import numpy as np
import scipy.stats as stats

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

plt.style.use('ggplot')

%matplotlib inline

In [2]:
# Only needs to be run once, to create the keepsake.yaml repository config

# ! echo 'repository: "file://.keepsake"' > keepsake.yaml 

In [3]:
# Build the column-selection patterns used by the training sweeps: the three
# accelerometer axes and the label are always present, joined with every
# non-empty subset of the gyroscope axes.
from itertools import combinations

comb_list = [
    f'acc_x|acc_y|acc_z|{"|".join(gyro_subset)}|label'
    for subset_size in (1, 2, 3)
    for gyro_subset in combinations(['gyro_x','gyro_y','gyro_z'], subset_size)
]
comb_list

['acc_x|acc_y|acc_z|gyro_x|label',
 'acc_x|acc_y|acc_z|gyro_y|label',
 'acc_x|acc_y|acc_z|gyro_z|label',
 'acc_x|acc_y|acc_z|gyro_x|gyro_y|label',
 'acc_x|acc_y|acc_z|gyro_x|gyro_z|label',
 'acc_x|acc_y|acc_z|gyro_y|gyro_z|label',
 'acc_x|acc_y|acc_z|gyro_x|gyro_y|gyro_z|label']

In [4]:
# Load the 20 Hz, 25 Hz and 50 Hz exports of the transformed data set.
# The 10 Hz and 100 Hz exports (same naming scheme) are not loaded here.
df_20hz, df_25hz, df_50hz = map(pd.read_csv, (
    'data/transformed/20210529_v2_data_all_20hz.csv',
    'data/transformed/20210529_v2_data_all_25hz.csv',
    'data/transformed/20210529_v2_data_all_50hz.csv',
))

In [5]:
def get_df_base(df):
    '''
        Returns the "base" view of a data frame: only the rows whose
        'shift' column equals 0, with every row that contains a missing
        value removed. The input frame is not modified.
    '''
    unshifted_rows = df['shift'].eq(0)
    base_rows = df.loc[unshifted_rows]
    return base_rows.dropna(axis='index')

In [6]:
# Reduce each loaded frame to its base view (shift == 0, no missing values).
df_20hz, df_25hz, df_50hz = (
    get_df_base(loaded_frame) for loaded_frame in (df_20hz, df_25hz, df_50hz)
)

In [7]:
def save_model_optimized(classifier, stage, dataset, model_type, exp_id):
    '''
        Exports a model to Python source with m2cgen and writes it to
        models/<stage>/<dataset>/<model_type>/<model_type>_<exp_id>.py,
        creating the folder if needed.

        classifier - model handed to m2cgen.export_to_python (callers pass a fitted estimator)
        stage - baseline/optimized
        dataset - base/centered/end/etc
        model_type - decision_tree, random_forest, ...
        exp_id - identifier used as the file-name suffix (callers pass a shortened experiment id)

        Returns the path of the written file, relative to the working directory.
    '''

    import os
    import m2cgen as m2c

    BASE_PATH = f'models/{stage}/{dataset}/{model_type}/'
    FILE_NAME = f'{model_type}_{exp_id}.py'

    # exist_ok=True replaces the exists()/makedirs() pair, which could fail if
    # the folder appeared between the check and the creation.
    os.makedirs(BASE_PATH, exist_ok=True)

    code = m2c.export_to_python(classifier)
    with open(BASE_PATH + FILE_NAME, 'w') as f:
        # `code` is one string: write() emits it in a single call, whereas
        # writelines() would iterate over it character by character.
        f.write(code)

    return BASE_PATH + FILE_NAME

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics

# Random-forest sweep: for every sampling rate, feature combination, test-split
# size and forest size, train a classifier and evaluate it on the held-out
# split. Runs whose macro recall exceeds `cutoff` are printed and, when
# `is_save_model` is set, exported and logged as keepsake experiments.
is_save_model = True
model_type = 'random_forest'
stage = 'optimized'
dataset = 'base'
quantization = None
estimators = None  # placeholder; rebound by the forest-size loop below

cutoff=0.99
dataset_test_sizes = [0.35]
datasets_setup = [(df_20hz, 20), (df_25hz, 25), (df_50hz, 50)]

for df_t in datasets_setup:
    for comb in comb_list:
        for dataset_test_size in dataset_test_sizes:
            # DataFrame.filter(regex=...) matches with re.search, so the pattern is
            # anchored to select only columns named exactly as listed in `comb`;
            # an unanchored pattern would also pull in any column that merely
            # contains one of the names, and the logged `features` would be wrong.
            df_filtered = df_t[0].filter(regex=f'^(?:{comb})$')
            X_train, X_test, y_train, y_test = train_test_split(
                df_filtered.drop('label',axis=1), df_filtered['label'], test_size=dataset_test_size, random_state=42)

            for estimators in [4,5,6]:
                clf = RandomForestClassifier(n_jobs=-1, n_estimators=estimators, random_state=42)
                clf.fit(X_train, y_train)

                y_pred = clf.predict(X_test)

                accuracy = metrics.accuracy_score(y_test, y_pred)
                f1 = metrics.f1_score(y_test, y_pred, average='macro')
                precision = metrics.precision_score(y_test, y_pred, average='macro')
                recall = metrics.recall_score(y_test, y_pred, average='macro')

                if recall > cutoff:
                    signals = comb.replace('|label','').split('|')
                    print(f"Signals: {signals} @ {df_t[1]} >> Acc: {accuracy}, Prec: {precision}, Recall: {recall}")
                    
                    if is_save_model:
                        experiment = keepsake.init(
                            params={
                                'model':model_type,
                                'features': signals,
                                'feature_count': len(signals),
                                'n_estimators': estimators,
                                'dataset_test_size': dataset_test_size,
                                'hz':df_t[1],
                                'data_set':dataset,
                                'quantization': quantization,
                                'other_params': 'default',
                                'accuracy': accuracy,
                                'precision':precision,
                                'recall':recall,
                                'f1':f1
                                })

                        try:
                            path = save_model_optimized(clf, stage=stage, dataset=dataset, model_type=model_type, exp_id=experiment.id[:7])

                            experiment.checkpoint(
                                path=path,
                                # key spelled "accuracy" (was "accurracy") so it matches the param name
                                metrics={"accuracy":accuracy, "f1": f1, "precision": precision, "recall": recall},
                                primary_metric=("recall","maximize")
                            )
                        finally:
                            # Stop the experiment even when export or checkpointing
                            # raises, so a failed run is not left open.
                            experiment.stop()

[2m═══╡ [0mCreating experiment 78ad688...
[2m═══╡ [0mCreating checkpoint cf9d863, copying 'models/optimized/base/random_forest/random_forest_78ad688.py' to 'file:///Users/tk/projects/tinyml-esp32-data/.keepsake' in the background...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'] @ 20 >> Acc: 1.0, Prec: 1.0, Recall: 1.0
Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'] @ 20 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment d5da9fe...
[2m═══╡ [0mCreating checkpoint 5f27590, copying 'models/optimized/base/random_forest/random_forest_d5da9fe.py' to 'file:///Users/tk/projects/tinyml-esp32-data/.keepsake' in the background...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'] @ 20 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 5b65d1f...
[2m═══╡ [0mCreating checkpoint 16fff24, copying 'models/optimized/base/random_forest/random_forest_5b65d1f.py' to 'file:///Users/tk/projects/tinyml-esp32-data/.keepsake' in the background...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_z'] @ 20 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 2c811ab...
[2m═══╡ [0mCreating checkpoint c84f79e, copying 'models/optimized/base/random_forest/random_forest_2c811ab.py' to 'file:///Users/tk/projects/tinyml-esp32-data/.keepsake' in the background...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z'] @ 25 >> Acc: 0.9875, Prec: 0.984375, Recall: 0.9924242424242424


[2m═══╡ [0mCreating experiment b159d83...
[2m═══╡ [0mCreating checkpoint 4327f17, copying 'models/optimized/base/random_forest/random_forest_b159d83.py' to 'file:///Users/tk/projects/tinyml-esp32-data/.keepsake' in the background...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z'] @ 25 >> Acc: 0.9875, Prec: 0.984375, Recall: 0.9924242424242424


[2m═══╡ [0mCreating experiment 263279a...
[2m═══╡ [0mCreating checkpoint 0f3d3e5, copying 'models/optimized/base/random_forest/random_forest_263279a.py' to 'file:///Users/tk/projects/tinyml-esp32-data/.keepsake' in the background...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'] @ 50 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 628e029...
[2m═══╡ [0mCreating checkpoint d04857d, copying 'models/optimized/base/random_forest/random_forest_628e029.py' to 'file:///Users/tk/projects/tinyml-esp32-data/.keepsake' in the background...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_z'] @ 50 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 232d3ae...
[2m═══╡ [0mCreating checkpoint 0eab042, copying 'models/optimized/base/random_forest/random_forest_232d3ae.py' to 'file:///Users/tk/projects/tinyml-esp32-data/.keepsake' in the background...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z'] @ 50 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment f28b45c...
[2m═══╡ [0mCreating checkpoint e6409a1, copying 'models/optimized/base/random_forest/random_forest_f28b45c.py' to 'file:///Users/tk/projects/tinyml-esp32-data/.keepsake' in the background...


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics

# Decision-tree sweep: for every sampling rate, feature combination and
# test-split size, train a classifier and evaluate it on the held-out split.
# Runs whose macro recall exceeds `cutoff` are printed and, when
# `is_save_model` is set, exported and logged as keepsake experiments.
is_save_model = True
model_type = 'decision_tree'
stage = 'optimized'
dataset = 'base'
quantization = None
estimators = None  # a single tree has no forest size; logged as None

cutoff=0.99
dataset_test_sizes = [0.35]
datasets_setup = [(df_20hz, 20), (df_25hz, 25), (df_50hz, 50)]

for df_t in datasets_setup:
    for comb in comb_list:
        for dataset_test_size in dataset_test_sizes:
            # DataFrame.filter(regex=...) matches with re.search, so the pattern is
            # anchored to select only columns named exactly as listed in `comb`;
            # an unanchored pattern would also pull in any column that merely
            # contains one of the names, and the logged `features` would be wrong.
            df_filtered = df_t[0].filter(regex=f'^(?:{comb})$')
            X_train, X_test, y_train, y_test = train_test_split(
                df_filtered.drop('label',axis=1), df_filtered['label'], test_size=dataset_test_size, random_state=42)

            clf = DecisionTreeClassifier(random_state=42)
            clf.fit(X_train, y_train)

            y_pred = clf.predict(X_test)

            accuracy = metrics.accuracy_score(y_test, y_pred)
            f1 = metrics.f1_score(y_test, y_pred, average='macro')
            precision = metrics.precision_score(y_test, y_pred, average='macro')
            recall = metrics.recall_score(y_test, y_pred, average='macro')

            if recall > cutoff:
                signals = comb.replace('|label','').split('|')
                print(f"Signals: {signals} @ {df_t[1]} >> Acc: {accuracy}, Prec: {precision}, Recall: {recall}")

                if is_save_model:
                    experiment = keepsake.init(
                        params={
                            'model':model_type,
                            'features': signals,
                            'feature_count': len(signals),
                            'n_estimators': estimators,
                            'dataset_test_size': dataset_test_size,
                            'hz':df_t[1],
                            'data_set':dataset,
                            'quantization': quantization,
                            'other_params': 'default',
                            'accuracy': accuracy,
                            'precision':precision,
                            'recall':recall,
                            'f1':f1
                            })

                    try:
                        path = save_model_optimized(clf, stage=stage, dataset=dataset, model_type=model_type, exp_id=experiment.id[:7])

                        experiment.checkpoint(
                            path=path,
                            # key spelled "accuracy" (was "accurracy") so it matches the param name
                            metrics={"accuracy":accuracy, "f1": f1, "precision": precision, "recall": recall},
                            primary_metric=("recall","maximize")
                        )
                    finally:
                        # Stop the experiment even when export or checkpointing
                        # raises, so a failed run is not left open.
                        experiment.stop()

Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x'] @ 25 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 30462fd...
[2m═══╡ [0mCreating checkpoint eafbd06, copying 'models/optimized/base/decision_tree/decision_tree_30462fd.py' to 'file:///Users/tk/projects/tinyml-esp32-data/.keepsake' in the background...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_y'] @ 25 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 36284c7...
[2m═══╡ [0mCreating checkpoint 727383b, copying 'models/optimized/base/decision_tree/decision_tree_36284c7.py' to 'file:///Users/tk/projects/tinyml-esp32-data/.keepsake' in the background...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_z'] @ 25 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 4edbe66...
[2m═══╡ [0mCreating checkpoint 55f4a25, copying 'models/optimized/base/decision_tree/decision_tree_4edbe66.py' to 'file:///Users/tk/projects/tinyml-esp32-data/.keepsake' in the background...


In [12]:
# List random-forest experiments logged with accuracy == 1.0. Params are read
# with .get() on a dict fallback so experiments recorded without these keys
# (or without params) are skipped instead of aborting the listing.
keepsake.experiments.list(lambda exp: (exp.params or {}).get('accuracy') == 1.0 and (exp.params or {}).get('model') == 'random_forest')

id,created,params,latest_checkpoint,best_checkpoint
78ad688,2021-06-11 18:36:04.393607,"{'recall': 1, 'n_estimators': 4, 'hz': 20, 'precision': 1, 'accuracy': 1, 'quantization': None, 'model': 'random_forest', 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'], 'data_set': 'base', 'other_params': 'default', 'f1': 1, 'dataset_test_size': 0.35, 'feature_count': 5}",cf9d863 (step 0; recall: 1),cf9d863 (step 0; recall: 1)
d5da9fe,2021-06-11 18:36:05.494630,"{'model': 'random_forest', 'accuracy': 1, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'], 'recall': 1, 'feature_count': 5, 'n_estimators': 5, 'other_params': 'default', 'precision': 1, 'quantization': None, 'f1': 1, 'dataset_test_size': 0.35, 'hz': 20, 'data_set': 'base'}",5f27590 (step 0; recall: 1),5f27590 (step 0; recall: 1)
5b65d1f,2021-06-11 18:36:06.563546,"{'f1': 1, 'quantization': None, 'recall': 1, 'data_set': 'base', 'feature_count': 5, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'], 'dataset_test_size': 0.35, 'hz': 20, 'model': 'random_forest', 'accuracy': 1, 'other_params': 'default', 'precision': 1, 'n_estimators': 6}",16fff24 (step 0; recall: 1),16fff24 (step 0; recall: 1)
2c811ab,2021-06-11 18:36:07.637670,"{'model': 'random_forest', 'quantization': None, 'precision': 1, 'feature_count': 5, 'recall': 1, 'hz': 20, 'data_set': 'base', 'accuracy': 1, 'f1': 1, 'n_estimators': 4, 'other_params': 'default', 'dataset_test_size': 0.35, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_z']}",c84f79e (step 0; recall: 1),c84f79e (step 0; recall: 1)
628e029,2021-06-11 18:36:11.523512,"{'f1': 1, 'accuracy': 1, 'model': 'random_forest', 'hz': 50, 'recall': 1, 'n_estimators': 6, 'precision': 1, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'], 'quantization': None, 'dataset_test_size': 0.35, 'feature_count': 5, 'data_set': 'base', 'other_params': 'default'}",d04857d (step 0; recall: 1),d04857d (step 0; recall: 1)
232d3ae,2021-06-11 18:36:12.643836,"{'accuracy': 1, 'model': 'random_forest', 'recall': 1, 'feature_count': 5, 'f1': 1, 'hz': 50, 'precision': 1, 'quantization': None, 'data_set': 'base', 'n_estimators': 6, 'dataset_test_size': 0.35, 'other_params': 'default', 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_z']}",0eab042 (step 0; recall: 1),0eab042 (step 0; recall: 1)
f28b45c,2021-06-11 18:36:13.792155,"{'quantization': None, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z'], 'data_set': 'base', 'accuracy': 1, 'model': 'random_forest', 'n_estimators': 5, 'other_params': 'default', 'feature_count': 6, 'f1': 1, 'dataset_test_size': 0.35, 'precision': 1, 'recall': 1, 'hz': 50}",e6409a1 (step 0; recall: 1),e6409a1 (step 0; recall: 1)


In [13]:
# List decision-tree experiments logged with accuracy == 1.0. Params are read
# with .get() on a dict fallback so experiments recorded without these keys
# (or without params) are skipped instead of aborting the listing.
keepsake.experiments.list(lambda exp: (exp.params or {}).get('accuracy') == 1.0 and (exp.params or {}).get('model') == 'decision_tree')

id,created,params,latest_checkpoint,best_checkpoint
30462fd,2021-06-11 18:36:14.962589,"{'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x'], 'accuracy': 1, 'dataset_test_size': 0.35, 'precision': 1, 'data_set': 'base', 'model': 'decision_tree', 'quantization': None, 'other_params': 'default', 'feature_count': 4, 'recall': 1, 'f1': 1, 'hz': 25, 'n_estimators': None}",eafbd06 (step 0; recall: 1),eafbd06 (step 0; recall: 1)
36284c7,2021-06-11 18:36:16.018638,"{'other_params': 'default', 'accuracy': 1, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_y'], 'feature_count': 4, 'hz': 25, 'model': 'decision_tree', 'dataset_test_size': 0.35, 'precision': 1, 'quantization': None, 'data_set': 'base', 'recall': 1, 'f1': 1, 'n_estimators': None}",727383b (step 0; recall: 1),727383b (step 0; recall: 1)
4edbe66,2021-06-11 18:36:17.069445,"{'model': 'decision_tree', 'data_set': 'base', 'other_params': 'default', 'accuracy': 1, 'recall': 1, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_z'], 'feature_count': 4, 'dataset_test_size': 0.35, 'f1': 1, 'quantization': None, 'n_estimators': None, 'hz': 25, 'precision': 1}",55f4a25 (step 0; recall: 1),55f4a25 (step 0; recall: 1)
