In [69]:
import numpy as np
import pandas as pd
import pickle

from os import listdir
from os.path import join, isdir
from sklearn import model_selection, svm
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [70]:
# отримаємо список папок з данними
path = 'data'
dirs = [data for data in os.listdir(path) if os.path.isdir(os.path.join(path, data))]
dirs

['idle', 'running', 'stairs', 'walking']

In [71]:
# дізнаємось к-сть файлів в кожній з папок
for d in dirs:
    path_to_dir = os.path.join(path, d)
    length = len(os.listdir(path_to_dir))
    print(f'{d}: {length}')

idle: 1039
running: 3408
stairs: 165
walking: 1850


In [72]:
def read_current_csv(i, j):
    """
    _____________________________
    i - number of dir 
    0 - idle
    1 - running
    2 - stairs
    3 - walking
    _____________________________
    j - number of csv file in dir
    j є (0, 3407)
    _____________________________
    return  head(5) of data
    """
    dirs = ["idle", "running", "stairs", "walking"]  
    path = "data"  
    directory_path = os.path.join(path, dirs[i])
    list_of_files = os.listdir(directory_path)

    if j < 0 or j >= len(list_of_files):
        raise IndexError(f" j must be in 0 to {len(list_of_files) - 1}")
    
    file_path = os.path.join(directory_path, list_of_files[j])
    
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"file {file_path} do not exists.")
    
    frame = pd.read_csv(file_path)
    return frame.head(5)

print(read_current_csv(0, 0))

   accelerometer_X  accelerometer_Y  accelerometer_Z
0         1.000776         4.616021         8.576031
1         0.718261         4.209007         8.446744
2        -0.909797        -0.282516         9.203311
3         5.099650         0.148441         8.418014
4         1.762132        -0.162806         9.251195


Підготовка даних

In [73]:
def get_statistics(frame):
    features = np.array([])
    features = np.concatenate((features, frame.max(axis=0).values), axis=0)
    features = np.concatenate((features, frame.min(axis=0).values), axis=0)
    features = np.concatenate((features, frame.mean(axis=0).values), axis=0)
    features = np.concatenate((features, frame.std(axis=0).values), axis=0)
    features = np.concatenate((features, frame.var(axis=0).values), axis=0)
    features = np.concatenate((features, frame.median(axis=0).values), axis=0)
    features = np.concatenate((features, frame.idxmax(axis=0).values), axis=0)
    features = np.concatenate((features, frame.idxmin(axis=0).values), axis=0)
    correlations = frame.corr()
    corr = np.array([correlations['accelerometer_X']['accelerometer_Y'], correlations['accelerometer_X']['accelerometer_Z'],
                     correlations['accelerometer_Y']['accelerometer_Z']])
    features = np.concatenate((features, corr), axis=0)

    frame['mean_X'] = frame.mean(axis=0)['accelerometer_X']
    frame['mean_Y'] = frame.mean(axis=0)['accelerometer_Y']
    frame['mean_Z'] = frame.mean(axis=0)['accelerometer_Z']
    
    mae_X = mean_absolute_error(frame['accelerometer_X'], frame['mean_X'])
    mae_Y = mean_absolute_error(frame['accelerometer_Y'], frame['mean_Y'])
    mae_Z = mean_absolute_error(frame['accelerometer_Z'], frame['mean_Z'])
    
    rmse_x = np.sqrt(mean_squared_error(frame['accelerometer_X'], frame['mean_X']))
    rmse_y = np.sqrt(mean_squared_error(frame['accelerometer_Y'], frame['mean_Y']))
    rmse_z = np.sqrt(mean_squared_error(frame['accelerometer_Z'], frame['mean_Z']))

    metrics = np.array([mae_X, mae_Y, mae_Z, rmse_x, rmse_y, rmse_z])
    features = np.concatenate((features, metrics), axis=0)

    return features

In [74]:
len(get_statistics(read_current_csv(0, 0)))

33

In [83]:
# підготовка класу
def class_data_stat_prepare(class_name, class_number):
    path_data = join(path, class_name)
    X = []
    for item in listdir(path_data):
        frame = pd.read_csv(join(path_data, item))
        features = get_statistics(frame)
        X.append(features)

    y = [class_number]*len(X)

    X = np.array(X)
    y = np.array(y)

    return X, y

In [84]:
def create_dataset(class_prepare):
    X_idle, y_idle = class_prepare('idle', 0)
    X_walking, y_walking = class_prepare('walking', 1)
    X_stairs, y_stairs = class_prepare('stairs', 2)
    X_running, y_running = class_prepare('running', 3)

    X = np.concatenate((X_idle, X_walking), axis=0)
    X = np.concatenate((X, X_stairs), axis=0)
    X = np.concatenate((X, X_running), axis=0)

    Y = np.concatenate((y_idle, y_walking), axis=0)
    Y = np.concatenate((Y, y_stairs), axis=0)
    Y = np.concatenate((Y, y_running), axis=0)

    return X, Y

In [85]:
X, y = create_dataset(class_data_stat_prepare)

In [86]:
def save_data(prefix, X, y):
    with open(f'{prefix}_X.pickle', 'wb') as f:
        pickle.dump(X, f)
    with open(f'{prefix}_y.pickle', 'wb') as f:
        pickle.dump(y, f)

In [87]:
save_data('data', X, y)

In [88]:
def load_data(prefix):
    with open(f'{prefix}_X.pickle', 'rb') as f:
        X = pickle.load(f)
    with open(f'{prefix}_y.pickle', 'rb') as f:
        y = pickle.load(f)
    return X, y

In [89]:
X, y = load_data('data')

In [92]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size=0.3)

In [93]:
# порівнння VM Classifier VS RandomForest Classifier
cls_ovo = svm.SVC(decision_function_shape='ovo', kernel='rbf', gamma=0.005, probability=True).fit(X_train, y_train)
cls_ovr = svm.SVC(decision_function_shape='ovr', kernel='rbf', gamma=0.005, probability=True).fit(X_train, y_train)

cls_forest = RandomForestClassifier().fit(X_train, y_train)
svm_ovo_pred = cls_ovo.predict(X_test)
svm_ovr_pred = cls_ovr.predict(X_test)

forest_pred = cls_forest.predict(X_test)
svm_ovo_accuracy = accuracy_score(y_test, svm_ovo_pred)
svm_ovr_accuracy = accuracy_score(y_test, svm_ovr_pred)

forest_ovr_accuracy = accuracy_score(y_test, forest_pred)

print("accuracy SVM ovo: ", svm_ovo_accuracy)
print("accuracy SVM ovr: ", svm_ovr_accuracy)
print("accuracy RandomForest: ", forest_ovr_accuracy)

accuracy SVM ovo:  0.9131299734748011
accuracy SVM ovr:  0.9131299734748011
accuracy RandomForest:  0.9942528735632183


In [94]:
svm_ovo_report = classification_report(y_test, svm_ovo_pred)
print("SVM ovo report: ")
print(svm_ovo_report)

SVM ovo report: 
              precision    recall  f1-score   support

           0       1.00      0.97      0.99       738
           1       0.98      0.76      0.86      1309
           2       1.00      0.42      0.59       114
           3       0.86      1.00      0.93      2363

    accuracy                           0.91      4524
   macro avg       0.96      0.79      0.84      4524
weighted avg       0.92      0.91      0.91      4524



In [95]:
svm_ovr_report = classification_report(y_test, svm_ovr_pred)
print("SVM ovr report: ")
print(svm_ovr_report)

SVM ovr report: 
              precision    recall  f1-score   support

           0       1.00      0.97      0.99       738
           1       0.98      0.76      0.86      1309
           2       1.00      0.42      0.59       114
           3       0.86      1.00      0.93      2363

    accuracy                           0.91      4524
   macro avg       0.96      0.79      0.84      4524
weighted avg       0.92      0.91      0.91      4524



In [96]:
forest_report = classification_report(y_test, forest_pred)
print("RandomForest report: ")
print(forest_report)

RandomForest report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       738
           1       0.98      1.00      0.99      1309
           2       0.97      0.80      0.88       114
           3       1.00      1.00      1.00      2363

    accuracy                           0.99      4524
   macro avg       0.99      0.95      0.97      4524
weighted avg       0.99      0.99      0.99      4524

