In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
dataset_path = Path('/content/drive/MyDrive/data')
csv_files = dataset_path.glob('**/*.csv')

dataframes = []

for file_path in csv_files:
    activity = file_path.parent.name
    df = pd.read_csv(file_path)
    df['activity'] = activity
    dataframes.append(df)
combined_df = pd.concat(dataframes, ignore_index=True)

combined_df

Unnamed: 0,accelerometer_X,accelerometer_Y,accelerometer_Z,activity
0,-4.031835,-6.871359,3.280056,running
1,0.354342,10.658986,-0.684742,running
2,-39.188293,24.593240,11.042058,running
3,-3.543418,1.666364,-5.267244,running
4,5.358224,5.401319,-0.215478,running
...,...,...,...,...
194455,-8.140285,-11.860875,-4.539406,walking
194456,-8.418014,-15.811308,-5.013458,walking
194457,-14.346057,-13.153744,-9.998186,walking
194458,-2.078167,-12.756307,-0.387861,walking


#### Calculation of time domain features for each axis of the accelerometer:

In [4]:
time_features = combined_df.groupby('activity').agg({'accelerometer_X': ['mean', 'std', 'median', 'max', 'min', 'sum'],
                                                     'accelerometer_Y': ['mean', 'std', 'median', 'max', 'min', 'sum'],
                                                     'accelerometer_Z': ['mean', 'std', 'median', 'max', 'min', 'sum']})

# Renaming columns
time_features.columns = ['_'.join(col) for col in time_features.columns]

time_features['activity'] = time_features.index
activity_col = time_features.pop('activity')
time_features.insert(0, 'activity', activity_col)

time_features.head()

Unnamed: 0_level_0,activity,accelerometer_X_mean,accelerometer_X_std,accelerometer_X_median,accelerometer_X_max,accelerometer_X_min,accelerometer_X_sum,accelerometer_Y_mean,accelerometer_Y_std,accelerometer_Y_median,accelerometer_Y_max,accelerometer_Y_min,accelerometer_Y_sum,accelerometer_Z_mean,accelerometer_Z_std,accelerometer_Z_median,accelerometer_Z_max,accelerometer_Z_min,accelerometer_Z_sum
activity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
idle,idle,0.096767,0.691892,0.234632,8.135497,-10.448297,3016.239864,2.225971,2.821235,0.02873,9.916783,-2.595315,69383.508951,8.973247,1.211058,9.749189,21.988348,-0.296881,279696.094989
running,running,5.047616,9.948757,3.610456,39.188293,-39.188293,519096.79767,8.08425,12.43978,7.967903,39.188293,-39.188293,831384.316791,1.386967,7.002776,0.407014,39.188293,-39.188293,142635.669558
stairs,stairs,0.353135,3.845949,0.483629,14.164097,-11.238382,1748.016884,-9.574353,4.922294,-8.944737,4.994305,-25.828648,-47393.045005,-1.841333,5.297376,-1.58975,17.492037,-38.08696,-9114.596029
walking,walking,-2.661943,4.648378,-2.298433,20.57577,-26.81506,-147737.820092,-9.703702,5.931165,-9.974244,14.441825,-39.188293,-538555.441522,-1.126623,6.927679,-0.933739,38.426937,-39.16914,-62527.593189


#### We reduce the data to one format using normalization:

In [5]:
columns_to_normalize = ['accelerometer_X', 'accelerometer_Y', 'accelerometer_Z']

scaler = MinMaxScaler()
combined_df[columns_to_normalize] = scaler.fit_transform(combined_df[columns_to_normalize])

combined_df

Unnamed: 0,accelerometer_X,accelerometer_Y,accelerometer_Z,activity
0,0.448558,0.412329,0.541850,running
1,0.504521,0.635997,0.491263,running
2,0.000000,0.813783,0.640885,running
3,0.454790,0.521261,0.432796,running
4,0.568365,0.568915,0.497251,running
...,...,...,...,...
194455,0.396139,0.348668,0.442082,walking
194456,0.392595,0.298265,0.436034,walking
194457,0.316960,0.332173,0.372434,walking
194458,0.473485,0.337243,0.495051,walking


#### Let's divide the data into training and testing sets:

In [6]:
X = combined_df[['accelerometer_X', 'accelerometer_Y', 'accelerometer_Z']]
y = combined_df['activity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (136122, 3)
Shape of X_test: (58338, 3)
Shape of y_train: (136122,)
Shape of y_test: (58338,)


#### Creating SVM model:

In [8]:
svm_model = SVC()
svm_model.fit(X_train, y_train)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

#### Creating Random forest model:

In [9]:
rf_model = RandomForestClassifier(n_jobs=-1)
rf_model.fit(X_train, y_train)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

#### Comparing the accuracy of models:

In [10]:
score_svm = svm_model.score(X_test, y_test)
score_rf = rf_model.score(X_test, y_test)

print(f"Accuracy of the SVM model:", score_svm)
print(f"Accuracy of the RF model:", score_rf)

Accuracy of the SVM model: 0.8922314786245672
Accuracy of the RF model: 0.9996914532551682


#### We compare the performance results of both algorithms on different features using the 'classification report' method:

In [11]:
# SVM predictions
svm_predictions = svm_model.predict(X_test)

# RF predicitons
rf_predictions = rf_model.predict(X_test)

print("SVM Classification Report:")
print(classification_report(y_test, svm_predictions))

print("Random Forest Classification Report:")
print(classification_report(y_test, rf_predictions))

SVM Classification Report:
              precision    recall  f1-score   support

        idle       0.96      0.99      0.97      9326
     running       0.93      0.90      0.92     30741
      stairs       1.00      0.00      0.01      1497
     walking       0.80      0.90      0.85     16774

    accuracy                           0.89     58338
   macro avg       0.92      0.70      0.69     58338
weighted avg       0.90      0.89      0.88     58338

Random Forest Classification Report:
              precision    recall  f1-score   support

        idle       1.00      1.00      1.00      9326
     running       1.00      1.00      1.00     30741
      stairs       1.00      0.99      1.00      1497
     walking       1.00      1.00      1.00     16774

    accuracy                           1.00     58338
   macro avg       1.00      1.00      1.00     58338
weighted avg       1.00      1.00      1.00     58338



#### The Random Forest model has significantly better indicators than the SVM model, in particular in terms of accuracy, macro avg precision, macro avg weighted precision, and f1 score.
#### So let's summarize: both models achieve a fairly high level of accuracy, but the random forest model turned out to be more reliable, it has better accuracy indicators and greater completeness on all classes.