<a href="https://colab.research.google.com/github/Danil-Kazakov/Data-HW5/blob/main/Data_HW5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix


In [4]:
folder = '/content/drive/MyDrive/content/data'
data_set = pd.DataFrame()

for activity_folder in os.listdir(folder):
    activity_path = os.path.join(folder, activity_folder)
    if not os.path.isdir(activity_path):
        continue

    for file in os.listdir(activity_path):
        file_path = os.path.join(activity_path, file)
        if not file.endswith('.csv'):
            continue

        df = pd.read_csv(file_path)
        df['activity'] = activity_folder
        data_set = pd.concat([data_set, df], ignore_index=True)

# Print the head of the dataset and value counts for activity
print(data_set.head())
print(data_set['activity'].value_counts())

   accelerometer_X  accelerometer_Y  accelerometer_Z activity
0        -6.478709       -22.160730         0.162806   stairs
1         5.272033       -25.546135        -8.652645   stairs
2        -2.264915       -17.348385        -2.834735   stairs
3        -7.426813         1.345541        -2.686294   stairs
4        -0.694318       -15.030798        -5.501875   stairs
activity
running    102240
walking     55500
idle        31170
stairs       4950
Name: count, dtype: int64


In [5]:
def add_stat_feature_frame(frame):
    features = []
    rows = frame.shape[0]
    for col_id in range(frame.shape[1]):
        col = frame.iloc[:, col_id]
        features.append(pd.DataFrame([col.mean()] * rows, columns=[f'{col.name}_mean']))
        features.append(pd.DataFrame([col.max()] * rows, columns=[f'{col.name}_max']))
        features.append(pd.DataFrame([col.min()] * rows, columns=[f'{col.name}_min']))
        features.append(pd.DataFrame([col.quantile(0.75) - col.quantile(0.25)] * rows, columns=[f'{col.name}_interquartile_range']))
        features.append(pd.DataFrame([col.idxmin()] * rows, columns=[f'{col.name}_index_of_minimum_value']))
        features.append(pd.DataFrame([np.mean(np.abs(col - col.mean()))] * rows, columns=[f'{col.name}_mean_of_absolute_deviation']))
        features.append(pd.DataFrame([col.median()] * rows, columns=[f'{col.name}_median']))
        features.append(pd.DataFrame([col.std()] * rows, columns=[f'{col.name}_standard_deviation']))
        features.append(pd.DataFrame([np.sqrt(np.mean(col) ** 2)] * rows, columns=[f'{col.name}_root_mean_square_error']))
    return pd.concat(features, axis=1)

# Adding time domain features
data_set_with_features = pd.concat([data_set.drop('activity', axis=1), add_stat_feature_frame(data_set.drop('activity', axis=1))], axis=1)


In [6]:
# Split data into features and target
X = data_set.drop('activity', axis=1)
y = data_set['activity']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Split the dataset with features into training and test sets
X_with_features = data_set_with_features
y_with_features = data_set['activity']
X_train_wf, X_test_wf, y_train_wf, y_test_wf = train_test_split(X_with_features, y_with_features, test_size=0.3, random_state=42, stratify=y_with_features)


In [7]:
# Define models
models = {
    "SVC": SVC(),
    "RandomForestClassifier": RandomForestClassifier(random_state=42)
}

# Train and evaluate models on data without additional features
for model_name, model in models.items():
    print(f"Training model: {model_name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Classification report for {model_name}:\n")
    print(classification_report(y_test, y_pred, digits=4))
    print(confusion_matrix(y_test, y_pred))

# Train and evaluate models on data with additional features
for model_name, model in models.items():
    print(f"Training model with features: {model_name}")
    model.fit(X_train_wf, y_train_wf)
    y_pred_wf = model.predict(X_test_wf)
    print(f"Classification report for {model_name} with features:\n")
    print(classification_report(y_test_wf, y_pred_wf, digits=4))
    print(confusion_matrix(y_test_wf, y_pred_wf))


Training model: SVC
Classification report for SVC:

              precision    recall  f1-score   support

        idle     0.9568    0.9880    0.9722      9351
     running     0.9317    0.9007    0.9159     30672
      stairs     1.0000    0.0034    0.0067      1485
     walking     0.7980    0.9033    0.8474     16650

    accuracy                         0.8926     58158
   macro avg     0.9216    0.6988    0.6855     58158
weighted avg     0.8992    0.8926    0.8821     58158

[[ 9239    91     0    21]
 [  341 27625     0  2706]
 [   13   386     5  1081]
 [   63  1547     0 15040]]
Training model: RandomForestClassifier
Classification report for RandomForestClassifier:

              precision    recall  f1-score   support

        idle     0.9998    0.9999    0.9998      9351
     running     0.9996    1.0000    0.9998     30672
      stairs     1.0000    0.9892    0.9946      1485
     walking     0.9995    0.9997    0.9996     16650

    accuracy                         0.999

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        idle     0.0000    0.0000    0.0000      9351
     running     0.5274    1.0000    0.6906     30672
      stairs     0.0000    0.0000    0.0000      1485
     walking     0.0000    0.0000    0.0000     16650

    accuracy                         0.5274     58158
   macro avg     0.1318    0.2500    0.1726     58158
weighted avg     0.2781    0.5274    0.3642     58158

[[    0  9351     0     0]
 [    0 30672     0     0]
 [    0  1485     0     0]
 [    0 16650     0     0]]
Training model with features: RandomForestClassifier
Classification report for RandomForestClassifier with features:

              precision    recall  f1-score   support

        idle     0.9998    0.9999    0.9998      9351
     running     0.9996    1.0000    0.9998     30672
      stairs     1.0000    0.9892    0.9946      1485
     walking     0.9995    0.9997    0.9996     16650

    accuracy                         0.9996     58158
   macro avg