# Clean data

In [4]:
import os
import zipfile
import pandas as pd
import numpy as np

In [5]:
with zipfile.ZipFile('homework.zip') as zip_ref:
  zip_ref.extractall()

In [6]:
data = []
base_dir = 'data'
for dir in os.listdir(base_dir):
  for file in os.listdir(os.path.join(base_dir, dir)):
    df = pd.read_csv(os.path.join(base_dir, dir, file))
    row = {}
    for column in df.columns:
      axis = column[-1]
      row.update({axis + '_mean' : df[column].mean()})
      row.update({axis + '_variance' : df[column].var()})
      row.update({axis + '_std' : df[column].std()})
      row.update({axis + '_median' : df[column].median()})
      row.update({axis + '_max' : df[column].max()})
      row.update({axis + '_min' : df[column].min()})
      row.update({axis + '_rms' : np.sqrt((df[column] ** 2).mean())})
    status = {'idle': 0, 'running': 1, 'stairs': 2, 'walking': 3}
    row.update({'status': status[dir]})
    data.append(row)


data = pd.DataFrame(data)

data

Unnamed: 0,X_mean,X_variance,X_std,X_median,X_max,X_min,X_rms,Y_mean,Y_variance,Y_std,...,Y_min,Y_rms,Z_mean,Z_variance,Z_std,Z_median,Z_max,Z_min,Z_rms,status
0,0.271502,0.002045,0.045220,0.270545,0.411803,0.201113,0.275119,-0.019313,0.002613,0.051118,...,-0.138864,0.053842,9.768822,0.000446,0.021130,9.773131,9.806650,9.715671,9.768844,0
1,0.244528,0.000995,0.031536,0.244209,0.301669,0.134075,0.246486,-0.030167,0.001004,0.031692,...,-0.124498,0.043370,9.765630,0.000392,0.019790,9.768343,9.797073,9.720459,9.765649,0
2,0.238303,0.000752,0.027426,0.244209,0.282516,0.134075,0.239824,-0.021388,0.000632,0.025142,...,-0.095768,0.032688,9.763554,0.000313,0.017694,9.768343,9.797073,9.720459,9.763570,0
3,-0.223618,0.273259,0.522742,-0.110133,0.076614,-2.901772,0.560496,-0.039265,0.138475,0.372122,...,-1.776498,0.367968,9.603143,0.642487,0.801553,9.758766,9.945514,5.396531,9.635426,0
4,0.259691,0.000166,0.012866,0.258574,0.287304,0.234632,0.259999,-0.018994,0.000281,0.016753,...,-0.052672,0.025141,9.768503,0.000208,0.014419,9.768343,9.797073,9.739613,9.768513,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6457,-2.610797,38.447929,6.200639,-1.280898,8.116344,-14.532804,6.631937,10.052295,107.650887,10.375495,...,-9.983821,14.321703,-2.056938,56.300529,7.503368,-1.848324,14.537593,-15.840038,7.658645,1
6458,6.554686,66.781292,8.171982,4.661511,23.060951,-6.124367,10.369144,6.727866,105.642718,10.278264,...,-10.630256,12.140242,3.784754,69.217228,8.319689,1.264139,33.298557,-5.377377,9.013010,1
6459,7.829040,127.984789,11.313036,6.162676,39.188293,-7.153875,13.601930,10.716447,186.969273,13.673671,...,-13.905523,17.192417,1.055524,23.217001,4.818402,1.065420,9.691729,-9.366117,4.853579,1
6460,9.310093,150.521828,12.268734,6.744466,39.188293,-7.105990,15.237528,9.330045,230.871581,15.194459,...,-20.053835,17.613222,-0.379880,32.220186,5.676283,0.998382,10.055647,-14.958972,5.593790,1


# Create models

## Split data to train and test

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

In [8]:
X = data.loc[:, 'X_mean':'Z_rms']
y = data['status']
# split our data to train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## SVM model

In [9]:
# make grid search
grid_params = {'C': [0.1, 1, 10], 'kernel': ['linear', 'poly', 'rbf']}
grid_search = GridSearchCV(SVC(), grid_params, cv=10)
grid_search.fit(X_train, y_train)

clf = grid_search.best_estimator_

clf

In [10]:
y_pred_svm = clf.predict(X_test)

## Random Forest model

In [11]:
# make grid search
grid_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# grid search took 17 minutes so I comment it and just create clf with shown best params

#grid_search = GridSearchCV(RandomForestClassifier(random_state=42), grid_params, cv=10, n_jobs=-1)
#grid_search.fit(X_train, y_train)
#clf = grid_search.best_estimator_
clf = RandomForestClassifier(max_depth=20, n_estimators=200, random_state=42)
clf.fit(X, y)

In [12]:
y_pred_random_forest = clf.predict(X_test)

# Metrics

In [13]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve

## SVM

In [14]:
accuracy = accuracy_score(y_test, y_pred_svm)
confusion_mat = confusion_matrix(y_test, y_pred_svm)
class_report = classification_report(y_test, y_pred_svm)

print("\nAccuracy:", accuracy, end='\n' + '-' * 60)
print("\nConfusion Matrix:\n", confusion_mat, end='\n' + '-' * 60)
print("\nClassification Report:\n", class_report, end='\n' + '-' * 60)


Accuracy: 0.9902011346054668
------------------------------------------------------------
Confusion Matrix:
 [[ 317    0    0    0]
 [   0 1013    0    0]
 [   0    0   44   17]
 [   0    0    2  546]]
------------------------------------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       317
           1       1.00      1.00      1.00      1013
           2       0.96      0.72      0.82        61
           3       0.97      1.00      0.98       548

    accuracy                           0.99      1939
   macro avg       0.98      0.93      0.95      1939
weighted avg       0.99      0.99      0.99      1939

------------------------------------------------------------

## Random forest

In [15]:
accuracy = accuracy_score(y_test, y_pred_random_forest)
confusion_mat = confusion_matrix(y_test, y_pred_random_forest)
class_report = classification_report(y_test, y_pred_random_forest)

print("\nAccuracy:", accuracy, end='\n' + '-' * 60)
print("\nConfusion Matrix:\n", confusion_mat, end='\n' + '-' * 60)
print("\nClassification Report:\n", class_report, end='\n' + '-' * 60)


Accuracy: 1.0
------------------------------------------------------------
Confusion Matrix:
 [[ 317    0    0    0]
 [   0 1013    0    0]
 [   0    0   61    0]
 [   0    0    0  548]]
------------------------------------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       317
           1       1.00      1.00      1.00      1013
           2       1.00      1.00      1.00        61
           3       1.00      1.00      1.00       548

    accuracy                           1.00      1939
   macro avg       1.00      1.00      1.00      1939
weighted avg       1.00      1.00      1.00      1939

------------------------------------------------------------

# Conclusion

1.   **Accurancy:**

SVM - 99.02%, Random forest - 100%. Both models predict with more than 99%. But Random forest makes no mistake.
1.   **Precision:**

```
precision = true_positive / (true_positive + false_positive)
```

SVM - idle:100%, running:100%, stairs: 96%, walking:97%;
Random forest - idle:100%, running:100%, stairs: 100%, walking:100%;
Worst prediction of SVM model with stairs status. That because in our data we have lowest count of this case. Random forest works execelent.
Random Forest - accurancy is perfect.  There are no false positives for each class.
1.   **Recall:**

```
recall = true_positive / (true_positive + false_negative)
```

SVM - idle:100%, running:100%, stairs: 72%, walking:100%;
Random forest - idle:100%, running:100%, stairs: 100%, walking:100%;
In SVM recall lower than precision. That means that this model is cautious in making positive predictions. And when it predicts something as positive, it's likely to be correct. but it might not capture all positive instances in the dataset.
In Random forest recall is perfect. There are no false negatives for each class.