## Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
#KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

#Linear Regression
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
#Support Vector Machine
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
#Neural Network
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD, Adam
#Logistic Regression
from sklearn.linear_model import LogisticRegression

## Load dataset

In [None]:
dff = pd.read_csv('/content/mhealth_raw_data.csv')
display(dff)

Unnamed: 0,alx,aly,alz,glx,gly,glz,arx,ary,arz,grx,gry,grz,Activity,subject
0,2.1849,-9.6967,0.63077,0.103900,-0.84053,-0.68762,-8.6499,-4.5781,0.187760,-0.449020,-1.01030,0.034483,0,subject1
1,2.3876,-9.5080,0.68389,0.085343,-0.83865,-0.68369,-8.6275,-4.3198,0.023595,-0.449020,-1.01030,0.034483,0,subject1
2,2.4086,-9.5674,0.68113,0.085343,-0.83865,-0.68369,-8.5055,-4.2772,0.275720,-0.449020,-1.01030,0.034483,0,subject1
3,2.1814,-9.4301,0.55031,0.085343,-0.83865,-0.68369,-8.6279,-4.3163,0.367520,-0.456860,-1.00820,0.025862,0,subject1
4,2.4173,-9.3889,0.71098,0.085343,-0.83865,-0.68369,-8.7008,-4.1459,0.407290,-0.456860,-1.00820,0.025862,0,subject1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1215740,1.7849,-9.8287,0.29725,-0.341370,-0.90056,-0.61493,-3.7198,-8.9071,0.294230,0.041176,-0.99384,-0.480600,0,subject10
1215741,1.8687,-9.8766,0.46236,-0.341370,-0.90056,-0.61493,-3.7160,-8.7455,0.448140,0.041176,-0.99384,-0.480600,0,subject10
1215742,1.6928,-9.9290,0.16631,-0.341370,-0.90056,-0.61493,-3.8824,-9.1155,0.450480,0.041176,-0.99384,-0.480600,0,subject10
1215743,1.5279,-9.6306,0.30458,-0.341370,-0.90056,-0.61493,-3.5564,-9.1441,0.594880,0.041176,-0.99384,-0.480600,0,subject10


## preprocessing

In [None]:
dff.duplicated().sum()

0

In [None]:
dff.isnull().sum()

alx         0
aly         0
alz         0
glx         0
gly         0
glz         0
arx         0
ary         0
arz         0
grx         0
gry         0
grz         0
Activity    0
subject     0
dtype: int64

In [None]:
# Fill null values in numeric columns with mode
numeric_cols = dff.select_dtypes(include=[np.number]).columns
dff[numeric_cols] = dff[numeric_cols].apply(lambda col: col.fillna(col.mode()[0]))

# Check for any remaining null values
dff.isnull().sum()

alx         0
aly         0
alz         0
glx         0
gly         0
glz         0
arx         0
ary         0
arz         0
grx         0
gry         0
grz         0
Activity    0
subject     0
dtype: int64

In [None]:
dff.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1215745 entries, 0 to 1215744
Data columns (total 14 columns):
 #   Column    Non-Null Count    Dtype  
---  ------    --------------    -----  
 0   alx       1215745 non-null  float64
 1   aly       1215745 non-null  float64
 2   alz       1215745 non-null  float64
 3   glx       1215745 non-null  float64
 4   gly       1215745 non-null  float64
 5   glz       1215745 non-null  float64
 6   arx       1215745 non-null  float64
 7   ary       1215745 non-null  float64
 8   arz       1215745 non-null  float64
 9   grx       1215745 non-null  float64
 10  gry       1215745 non-null  float64
 11  grz       1215745 non-null  float64
 12  Activity  1215745 non-null  int64  
 13  subject   1215745 non-null  object 
dtypes: float64(12), int64(1), object(1)
memory usage: 129.9+ MB


In [None]:
dff.drop(columns=['subject'], inplace=True)

In [None]:
# dff.describe()

## Data Splitting

In [None]:
# dff['Activity'].value_counts()

In [None]:
# # Take a random sample of size 10000
# sample_size = 10000
# df = dff.sample(n=sample_size, random_state=42)  # Use a specific random state for reproducibility

# X = df[['alx', 'aly', 'alz', 'glx', 'gly', 'glz', 'arx', 'ary', 'arz', 'grx', 'gry', 'grz']]
# y = df['Activity']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
class_counts = dff['Activity'].value_counts()

#  minimum count among all classes
min_count = class_counts.min()

# Sample each class with a count equal to min_count
sampled_dfs = []
for activity, count in class_counts.items():
    sampled_df = dff[dff['Activity'] == activity].sample(min_count, random_state=42)
    sampled_dfs.append(sampled_df)

# Concatenate the sampled DataFrames to get a balanced sample
balanced_sample = pd.concat(sampled_dfs)


X = balanced_sample[['alx', 'aly', 'alz', 'glx', 'gly', 'glz', 'arx', 'ary', 'arz', 'grx', 'gry', 'grz']]
y = balanced_sample['Activity']



In [None]:
# Scaling:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Functions**

In [None]:
# empty dictionary
metrics_dict = {
    'accuracy': {},
    'precision': {},
    'recall': {},
    'f1_score': {},
    'confusion_matrix':{},
    'MSE' : {}
}

## KNN

In [None]:
# Create and train the KNN model
knn = KNeighborsClassifier(n_neighbors=7)  # Set the number of neighbors to 7
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
confusion = confusion_matrix(y_test, y_pred)

# Store the accuracy
metrics_dict['accuracy']['KNN'] = accuracy
metrics_dict['precision']['KNN'] = precision
metrics_dict['recall']['KNN'] = recall
metrics_dict['f1_score']['KNN'] = f1
metrics_dict['confusion_matrix']['KNN'] = confusion


print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print(f"Confusion Matrix:\n{confusion}\n")

# Interpretation of model predictions for KNN
print("\nExample predictions for KNN:")
for i in range(min(5, len(y_test))):  # Print 5 samples or less
    pred_value = y_pred[i]
    actual_value = y_test.iloc[i]
    print(f"Sample {i+1}: Predicted={pred_value}, Actual={actual_value}")


Accuracy: 0.9403495723317218
Precision: 0.9411102272719324
Recall: 0.9403495723317218
F1-score: 0.9360633304655549
Confusion Matrix:
[[1139   57   44   23  173   97   91  121  109  101   42   24   57]
 [   0 2035    0    0    0    0    0    0    0    0    0    0    0]
 [   0    0 2090    0    0    0    0    0    0    0    0    0    0]
 [   0    0    0 2105    0    0    0    0    0    0    0    0    0]
 [   4    0    0    0 2079    0    2    0    2    0    0    0    1]
 [  23    1    1    0   61 1911   11    2   32    1    2    0    3]
 [   2    3    0    0    0    0 1965   12    4    0    0    0    0]
 [   3    0    0    0    0    0    7 2056    5    1    0    0    0]
 [   3    0    0    0    2    0    8    9 2070    0    0    0    0]
 [   2    0    0    0    0    0    0    0    3 2022    0    0    1]
 [   3    0    0    0    6    1    0    3    1    0 1994   74   15]
 [   2    0    0    0   14    0    0    0    0    1   95 1988   20]
 [  30    0    0    0   24    7    3    4    6    1

## Linear Regression

In [None]:
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)

# predictions and Mean Squared Error
linear_regression_predictions = linear_regression.predict(X_test)
mse = mean_squared_error(y_test, linear_regression_predictions)

# Store the mse
metrics_dict['accuracy']['Linear Regression'] = None
metrics_dict['MSE']['Linear Regression'] = mse


print(f"Mean Squared Error: {mse}")

# Interpretation of model predictions for Lineaar Regression
print("\nExample predictions for Lineaar Regression:")
for i in range(min(5, len(y_test))):  # Print 5 samples or less
    pred_value = y_pred[i]
    actual_value = y_test.iloc[i]
    print(f"Sample {i+1}: Predicted={pred_value}, Actual={actual_value}")


Mean Squared Error: 10.68860940592738

Example predictions for Lineaar Regression:
Sample 1: Predicted=6, Actual=6
Sample 2: Predicted=1, Actual=1
Sample 3: Predicted=8, Actual=8
Sample 4: Predicted=1, Actual=1
Sample 5: Predicted=5, Actual=5


## SVM

In [None]:
# # Define the parameter grid for grid search
# param_grid = {
#         'C':[0.1,1,10,100],
#         'kernel': ['linear', 'rbf', 'poly']
# }

# #SVM
# svm = SVC()

# # Perform grid search with cross-validation
# grid_search = GridSearchCV(svm, param_grid, cv=5)
# grid_search.fit(X_train, y_train)


# print("Best Hyperparameters: ", grid_search.best_params_)
# print("Best Accuracy: ", grid_search.best_score_)

# # Evaluation
# best_model = grid_search.best_estimator_
# test_accuracy = best_model.score(X_test, y_test)
# print("Test Accuracy: ", test_accuracy)



In [None]:
# Create and train the SVM model
svm = SVC(kernel='rbf', C=100)  # Use the RBF kernel
svm.fit(X_train, y_train)

# Make predictions
y_pred = svm.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
confusion = confusion_matrix(y_test, y_pred)

# Store the accuracy
metrics_dict['accuracy']['SVM'] = accuracy
metrics_dict['precision']['SVM'] = precision
metrics_dict['recall']['SVM'] = recall
metrics_dict['f1_score']['SVM'] = f1
metrics_dict['confusion_matrix']['SVM'] = confusion


print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print(f"Confusion Matrix:\n{confusion}\n")

# Interpretation of model predictions for SVM
print("\nExample predictions for SVM:")
for i in range(min(5, len(y_test))):   # Print 5 samples or less
    pred_value = y_pred[i]
    actual_value = y_test.iloc[i]
    print(f"Sample {i+1}: Predicted={pred_value}, Actual={actual_value}")

Accuracy: 0.9534399404983265
Precision: 0.9515233359234337
Recall: 0.9534399404983265
F1-score: 0.9514069801651617
Confusion Matrix:
[[1367   62   42   26  109   79   73   61   93   65   24   19   58]
 [   1 2034    0    0    0    0    0    0    0    0    0    0    0]
 [   0    0 2090    0    0    0    0    0    0    0    0    0    0]
 [   0    0    0 2105    0    0    0    0    0    0    0    0    0]
 [  25    0    0    0 2056    3    1    0    3    0    0    0    0]
 [  86    0    0    0   25 1928    3    0    4    0    0    0    2]
 [   8    4    0    0    0    0 1963   11    0    0    0    0    0]
 [   3    0    0    0    0    0   17 2046    5    1    0    0    0]
 [  12    0    0    0    2    0    5    8 2063    1    0    0    1]
 [   7    0    0    0    0    0    0    0    0 2021    0    0    0]
 [  29    0    0    0    0    0    0    0    0    0 1997   58   13]
 [  26    0    0    0    1    0    0    0    0    0   53 2020   20]
 [  51    0    0    0    3    2    0    0    2    0

## Neural Network

In [None]:
# Define the neural network model
NN_model2 = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(len(np.unique(y)), activation='softmax'),
])

# Compile the model
NN_model2.compile(optimizer='sgd',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

# Train the model
hist = NN_model2.fit(X_train, y_train, batch_size=32, epochs=200)

# Make predictions using the trained model
y_pred_nn = NN_model2.predict(X_test)
y_pred_nn_classes = np.argmax(y_pred_nn, axis=1)

# Evaluate the model
accuracy_nn =  hist.history['accuracy'][-1]
precision_nn = precision_score(y_test, y_pred_nn_classes, average='weighted')
recall_nn = recall_score(y_test, y_pred_nn_classes, average='weighted')
f1_nn = f1_score(y_test, y_pred_nn_classes, average='weighted')
confusion_nn = confusion_matrix(y_test, y_pred_nn_classes)

# Store the metrics in the same dictionary
metrics_dict['accuracy']['NN'] = accuracy_nn
metrics_dict['precision']['NN'] = precision_nn
metrics_dict['recall']['NN'] = recall_nn
metrics_dict['f1_score']['NN'] = f1_nn
metrics_dict['confusion_matrix']['NN'] = confusion_nn

print(f"Accuracy (NN): {accuracy_nn}")
print(f"Precision (NN): {precision_nn}")
print(f"Recall (NN): {recall_nn}")
print(f"F1-score (NN): {f1_nn}")
print(f"Confusion Matrix (NN):\n{confusion_nn}\n")

# Interpretation of model predictions for NN
print("\nExample predictions for NN:")
for i in range(min(5, len(y_test))):   # Print 5 samples or less
    pred_value = y_pred_nn_classes[i]
    actual_value = y_test.iloc[i]
    print(f"Sample {i+1}: Predicted={pred_value}, Actual={actual_value}")


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

## Logistic Regression

In [None]:
# Create and train the Logistic Regression model
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)

# Make predictions
y_pred = logistic_regression.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
confusion = confusion_matrix(y_test, y_pred)

# Store the accuracy
metrics_dict['accuracy']['Logistic Regression'] = accuracy
metrics_dict['precision']['Logistic Regression'] = precision
metrics_dict['recall']['Logistic Regression'] = recall
metrics_dict['f1_score']['Logistic Regression'] = f1
metrics_dict['confusion_matrix']['Logistic Regression'] = confusion


# Print evaluation metrics and confusion matrix
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print(f"Confusion Matrix:\n{confusion}\n")

# Interpretation of model predictions for Logistic Regression
print("\nExample predictions for Logistic Regression:")
for i in range(min(5, len(y_test))):   # Print 5 samples or less
    pred_value = y_pred[i]
    actual_value = y_test.iloc[i]
    print(f"Sample {i+1}: Predicted={pred_value}, Actual={actual_value}")


Accuracy: 0.5527333581256972
Precision: 0.5255857355729238
Recall: 0.5527333581256972
F1-score: 0.5331319399180312
Confusion Matrix:
[[  68  230  213   40  212   67  250  190  241  185  135  100  147]
 [   0 1368    0    0  227    0  430    0   10    0    0    0    0]
 [  29    0 1240    0    0  215    0  193    0  202    0   21  190]
 [   0    0    0 2105    0    0    0    0    0    0    0    0    0]
 [  35  258    4    0 1064  185  135    1  265    0    5   38   98]
 [ 170  124   85   13  375  599  166   47  298   39   17   24   91]
 [ 100  348    3    0   44    2 1263    1  225    0    0    0    0]
 [  57  208  198   31   71    8  147 1182    2  117    0    8   43]
 [  43  112    1    0  159  165  366    5 1134    4   42   15   46]
 [   4    0    2    0    6    1    1   49  104 1858    0    1    2]
 [  34    0   99    5   15   30    9   21   11    2 1145  360  366]
 [  44    5  196    7   65  108   36   46   50    9  179 1257  118]
 [  59   43  167   11  108   10   75  151   54   26

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Results Analysis

Best Model

In [None]:
# # Print evaluation metrics
# for metric, values in metrics_dict.items():
#     print(f"\n{metric.capitalize()}:\n")
#     for clf_name, value in values.items():
#         print(f"{clf_name}: {value}")

# # Identify the best-performing model based on accuracy
# best_accuracy_model = max(metrics_dict['accuracy'], key=metrics_dict['accuracy'].get)
# print(f"\nBest Model based on Accuracy: {best_accuracy_model} with Accuracy {metrics_dict['accuracy'][best_accuracy_model]:.4f}")

# # Identify the best-performing model based on F1-score
# best_f1_model = max(metrics_dict['f1_score'], key=metrics_dict['f1_score'].get)
# print(f"\nBest Model based on F1-score: {best_f1_model} with F1-score {metrics_dict['f1_score'][best_f1_model]:.4f}")


In [None]:
def analyze_results(metrics_dict):
    # Initialize variables to track best-performing model
    best_model = None
    best_accuracy = 0.0

    # Iterate through each model in the metrics dictionary
    for model_name, model_metrics in metrics_dict['accuracy'].items():
        # Check if 'accuracy' key is present in model_metrics
        if model_metrics is None:
            print(f"Error: 'accuracy' key not found for model: {model_name}")
            continue

        accuracy = model_metrics
        precision = metrics_dict['precision'].get(model_name)
        recall = metrics_dict['recall'].get(model_name)
        f1 = metrics_dict['f1_score'].get(model_name)
        confusion_matrix = metrics_dict['confusion_matrix'].get(model_name)

        # Interpretation of model predictions
        print(f"Model: {model_name}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}" if precision is not None else "Precision: N/A")
        print(f"Recall: {recall:.4f}" if recall is not None else "Recall: N/A")
        print(f"F1-score: {f1:.4f}" if f1 is not None else "F1-score: N/A")
        print(f"Confusion Matrix:\n{confusion_matrix}\n" if confusion_matrix is not None else "Confusion Matrix: N/A\n")

        # Update best-performing model if necessary
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model_name

    return best_model, best_accuracy

# Assuming you have metrics_dict defined somewhere

# Call analyze_results to get the best model and accuracy
best_accuracy_model, best_accuracy = analyze_results(metrics_dict)

Model: KNN
Accuracy: 0.9403
Precision: 0.9411
Recall: 0.9403
F1-score: 0.9361
Confusion Matrix:
[[1139   57   44   23  173   97   91  121  109  101   42   24   57]
 [   0 2035    0    0    0    0    0    0    0    0    0    0    0]
 [   0    0 2090    0    0    0    0    0    0    0    0    0    0]
 [   0    0    0 2105    0    0    0    0    0    0    0    0    0]
 [   4    0    0    0 2079    0    2    0    2    0    0    0    1]
 [  23    1    1    0   61 1911   11    2   32    1    2    0    3]
 [   2    3    0    0    0    0 1965   12    4    0    0    0    0]
 [   3    0    0    0    0    0    7 2056    5    1    0    0    0]
 [   3    0    0    0    2    0    8    9 2070    0    0    0    0]
 [   2    0    0    0    0    0    0    0    3 2022    0    0    1]
 [   3    0    0    0    6    1    0    3    1    0 1994   74   15]
 [   2    0    0    0   14    0    0    0    0    1   95 1988   20]
 [  30    0    0    0   24    7    3    4    6    1   91   53 1832]]

Error: 'accuracy' 

In [None]:
# Identification of the best-performing model
if best_accuracy_model is not None:
    print(f"Best-performing model: {best_accuracy_model} (Accuracy: {best_accuracy:.4f})")
else:
    print("No models with accuracy information found.")

Best-performing model: NN (Accuracy: 0.9654)


Strengths and weaknesses

In [None]:
strengths_weaknesses = {
    'kNN': 'Strengths: Simple and easy to implement, but sensitive to outliers and requires careful selection of k.',
    'Linear Regression': 'Strengths: Interpretable model suitable for linear relationships, but sensitive to outliers and assumes linearity.',
    'Neural Networks': 'Strengths: Powerful and flexible for complex patterns, but requires large datasets, prone to overfitting, and lacks interpretability.',
    'SVM': 'Strengths: Effective in high-dimensional spaces with different kernels, but memory-intensive and requires careful hyperparameter tuning.'
}

for model, description in strengths_weaknesses.items():
        print(f"{model}: {description}")

kNN: Strengths: Simple and easy to implement, but sensitive to outliers and requires careful selection of k.
Linear Regression: Strengths: Interpretable model suitable for linear relationships, but sensitive to outliers and assumes linearity.
Neural Networks: Strengths: Powerful and flexible for complex patterns, but requires large datasets, prone to overfitting, and lacks interpretability.
SVM: Strengths: Effective in high-dimensional spaces with different kernels, but memory-intensive and requires careful hyperparameter tuning.


Insights into factors contributing to performance variation

In [None]:
print("\nInsights into factors contributing to performance variation:")
print("- k-Nearest Neighbors (kNN):")
print("  - Feature Engineering: Check if features are well-preprocessed.")
print("  - Hyperparameters: Tune k value for optimal performance.")
print("  - Data Quality: Address outliers and missing values.")
print("  - Model Architecture: Simple and effective for many cases.")
print("  - Sample Size: Larger datasets can improve accuracy.")

print("\n- Support Vector Machines (SVM):")
print("  - Feature Engineering: Optimize feature selection and scaling.")
print("  - Hyperparameters: Choose appropriate kernel and regularization parameters.")
print("  - Data Quality: Handle outliers and ensure clean data.")
print("  - Model Architecture: Effective for complex decision boundaries.")
print("  - Sample Size: Requires sufficient data for robust performance.")

print("\n- Neural Networks (NN):")
print("  - Feature Engineering: Data normalization and feature scaling are crucial.")
print("  - Hyperparameters: Tune learning rate, batch size, and network architecture.")
print("  - Data Quality: Clean and preprocess data to reduce noise.")
print("  - Model Architecture: Design deep architectures based on problem complexity.")
print("  - Sample Size: Deep learning often benefits from large datasets.")

print("\n- Logistic Regression:")
print("  - Feature Engineering: Encode categorical variables and handle missing values.")
print("  - Hyperparameters: Adjust regularization strength for model complexity.")
print("  - Data Quality: Ensure data quality and appropriate preprocessing.")
print("  - Model Architecture: Linear model suitable for binary classification tasks.")
print("  - Sample Size: Requires adequate samples per class for reliable predictions.")

print("\n- Linear Regression:")
print("  - Feature Engineering: Include relevant features and handle multicollinearity.")
print("  - Hyperparameters: Regularization can prevent overfitting.")
print("  - Data Quality: Clean data and handle outliers.")
print("  - Model Architecture: Linear relationship assumption between features and target.")
print("  - Sample Size: Requires enough data points for reliable regression results.")



Insights into factors contributing to performance variation:
- k-Nearest Neighbors (kNN):
  - Feature Engineering: Check if features are well-preprocessed.
  - Hyperparameters: Tune k value for optimal performance.
  - Data Quality: Address outliers and missing values.
  - Model Architecture: Simple and effective for many cases.
  - Sample Size: Larger datasets can improve accuracy.

- Support Vector Machines (SVM):
  - Feature Engineering: Optimize feature selection and scaling.
  - Hyperparameters: Choose appropriate kernel and regularization parameters.
  - Data Quality: Handle outliers and ensure clean data.
  - Model Architecture: Effective for complex decision boundaries.
  - Sample Size: Requires sufficient data for robust performance.

- Neural Networks (NN):
  - Feature Engineering: Data normalization and feature scaling are crucial.
  - Hyperparameters: Tune learning rate, batch size, and network architecture.
  - Data Quality: Clean and preprocess data to reduce noise.
  - 

##Conclusion

Recap of key findings and conclusions


In [None]:
print("\nRecap of Key Findings and Conclusions:")

# Strengths and Weaknesses Recap
print("\nStrengths and Weaknesses Recap:")
print("- k-Nearest Neighbors (kNN): Simple and easy to implement but sensitive to outliers.")
print("- Support Vector Machines (SVM): Effective in high-dimensional spaces but memory-intensive.")
print("- Neural Networks (NN): Powerful for complex patterns but requires large datasets.")
print("- Logistic Regression: Interpretable for binary classification but assumes linear relationship.")
print("- Linear Regression: Interpretable for linear relationships but sensitive to outliers.")

# Insights into Factors Recap
print("\nInsights into Factors Recap:")
print("- Feature Engineering: Critical for all models to preprocess data and handle outliers.")
print("- Hyperparameters: Importance of tuning parameters for optimal model performance.")
print("- Data Quality: Clean and quality data is fundamental for accurate predictions.")
print("- Model Architecture: Consideration of model complexity and suitability for the problem.")
print("- Sample Size: Larger datasets often lead to better model performance.")
print("- Domain-Specific Insights: Understanding the problem context aids in model selection.")

# Overall Conclusion
print("\nOverall Conclusion:")
print("Based on the comparative analysis, the choice of model depends on various factors including dataset size, complexity of the problem, interpretability requirements, and computational resources.")



Recap of Key Findings and Conclusions:

Strengths and Weaknesses Recap:
- k-Nearest Neighbors (kNN): Simple and easy to implement but sensitive to outliers.
- Support Vector Machines (SVM): Effective in high-dimensional spaces but memory-intensive.
- Neural Networks (NN): Powerful for complex patterns but requires large datasets.
- Logistic Regression: Interpretable for binary classification but assumes linear relationship.
- Linear Regression: Interpretable for linear relationships but sensitive to outliers.

Insights into Factors Recap:
- Feature Engineering: Critical for all models to preprocess data and handle outliers.
- Hyperparameters: Importance of tuning parameters for optimal model performance.
- Data Quality: Clean and quality data is fundamental for accurate predictions.
- Model Architecture: Consideration of model complexity and suitability for the problem.
- Sample Size: Larger datasets often lead to better model performance.
- Domain-Specific Insights: Understanding the

Summary of the best model for activity recognition based on the MHEALTH
dataset.

In [None]:
# Filter out None values (Linear Resgression)
filtered_accuracy = {k: v for k, v in metrics_dict['accuracy'].items() if v is not None}

# Find the best-performing model
if filtered_accuracy:
    best_accuracy_model = max(filtered_accuracy, key=filtered_accuracy.get)
    best_accuracy = filtered_accuracy[best_accuracy_model]
    precision = metrics_dict['precision'][best_accuracy_model]
    recall = metrics_dict['recall'][best_accuracy_model]
    f1_score = metrics_dict['f1_score'][best_accuracy_model]

    # Summary of the best model based on metrics
    print("\nSummary of the Best Model for Activity Recognition (MHEALTH Dataset):")
    print(f"- The best-performing model based on accuracy was found to be {best_accuracy_model} with an accuracy of {best_accuracy:.4f}.")
    print(f"- {best_accuracy_model} showed high precision ({precision:.4f}), recall ({recall:.4f}), and F1-score ({f1_score:.4f}), indicating overall good performance.")
    print("- Key factors contributing to the model's success include its ability to learn complex patterns, adapt to non-linear relationships, and handle high-dimensional data.")
    print("- Feature engineering, hyperparameter tuning, and data preprocessing played crucial roles in optimizing the model's performance.")
    print(f"- {best_accuracy_model}'s interpretability may be a concern, but its predictive power and performance metrics outweigh this limitation for activity recognition in the MHEALTH dataset.")
else:
    print("No valid accuracy values found in the metrics dictionary.")



Summary of the Best Model for Activity Recognition (MHEALTH Dataset):
- The best-performing model based on accuracy was found to be NN with an accuracy of 0.9654.
- NN showed high precision (0.9499), recall (0.9513), and F1-score (0.9494), indicating overall good performance.
- Key factors contributing to the model's success include its ability to learn complex patterns, adapt to non-linear relationships, and handle high-dimensional data.
- Feature engineering, hyperparameter tuning, and data preprocessing played crucial roles in optimizing the model's performance.
- NN's interpretability may be a concern, but its predictive power and performance metrics outweigh this limitation for activity recognition in the MHEALTH dataset.
