In [1]:
# Cell 1: Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Cell 2: Load & inspect data
file_path = "steps_tracker_dataset_cleaned.csv"  
df = pd.read_csv(file_path)

print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   steps                493 non-null    float64
 1   distance_km          493 non-null    float64
 2   calories_burned      493 non-null    float64
 3   active_minutes       493 non-null    float64
 4   sleep_hours          493 non-null    float64
 5   water_intake_liters  493 non-null    float64
 6   mood                 493 non-null    object 
 7   date                 493 non-null    object 
dtypes: float64(6), object(2)
memory usage: 30.9+ KB
None
          steps  distance_km  calories_burned  active_minutes  sleep_hours  \
0  12052.000000         8.32           448.90           139.0     7.108909   
1   8431.243848         0.24             9.69             3.0     8.100000   
2   7331.000000         4.30           277.60            98.0     6.700000   
3   3357.000000         4.

In [3]:
# Cell 3: Preprocessing
# 1) Drop 'date'
# 2) Encode 'mood' to numeric labels
df = df.drop(columns=['date'])
le = LabelEncoder()
df['mood_encoded'] = le.fit_transform(df['mood'])

# Features and target
X = df.drop(columns=['mood', 'mood_encoded'])
y = df['mood_encoded']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [4]:
# Cell 4: Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": GaussianNB(),
    "CART (Gini)": DecisionTreeClassifier(criterion='gini'),
    "C4.5 (Entropy)": DecisionTreeClassifier(criterion='entropy'),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}


In [5]:
# Cell 5: Train & Evaluate
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"{name} Accuracy: {acc*100:.4f}")

# Optional: detailed report for the best model
best_model_name = max(results, key=results.get)
print("\nBest model:", best_model_name)

best = models[best_model_name]
y_pred_best = best.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_best, target_names=le.classes_))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_best))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 13.1313
Naive Bayes Accuracy: 10.1010
CART (Gini) Accuracy: 13.1313
C4.5 (Entropy) Accuracy: 12.1212
Random Forest Accuracy: 10.1010
KNN Accuracy: 9.0909

Best model: Logistic Regression

Classification Report:
              precision    recall  f1-score   support

        calm       0.14      0.17      0.15        12
   energetic       0.13      0.31      0.18        13
       happy       0.25      0.09      0.13        11
   motivated       0.00      0.00      0.00        10
     relaxed       0.13      0.27      0.18        15
         sad       0.13      0.15      0.14        13
    stressed       0.00      0.00      0.00        11
       tired       0.00      0.00      0.00        14

    accuracy                           0.13        99
   macro avg       0.10      0.12      0.10        99
weighted avg       0.10      0.13      0.10        99


Confusion Matrix:
[[2 2 1 0 5 2 0 0]
 [2 4 0 0 3 2 0 2]
 [1 4 1 0 3 1 0 1]
 [2 2 0 0 5 1 0 0]
 [3 3 1 0 4 4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
