In [3]:
import pandas as pd
import numpy as np
import pywt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# -----------------------------
# Custom Wavelet Transformer
# -----------------------------
class WaveletTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, wavelet='db4', level=3):
        self.wavelet = wavelet
        self.level = level

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        transformed = X.apply(lambda row: self._apply_dwt(row), axis=1, result_type='expand')
        return transformed

    def _apply_dwt(self, row):
        coeffs = pywt.wavedec(row, wavelet=self.wavelet, level=self.level)
        return np.concatenate(coeffs)

# -----------------------------
# Load Dataset
# -----------------------------
df = pd.read_csv("../data/preprocessed.csv")
X = df.drop(df.columns[0], axis=1)
y = df.iloc[:, 0]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# -----------------------------
# Models & Parameters
# -----------------------------
models = {
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "ANN": MLPClassifier(max_iter=1000, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
}

param_grids = {
    "KNN": {
        "clf__n_neighbors": [3, 5, 7, 9],
        "clf__weights": ["uniform", "distance"],
        "clf__metric": ["euclidean", "manhattan"]
    },
    "Decision Tree": {
        "clf__max_depth": [3, 5, 10, 20, None],
        "clf__min_samples_split": [2, 5, 10, 20],
        "clf__min_samples_leaf": [1, 2, 4, 8]
    },
    "Random Forest": {
        "clf__n_estimators": [50, 100],
        "clf__max_depth": [10, 20, None],
        "clf__min_samples_split": [2, 5, 10],
        "clf__min_samples_leaf": [1, 2, 4]
    },
    "ANN": {
        "clf__hidden_layer_sizes": [(50,), (100,), (50, 50)],
        "clf__activation": ["relu", "tanh"],
        "clf__solver": ["adam", "sgd"],
        "clf__alpha": [0.0001, 0.001],
        "clf__learning_rate": ["constant", "adaptive"]
    },
    "XGBoost": {
        "clf__max_depth": [4, 6, 8],
        "clf__learning_rate": [0.01, 0.1, 0.2],
        "clf__n_estimators": [100, 200]
    }
}

# Wavelet parameters common to all
wavelet_grid = {
    'wavelet__wavelet': ['db1', 'db2', 'db3', 'db4', 'sym2'],
    'wavelet__level': [1, 2, 3]
}

# -----------------------------
# Train and Evaluate Each Model
# -----------------------------
results = {}

print("🔍 Starting Grid Search for Each Model...\n")

for name, model in models.items():
    print(f"▶️ {name}")
    
    # Combine wavelet and model-specific parameters
    combined_grid = {**wavelet_grid, **param_grids.get(name, {})}

    pipeline = Pipeline([
        ('wavelet', WaveletTransformer()),
        ('clf', model)
    ])

    grid = GridSearchCV(
        pipeline, combined_grid, cv=3,
        scoring='accuracy', n_jobs=-1, verbose=1
    )
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    results[name] = {
        "accuracy": acc,
        "best_params": grid.best_params_
    }

# -----------------------------
# Display Results
# -----------------------------
print("\n📊 Final Model Results with Best Hyperparameters:\n")
for model_name, res in results.items():
    print(f"{model_name}")
    print(f"  Accuracy       = {res['accuracy']:.4f}")
    print(f"  Best Params    =")
    for param, value in res['best_params'].items():
        print(f"    - {param}: {value}")
    print()

🔍 Starting Grid Search for Each Model...

▶️ KNN
Fitting 3 folds for each of 240 candidates, totalling 720 fits
▶️ Decision Tree
Fitting 3 folds for each of 1200 candidates, totalling 3600 fits
▶️ Random Forest
Fitting 3 folds for each of 810 candidates, totalling 2430 fits
▶️ ANN
Fitting 3 folds for each of 720 candidates, totalling 2160 fits
▶️ XGBoost
Fitting 3 folds for each of 270 candidates, totalling 810 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



📊 Final Model Results with Best Hyperparameters:

KNN
  Accuracy       = 0.3610
  Best Params    =
    - clf__metric: manhattan
    - clf__n_neighbors: 9
    - clf__weights: distance
    - wavelet__level: 1
    - wavelet__wavelet: db4

Decision Tree
  Accuracy       = 0.4370
  Best Params    =
    - clf__max_depth: 10
    - clf__min_samples_leaf: 2
    - clf__min_samples_split: 20
    - wavelet__level: 1
    - wavelet__wavelet: db1

Random Forest
  Accuracy       = 0.5490
  Best Params    =
    - clf__max_depth: None
    - clf__min_samples_leaf: 1
    - clf__min_samples_split: 5
    - clf__n_estimators: 100
    - wavelet__level: 2
    - wavelet__wavelet: db4

ANN
  Accuracy       = 0.4050
  Best Params    =
    - clf__activation: relu
    - clf__alpha: 0.001
    - clf__hidden_layer_sizes: (100,)
    - clf__learning_rate: constant
    - clf__solver: adam
    - wavelet__level: 1
    - wavelet__wavelet: db4

XGBoost
  Accuracy       = 0.5640
  Best Params    =
    - clf__learning_rate: 0