In [None]:
from google.colab import files
uploaded = files.upload()

Saving ptbxl_database.csv to ptbxl_database.csv
Saving scp_statements.csv to scp_statements.csv


In [None]:
from google.colab import files
uploaded = files.upload()

Saving 01000_lr.dat to 01000_lr.dat
Saving 01000_lr.hea to 01000_lr.hea
Saving 01001_lr.dat to 01001_lr.dat


In [9]:
# a3
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import randint, uniform

# ---------------- Step 1: Load Dataset ----------------
def load_dataset():
    """
    Replace this with your dataset loading logic.
    Using Iris dataset as demo.
    """
    from sklearn.datasets import load_iris
    iris = load_iris()
    X, y = iris.data, iris.target
    return X, y


# ---------------- Step 2: Split Data ----------------
def split_data(X, y, test_size=0.2, random_state=42):
    return train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)


# ---------------- Step 3: Tune Hyperparameters ----------------
def tune_hyperparameters(model, param_distributions, X_train, y_train, cv=3, n_iter=10):
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_distributions,
        n_iter=n_iter,
        cv=cv,
        scoring="accuracy",
        random_state=42,
        n_jobs=-1
    )
    random_search.fit(X_train, y_train)
    return random_search.best_estimator_, random_search.best_params_


# ---------------- Step 4: Evaluate Model ----------------
def evaluate_model(model, X_train, y_train, X_test, y_test):
    metrics = {}
    for split, (X, y) in {"Train": (X_train, y_train), "Test": (X_test, y_test)}.items():
        y_pred = model.predict(X)
        metrics[f"Accuracy_{split}"] = accuracy_score(y, y_pred)
        metrics[f"Precision_{split}"] = precision_score(y, y_pred, average="weighted", zero_division=0)
        metrics[f"Recall_{split}"] = recall_score(y, y_pred, average="weighted", zero_division=0)
        metrics[f"F1_{split}"] = f1_score(y, y_pred, average="weighted", zero_division=0)
    return metrics


# ---------------- Step 5: Define Models & Hyperparams ----------------
def get_models_and_params():
    return {
        "DecisionTree": (DecisionTreeClassifier(), {
            "max_depth": randint(1, 10),
            "min_samples_split": randint(2, 20),
            "criterion": ["gini", "entropy"]
        }),
        "RandomForest": (RandomForestClassifier(), {
            "n_estimators": randint(50, 150),
            "max_depth": randint(2, 10)
        }),
        "SVM": (SVC(), {
            "C": uniform(loc=0.1, scale=10),
            "kernel": ["linear", "rbf", "poly"],
            "gamma": ["scale", "auto"]
        }),
        "KNN": (KNeighborsClassifier(), {
            "n_neighbors": randint(1, 20),
            "weights": ["uniform", "distance"],
            "p": [1, 2]
        }),
        "NaiveBayes": (GaussianNB(), {}),  # no hyperparams
        "MLP": (MLPClassifier(max_iter=500), {
            "hidden_layer_sizes": [(50,), (100,), (100, 50)],
            "activation": ["relu", "tanh"],
            "alpha": uniform(loc=0.0001, scale=0.01)
        }),
        "AdaBoost": (AdaBoostClassifier(), {
            "n_estimators": randint(50, 150),
            "learning_rate": uniform(loc=0.01, scale=1)
        })
    }


# ---------------- Step 6: Run All Models ----------------
def demo():
    X, y = load_dataset()
    X_train, X_test, y_train, y_test = split_data(X, y)

    models_and_params = get_models_and_params()
    results = []

    for name, (model, params) in models_and_params.items():
        print(f"\nTraining {name}...")
        if params:  # Tune if hyperparams exist
            model, best_params = tune_hyperparameters(model, params, X_train, y_train)
            print(f"Best Params for {name}: {best_params}")
        else:  # No hyperparams for NB
            model.fit(X_train, y_train)

        metrics = evaluate_model(model, X_train, y_train, X_test, y_test)
        metrics["Model"] = name
        results.append(metrics)

    results_df = pd.DataFrame(results)
    print("\nFinal Results Table:\n", results_df)
    return results_df


if __name__ == "__main__":
    results_df = demo()



Training DecisionTree...
Best Params for DecisionTree: {'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 16}

Training RandomForest...
Best Params for RandomForest: {'max_depth': 4, 'n_estimators': 121}

Training SVM...
Best Params for SVM: {'C': np.float64(0.6808361216819946), 'gamma': 'auto', 'kernel': 'linear'}

Training KNN...
Best Params for KNN: {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}

Training NaiveBayes...

Training MLP...
Best Params for MLP: {'activation': 'relu', 'alpha': np.float64(0.007896910002727693), 'hidden_layer_sizes': (50,)}

Training AdaBoost...
Best Params for AdaBoost: {'learning_rate': np.float64(0.8761761457749352), 'n_estimators': 149}

Final Results Table:
    Accuracy_Train  Precision_Train  Recall_Train  F1_Train  Accuracy_Test  \
0        0.983333         0.984127      0.983333  0.983323       0.966667   
1        0.991667         0.991870      0.991667  0.991665       0.966667   
2        0.983333         0.984127      0.983333  0.983