<h1>Classification</h1>

<h2 align="center">Support Vector Machines</h2><h3>Parameter study</h3>

In [None]:
from numpy import array, ndarray
from matplotlib.pyplot import figure, savefig, show
from sklearn.svm import SVC
from dslabs_functions import (
    CLASS_EVAL_METRICS,
    DELTA_IMPROVE,
    read_train_test_from_files,
)
from dslabs_functions import plot_evaluation_results, plot_multiline_chart


def svm_study(
    trnX: ndarray,
    trnY: array,
    tstX: ndarray,
    tstY: array,
    nr_max_iterations: int = 2500,
    lag: int = 500,
    metric: str = "accuracy",
) -> tuple[SVC | None, dict]:
    """Compare SVC kernels across iteration caps and plot the test scores.

    One SVC is fitted on (trnX, trnY) for every combination of kernel
    ("linear", "poly" with degrees 2-4, "rbf", "sigmoid") and ``max_iter``
    value. Its predictions for tstX are scored against tstY with
    ``CLASS_EVAL_METRICS[metric]``, and all scores are handed to
    ``plot_multiline_chart`` as one series per kernel.

    Args:
        trnX: Training features.
        trnY: Training target values.
        tstX: Test features.
        tstY: Test target values.
        nr_max_iterations: Upper bound (inclusive) for the ``max_iter``
            values generated from 500 upwards.
        lag: Step between consecutive ``max_iter`` values from 500 upwards.
        metric: Key into ``CLASS_EVAL_METRICS`` selecting the score function.

    Returns:
        ``(best_model, best_params)``. ``best_params["params"]`` holds
        ``(kernel, max_iter, degree)`` of the best configuration. When no
        configuration beats the initial score of 0.0 by more than
        ``DELTA_IMPROVE``, ``best_model`` is ``None`` and
        ``best_params["params"]`` stays ``()``.
    """
    # 100 is always tried first, then 500, 500 + lag, ... up to the maximum.
    nr_iterations: list[int] = [100] + list(range(500, nr_max_iterations + 1, lag))

    kernel_types: list[str] = ["linear", "poly", "rbf", "sigmoid"]
    poly_degrees: list[int] = [2, 3, 4]

    best_model = None
    best_params: dict = {"name": "SVM", "metric": metric, "params": ()}
    best_performance: float = 0.0

    # Loop-invariant: resolve the scoring function once.
    score_fn = CLASS_EVAL_METRICS[metric]

    values: dict = {}
    for kernel in kernel_types:
        # Only the polynomial kernel is explored over several degrees; the
        # other kernels get a single placeholder degree of 0.
        degrees: list[int] = poly_degrees if kernel == "poly" else [0]
        for d in degrees:
            kernel_name: str = f"poly_{d}" if kernel == "poly" else kernel
            y_tst_values: list[float] = []
            for n in nr_iterations:
                clf = SVC(kernel=kernel, max_iter=n, degree=d, verbose=False)
                clf.fit(trnX, trnY)
                prdY: array = clf.predict(tstX)
                score: float = score_fn(tstY, prdY)
                y_tst_values.append(score)
                # Require a gain larger than DELTA_IMPROVE before replacing
                # the current best configuration.
                if score - best_performance > DELTA_IMPROVE:
                    best_performance = score
                    best_params["params"] = (kernel, n, d)
                    best_model = clf
            values[kernel_name] = y_tst_values
    plot_multiline_chart(
        nr_iterations,
        values,
        title=f"SVM models ({metric})",
        xlabel="nr iterations",
        ylabel=metric,
        percentage=True,
    )

    if best_model is None:
        # Nothing scored above the threshold, so "params" is still the empty
        # tuple and must not be indexed.
        print(f"SVM: no configuration improved on {metric}={best_performance}")
        return best_model, best_params

    best_kernel, best_n, best_degree = best_params["params"]
    kernel_name = f"poly_{best_degree}" if best_kernel == "poly" else best_kernel
    print(f"SVM best for {kernel_name} and n={best_n}")

    return best_model, best_params


# Dataset configuration: tag used in output image names, target column,
# metric used to select the best model, and the train/test CSV locations.
file_tag = "stroke"
target = "stroke"
eval_metric = "accuracy"
train_filename = "data/stroke_train_smote.csv"
test_filename = "data/stroke_test.csv"

# Load the train/test split from the two files.
trnX, tstX, trnY, tstY, labels, vars = read_train_test_from_files(
    train_filename, test_filename, target
)
print(f"Train#={len(trnX)} Test#={len(tstX)}")
print(f"Labels={labels}")

# Run the kernel / iteration study on a fresh figure and save the chart.
figure()
best_model, params = svm_study(
    trnX, trnY, tstX, tstY, nr_max_iterations=5000, lag=500, metric=eval_metric
)
savefig(f"images/{file_tag}_svm_{eval_metric}_study.png")
show()

<h3>Best model performance</h3>

In [None]:
# Evaluate the model selected by svm_study on both the training and test sets.
figure()
prd_trn: array = best_model.predict(trnX)
prd_tst: array = best_model.predict(tstX)
plot_evaluation_results(params, trnY, prd_trn, tstY, prd_tst, labels)

# The image name embeds the model name and the metric stored in params.
best_eval_image: str = (
    f'images/{file_tag}_svm_{params["name"]}_best_{params["metric"]}_eval.png'
)
savefig(best_eval_image)
show()

<h3>Overfitting study</h3>

In [None]:
# Overfitting study: refit the best kernel/degree found by svm_study for each
# iteration cap and compare the score on the training set with the score on
# the test set.
kernel: str = params["params"][0]
degree: int = params["params"][2]
kernel_name: str = f"poly (d={degree})" if "poly" == kernel else kernel
nr_iterations: list[int] = [100] + list(range(500, 5001, 500))

# Score with the same metric that labels the y axis and names the output file,
# so the chart shows what it claims to show even when eval_metric is not
# "accuracy".
score_fn = CLASS_EVAL_METRICS[eval_metric]

y_tst_values: list[float] = []
y_trn_values: list[float] = []

for n in nr_iterations:
    clf = SVC(kernel=kernel, max_iter=n, degree=degree, verbose=False)
    clf.fit(trnX, trnY)
    prd_tst_Y: array = clf.predict(tstX)
    prd_trn_Y: array = clf.predict(trnX)
    y_tst_values.append(score_fn(tstY, prd_tst_Y))
    y_trn_values.append(score_fn(trnY, prd_trn_Y))

figure()
plot_multiline_chart(
    nr_iterations,
    {"Train": y_trn_values, "Test": y_tst_values},
    title=f"SVM overfitting study for {kernel_name}",
    xlabel="nr_iterations",
    ylabel=str(eval_metric),
    percentage=True,
)
savefig(f"images/{file_tag}_svm_{eval_metric}_overfitting.png")