# Construction du classifieur


In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import random
import pandas as pd
import cv2
import os

from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    average_precision_score,
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

## Récupération des jeux


In [2]:
# List the label files; each CSV file name is reused to locate the
# matching positive and negative images.
f_labels = [
    f
    for f in os.listdir(os.path.join("data", "train", "labels_csv"))
    if f.endswith(".csv")
]
print(f_labels)

# Parallel lists: entry i of each list comes from the same label file.
img_pos_train = []
img_neg_train = []
bbox_train = []

for f in f_labels:
    img_name = f.replace(".csv", ".jpg")

    # Load both images before appending anything: if either one is missing
    # the whole sample is skipped, so the three lists stay index-aligned.
    # (Appending the positive image first and then skipping on a missing
    # negative would leave img_pos_train longer than the other two lists.)
    try:
        img_pos = plt.imread(
            os.path.join("data", "train", "images", "pos", img_name)
        )
        img_neg = plt.imread(
            os.path.join("data", "train", "images", "neg", img_name)
        )
    except FileNotFoundError:
        continue

    # NOTE(review): np.loadtxt returns a 1-D array for a single-row CSV and a
    # 2-D array otherwise -- confirm downstream code handles both shapes.
    bbox = np.loadtxt(os.path.join("data", "train", "labels_csv", f), delimiter=",")

    img_pos_train.append(img_pos)
    img_neg_train.append(img_neg)
    bbox_train.append(bbox)

['0000.csv', '0001.csv', '0002.csv', '0003.csv', '0004.csv', '0005.csv', '0006.csv', '0007.csv', '0008.csv', '0009.csv', '0010.csv', '0011.csv', '0012.csv', '0013.csv', '0014.csv', '0015.csv', '0016.csv', '0017.csv', '0018.csv', '0019.csv', '0020.csv', '0021.csv', '0022.csv', '0023.csv', '0024.csv', '0025.csv', '0026.csv', '0027.csv', '0028.csv', '0029.csv', '0030.csv', '0031.csv', '0032.csv', '0033.csv', '0034.csv', '0035.csv', '0036.csv', '0037.csv', '0038.csv', '0039.csv', '0040.csv', '0041.csv', '0042.csv', '0043.csv', '0044.csv', '0045.csv', '0046.csv', '0047.csv', '0048.csv', '0049.csv', '0050.csv', '0051.csv', '0052.csv', '0053.csv', '0054.csv', '0055.csv', '0056.csv', '0057.csv', '0058.csv', '0059.csv', '0060.csv', '0061.csv', '0062.csv', '0063.csv', '0064.csv', '0065.csv', '0066.csv', '0067.csv', '0068.csv', '0069.csv', '0070.csv', '0071.csv', '0072.csv', '0073.csv', '0074.csv', '0075.csv', '0076.csv', '0077.csv', '0078.csv', '0079.csv', '0080.csv', '0081.csv', '0082.csv', '00

## Découper les écocups et les négatifs


## Obtenir des features


## Diviser les jeux pour obtenir un jeu de validation


## Choix du classifieur


Nous allons choisir le classifieur le plus efficace parmi un certain nombre de classifieurs.


In [None]:
def test_model(
    name, model, X_train, y_train, X_validation, y_validation, df_resultat_clf
):
    """Fit *model*, score it on the validation set and append a result row.

    Args:
        name: Display name stored in the ``"model"`` column.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features passed to ``model.fit``.
        y_train: Training labels passed to ``model.fit``.
        X_validation: Validation features passed to ``model.predict``.
        y_validation: Ground-truth validation labels.
        df_resultat_clf: DataFrame of previous results; it is not modified.

    Returns:
        A new DataFrame: ``df_resultat_clf`` plus one row holding the
        accuracy, error percentage, recall, precision, F1 score and average
        precision of this model.
    """
    print(f"Testing {name}...")

    model.fit(X_train, y_train)
    y_pred = model.predict(X_validation)

    accuracy = accuracy_score(y_validation, y_pred)
    error = (1 - accuracy) * 100

    # Recall and precision are computed with scikit-learn's default settings
    # (presumably binary labels -- confirm against the caller).
    rappel = recall_score(y_validation, y_pred)
    precision = precision_score(y_validation, y_pred)

    # Harmonic mean of precision and recall. When both are 0 the ratio is
    # undefined, so report 0.0 instead of dividing by zero.
    pr_sum = precision + rappel
    f1 = 2 * (precision * rappel) / pr_sum if pr_sum else 0.0

    # The result is stored under a different local name on purpose: assigning
    # to ``average_precision_score`` inside this function would make the name
    # local and the call would raise UnboundLocalError.
    # NOTE(review): this is computed from hard predictions, not from
    # continuous scores -- confirm that this is intended.
    avg_precision = average_precision_score(y_validation, y_pred)

    new_row = pd.DataFrame(
        [
            {
                "model": name,
                "accuracy": accuracy,
                "error%": error,
                "rappel": rappel,
                "precision": precision,
                "f1_score": f1,
                "average_precision_score": avg_precision,
            }
        ]
    )
    df_resultat_clf = pd.concat([df_resultat_clf, new_row], ignore_index=True)
    return df_resultat_clf

In [None]:
def get_models_dict():
    """Build the catalogue of candidate classifiers, keyed by display name.

    Returns:
        A dict mapping a human-readable label (including the swept
        hyper-parameter value, where there is one) to a fresh, unfitted
        scikit-learn estimator. Insertion order is: KNN, Decision Tree,
        Random Forest, SVC, Linear SVC, SVC per kernel, Logistic Regression,
        AdaBoost, Gradient Boosting.
    """
    knn_sizes = (3, 5, 10, 15, 20, 25, 50, 75, 100, 150, 200, 250, 300)
    forest_sizes = (3, 5, 10, 15, 20, 25, 50, 75, 100, 150, 200, 250, 300)
    kernels = ("poly", "rbf", "sigmoid")
    max_iters = (25, 50, 100, 200, 300, 400, 500)
    boost_sizes = (10, 25, 50, 100, 150, 200, 250)
    rates = (0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5)

    # K-nearest neighbours, one entry per neighbourhood size.
    catalogue = {
        f"KNN (k={k})": KNeighborsClassifier(n_neighbors=k) for k in knn_sizes
    }

    # Single decision tree with default settings.
    catalogue["Decision Tree"] = DecisionTreeClassifier()

    # Random forests, one entry per ensemble size.
    catalogue.update(
        {
            f"Random Forest (n_estimators={n})": RandomForestClassifier(
                n_estimators=n
            )
            for n in forest_sizes
        }
    )

    # Support-vector machines: default SVC, linear SVC, then one per kernel.
    catalogue["SVC"] = SVC()
    catalogue["Linear SVC"] = LinearSVC()
    catalogue.update(
        {f"SVC (kernel={kern})": SVC(kernel=kern) for kern in kernels}
    )

    # Logistic regression, one entry per iteration cap.
    catalogue.update(
        {
            f"Logistic Regression (max_iter={cap})": LogisticRegression(
                max_iter=cap
            )
            for cap in max_iters
        }
    )

    # AdaBoost, one entry per ensemble size.
    catalogue.update(
        {
            f"AdaBoost (n_estimators={n})": AdaBoostClassifier(n_estimators=n)
            for n in boost_sizes
        }
    )

    # Gradient boosting, one entry per learning rate.
    catalogue.update(
        {
            f"Gradient Boosting (learning_rate={rate})": GradientBoostingClassifier(
                learning_rate=rate
            )
            for rate in rates
        }
    )

    return catalogue

In [None]:
# Disabled benchmark driver (kept commented out). When enabled it starts from
# an empty results frame, runs test_model() on every entry returned by
# get_models_dict(), then sorts the results by ascending "error%" and prints
# them.
# NOTE(review): X_train, y_train, X_validation and y_validation are not
# defined in the visible part of this notebook -- they must exist before this
# cell is re-enabled.
# df_resultat_clf = pd.DataFrame(
#     columns=["model", "accuracy", "error%", "rappel", "precision", "f1_score", "average_precision_score"]
# )

# models_dict = get_models_dict()
# for name, model in models_dict.items():
#     df_resultat_clf = test_model(
#         name, model, X_train, y_train, X_validation, y_validation, df_resultat_clf
#     )

# df_resultat_clf = df_resultat_clf.sort_values(by="error%")
# print(df_resultat_clf)