# Import Libraries

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score,precision_score,recall_score


# Data Loading and Filtering
# Train-Test Split

In [2]:
# Paths of the pre-split, preprocessed datasets.
TRAIN_CSV = "../fire_test/train_data_preprocessed.csv"
TEST_CSV = "../fire_test/test_data_preprocessed.csv"

# The reads are the only statements that can raise FileNotFoundError, so they
# are the ones guarded. The error is re-raised after the message because every
# later cell needs these frames; continuing would only produce a NameError.
try:
    df_train = pd.read_csv(TRAIN_CSV)
    df_test = pd.read_csv(TEST_CSV)
except FileNotFoundError as exc:
    missing = exc.filename or f"{TRAIN_CSV} / {TEST_CSV}"
    print(f"Erreur: Le fichier {missing} est introuvable.")
    raise

# Columns removed from the feature matrices (the target itself).
drop_cols = ['class']

# Drop the coordinate columns; errors='ignore' tolerates files that do not
# contain them.
df_train = df_train.drop(columns=["latitude", "longitude"], errors='ignore')
df_test = df_test.drop(columns=["latitude", "longitude"], errors='ignore')

# Separate the target column from the features for both splits.
y_train = df_train['class']
X_train = df_train.drop(columns=drop_cols, errors='ignore')
y_test = df_test['class']
X_test = df_test.drop(columns=drop_cols, errors='ignore')



# Train-Test Split

In [3]:
# Not needed here: the train and test sets are loaded above from separate preprocessed CSV files.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Data Balancing Functions

In [4]:
import pandas as pd
from collections import Counter
def random_undersampling(X, y):
    """Shrink every larger class down to the size of the smallest class.

    Rows of each class that has more samples than the smallest class are
    randomly sampled without replacement (fixed ``random_state=42``); classes
    already at the minimum size are kept untouched and in their original
    order. The class counts after resampling are printed.

    Args:
        X: Feature DataFrame.
        y: Target Series aligned with ``X``. Its name is used as the
            temporary target column; an unnamed Series is treated as
            ``'class'``.

    Returns:
        Tuple ``(X_resampled, y_resampled)``; the per-class parts are
        concatenated in sorted label order.

    Raises:
        ValueError: If ``y`` contains no labels.
    """
    target = y.name if y.name is not None else "class"
    data = pd.concat([X, y.rename(target)], axis=1)

    # value_counts() is ordered by frequency, not by label, so the counts are
    # looked up per label instead of being unpacked positionally.
    class_counts = data[target].value_counts()
    if class_counts.empty:
        raise ValueError("y must contain at least one label")
    n_min = int(class_counts.min())

    parts = []
    for label in sorted(class_counts.index):
        subset = data[data[target] == label]
        # Only classes above the minimum size are reduced, whichever label
        # happens to be the majority.
        if len(subset) > n_min:
            subset = subset.sample(n_min, random_state=42)
        parts.append(subset)

    df_under = pd.concat(parts, axis=0)
    print(f"Après Undersampling: {Counter(df_under[target])}")
    return df_under.drop(target, axis=1), df_under[target]

def random_oversampling(X, y):
    """Grow every smaller class up to the size of the largest class.

    Each class with fewer samples than the largest class is replaced by a
    sample of that size drawn with replacement (fixed ``random_state=42``);
    classes already at the maximum size are kept untouched and in their
    original order. The class counts after resampling are printed.

    Args:
        X: Feature DataFrame.
        y: Target Series aligned with ``X``. Its name is used as the
            temporary target column; an unnamed Series is treated as
            ``'class'``.

    Returns:
        Tuple ``(X_resampled, y_resampled)``; the per-class parts are
        concatenated in sorted label order.

    Raises:
        ValueError: If ``y`` contains no labels.
    """
    target = y.name if y.name is not None else "class"
    data = pd.concat([X, y.rename(target)], axis=1)

    # value_counts() is ordered by frequency, not by label, so the counts are
    # looked up per label instead of being unpacked positionally.
    class_counts = data[target].value_counts()
    if class_counts.empty:
        raise ValueError("y must contain at least one label")
    n_max = int(class_counts.max())

    parts = []
    for label in sorted(class_counts.index):
        subset = data[data[target] == label]
        # Only classes below the maximum size are resampled, whichever label
        # happens to be the minority. The draw is a bootstrap of size n_max,
        # so an original row is not guaranteed to appear in the result.
        if len(subset) < n_max:
            subset = subset.sample(n_max, replace=True, random_state=42)
        parts.append(subset)

    df_over = pd.concat(parts, axis=0)
    print(f"Après Oversampling: {Counter(df_over[target])}")
    return df_over.drop(target, axis=1), df_over[target]

# Class distribution of the training labels before any resampling.
print(f"Avant : {Counter(y_train)}")

# Build the two randomly resampled training sets; each helper prints the
# class counts it produced.
X_train_over, y_train_over = random_oversampling(X=X_train, y=y_train)
X_train_under, y_train_under = random_undersampling(X=X_train, y=y_train)



Avant : Counter({0: 177055, 1: 6226})
Après Oversampling: Counter({0: 177055, 1: 177055})
Après Undersampling: Counter({0: 6226, 1: 6226})


# SMOTE Oversampling

In [5]:
from imblearn.over_sampling import SMOTE

# Balance the training set with SMOTE (seeded for reproducibility), then
# report the resulting class counts.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
smote_class_counts = Counter(y_train_smote)
print(f"Après SMOTE: {smote_class_counts}")

found 0 physical cores < 1
  File "c:\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


Après SMOTE: Counter({0: 177055, 1: 177055})


# KNN Implementation from Scratch

In [6]:
from collections import Counter
import numpy as np 
class KNN_Scratch:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Prediction compares each query sample against every stored training
    sample, so the cost per query grows linearly with the training-set size.

    Attributes set by ``fit``:
        X_train: Training samples as a NumPy array.
        y_train: Training labels as a NumPy array.
    """

    def __init__(self, k=3):
        """Store the number of neighbours to vote with.

        Args:
            k: Number of nearest neighbours used for the majority vote.

        Raises:
            ValueError: If ``k`` is smaller than 1 (no neighbour could vote).
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def fit(self, X, y):
        """Memorise the training data.

        Args:
            X: Training samples (array-like or DataFrame), one sample per row.
            y: Training labels, one per sample.

        Returns:
            The fitted instance itself, so calls can be chained.

        Raises:
            ValueError: If the training set is empty or ``X`` and ``y`` have
                different lengths.
        """
        self.X_train = np.array(X)
        self.y_train = np.array(y)
        if len(self.X_train) == 0:
            raise ValueError("cannot fit KNN on an empty training set")
        if len(self.X_train) != len(self.y_train):
            raise ValueError(
                f"X and y have inconsistent lengths: "
                f"{len(self.X_train)} != {len(self.y_train)}"
            )
        return self

    def predict(self, X):
        """Predict a label for every sample in ``X``.

        Args:
            X: Query samples (array-like or DataFrame), one sample per row.

        Returns:
            NumPy array with one predicted label per query sample.
        """
        X = np.array(X)
        return np.array([self._predict(x) for x in X])

    def _predict(self, x):
        """Return the majority label among the ``k`` training samples closest to ``x``."""
        # One vectorised pass over the whole training set instead of a Python
        # loop per training row. The reshape flattens each sample so that
        # single-feature (1-D) training data is handled as well.
        diffs = (self.X_train - x).reshape(len(self.X_train), -1)
        distances = np.sqrt(np.sum(diffs ** 2, axis=1))
        k_indices = np.argsort(distances)[:self.k]
        # Counter keeps first-seen order on ties, i.e. the label of the
        # nearer neighbour wins.
        votes = Counter(self.y_train[k_indices])
        return votes.most_common(1)[0][0]

In [8]:
# Train the from-scratch KNN on the SMOTE-balanced data and score it on the
# test set. Every prediction scans all training rows, so this cell is
# expensive on the full dataset.
knn_scratch = KNN_Scratch(k=3)
knn_scratch.fit(X=X_train_smote, y=y_train_smote)
y_pred_scratch = knn_scratch.predict(X=X_test)

print("Évaluation du KNN implémenté à partir de zéro avec SMOTE:")
print(classification_report(y_true=y_test, y_pred=y_pred_scratch))


KeyboardInterrupt: 

# Model Evaluation

In [None]:
def evaluate_knn(X_tr, y_tr, X_te, y_te, k_values=(3, 5)):
    """Fit one KNN classifier per candidate ``k`` and return the best one.

    For every ``k`` a ``KNeighborsClassifier`` is fitted on ``(X_tr, y_tr)``
    and accuracy, precision, recall, F1 (positive label ``1``) and ROC-AUC
    are computed on ``(X_te, y_te)`` and printed. The classifier with the
    highest F1-score is returned; on ties the first candidate wins.

    NOTE(review): ``k`` is selected on the same data the metrics are reported
    on, so the printed scores of the retained model are optimistic. Consider
    selecting ``k`` on a separate validation split.

    Args:
        X_tr: Training features.
        y_tr: Training labels.
        X_te: Evaluation features.
        y_te: Evaluation labels.
        k_values: Candidate numbers of neighbours, tried in the given order.

    Returns:
        The fitted ``KNeighborsClassifier`` that achieved the best F1-score.

    Raises:
        ValueError: If ``k_values`` is empty.
    """
    k_values = list(k_values)
    if not k_values:
        raise ValueError("k_values must contain at least one candidate k")

    best_k = None
    best_f1 = None
    best_knn = None

    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_tr, y_tr)

        # Hard predictions, plus the probability in column 1, used as the
        # ROC-AUC score (presumably the positive class 1 — confirm against
        # knn.classes_).
        y_pred = knn.predict(X_te)
        y_proba = knn.predict_proba(X_te)[:, 1]

        acc = accuracy_score(y_te, y_pred)
        prec = precision_score(y_te, y_pred, pos_label=1)
        rec = recall_score(y_te, y_pred, pos_label=1)
        f1 = f1_score(y_te, y_pred, pos_label=1)
        roc_auc = roc_auc_score(y_te, y_proba)

        print(f"k={k}")
        print(f"  Accuracy  : {acc:.4f}")
        print(f"  Precision : {prec:.4f}")
        print(f"  Recall    : {rec:.4f}")
        print(f"  F1-score  : {f1:.4f}")
        print(f"  ROC-AUC   : {roc_auc:.4f}")
        print("-" * 30)

        # Strict comparison keeps the first candidate on ties. The fitted
        # model is kept so the winner does not have to be trained again.
        if best_f1 is None or f1 > best_f1:
            best_k, best_f1, best_knn = k, f1, knn

    print(f"Meilleur k retenu : {best_k}")

    return best_knn


# Evaluate KNN trained on the undersampled data and keep the returned classifier.
model = evaluate_knn(
    X_tr=X_train_under,
    y_tr=y_train_under,
    X_te=X_test,
    y_te=y_test,
)



k=3
  Accuracy  : 0.9155
  Precision : 0.2775
  Recall    : 0.9261
  F1-score  : 0.4270
  ROC-AUC   : 0.9618
------------------------------
k=5
  Accuracy  : 0.9073
  Precision : 0.2581
  Recall    : 0.9216
  F1-score  : 0.4033
  ROC-AUC   : 0.9660
------------------------------
Meilleur k retenu : 3


In [9]:
# Same evaluation on the randomly oversampled training set...
evaluate_knn(
    X_tr=X_train_over,
    y_tr=y_train_over,
    X_te=X_test,
    y_te=y_test,
)
# ...and on the SMOTE-balanced training set.
evaluate_knn(
    X_tr=X_train_smote,
    y_tr=y_train_smote,
    X_te=X_test,
    y_te=y_test,
)

k=3
  Accuracy  : 0.9904
  Precision : 0.8344
  Recall    : 0.8934
  F1-score  : 0.8629
  ROC-AUC   : 0.9478
------------------------------
k=5
  Accuracy  : 0.9873
  Precision : 0.7677
  Recall    : 0.8998
  F1-score  : 0.8285
  ROC-AUC   : 0.9542
------------------------------
Meilleur k retenu : 3
k=3
  Accuracy  : 0.9874
  Precision : 0.7663
  Recall    : 0.9037
  F1-score  : 0.8294
  ROC-AUC   : 0.9525
------------------------------
k=5
  Accuracy  : 0.9839
  Precision : 0.7042
  Recall    : 0.9082
  F1-score  : 0.7933
  ROC-AUC   : 0.9573
------------------------------
Meilleur k retenu : 3
