In [1]:
import pandas as pd
import numpy as np
import duckdb as ddb
import seaborn
import sklearn
import imblearn
from time import time

In [3]:
from google.colab import files
uploaded = files.upload()

Saving bank_data.parquet to bank_data.parquet


In [4]:
data = pd.read_parquet("bank_data.parquet")

In [5]:
data["AvgDiffDate"] = data[["DiffDateTr1","DiffDateTr2","DiffDateTr3"]].mean(axis=1)

In [6]:
imp_vars = ["Montant","D2CB","CA3TRetMtt","TauxImpNb_RB","TauxImpNB_CPM","Heure","VerifianceCPT2","ScoringFP1","ScoringFP2","ScoringFP3","DateTransaction","FlagImpaye"]
data = data[imp_vars]

In [7]:
X_train = ddb.query("SELECT * EXCLUDE (DateTransaction,FlagImpaye) FROM data WHERE DateTransaction <= '2017-08-31'").df()
X_test = ddb.query("SELECT * EXCLUDE (DateTransaction,FlagImpaye) FROM data WHERE DateTransaction > '2017-08-31'").df()
y_train = np.array(ddb.query("SELECT FlagImpaye FROM data WHERE DateTransaction <= '2017-08-31'").df()["FlagImpaye"])
y_test = np.array(ddb.query("SELECT FlagImpaye FROM data WHERE DateTransaction > '2017-08-31'").df()["FlagImpaye"])

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# # Calculer les poids inversement proportionnels à la fréquence de classe
# train_class_weights = {
#     0: pd.Series(y_train).value_counts(normalize=True).iloc[1],
#     1: pd.Series(y_train).value_counts(normalize=True).iloc[0]}  # Basé sur le ratio 0.05% / 99.95%
# train_sample_weights = np.array([train_class_weights[int(label)] for label in y_train])

# test_class_weights = {
#     0: pd.Series(y_test).value_counts(normalize=True).iloc[1],
#     1: pd.Series(y_test).value_counts(normalize=True).iloc[0]}  # Basé sur le ratio 0.05% / 99.95%
# test_sample_weights = np.array([test_class_weights[int(label)] for label in y_test])

In [15]:
from sklearn.utils.class_weight import compute_class_weight
# Calcul des poids des classes
train_class_weights = compute_class_weight('balanced', classes=np.unique(y_train.astype(int)), y=y_train.astype(int))
train_sample_weights = np.ones(len(y_train))
train_sample_weights[y_train.astype(int) == 0] = train_class_weights[0]  # Attribuer un poids plus élevé à la classe minoritaire
train_sample_weights[y_train.astype(int) == 1] = train_class_weights[1]  # Attribuer un poids plus élevé à la classe minoritaire

test_class_weights = compute_class_weight('balanced', classes=np.unique(y_test.astype(int)), y=y_test.astype(int))
test_sample_weights = np.ones(len(y_test))
test_sample_weights[y_test.astype(int) == 0] = test_class_weights[0]  # Attribuer un poids plus élevé à la classe minoritaire
test_sample_weights[y_test.astype(int) == 1] = test_class_weights[1]  # Attribuer un poids plus élevé à la classe minoritaire

### K-MEANS pondéré

In [10]:
import numpy as np
from sklearn.cluster import KMeans

In [11]:
class WeightedKMeans(KMeans):
    def fit(self, X, sample_weight=None):
        self.sample_weight = sample_weight
        return super().fit(X, sample_weight=sample_weight)

    def _e_step(self, X):
        weight = self.sample_weight[:, np.newaxis] if self.sample_weight is not None else 1.0
        return super()._e_step(X * np.sqrt(weight))

In [12]:
def find_best_mapping(true, pred):
    classes = np.unique(true)
    clusters = np.unique(pred)
    mapping = {}
    for cluster in clusters:
        mask = pred == cluster
        counts = np.bincount(true[mask])
        mapping[cluster] = counts.argmax()
    return np.array([mapping[c] for c in pred])

In [13]:
from sklearn.preprocessing import LabelEncoder
def validation(model, X, y, sample_weights, kf, scoring):
  # Liste pour stocker les scores de silhouette
  train_scores = []
  val_scores = []

  # Effectuer la validation croisée
  for train_index, val_index in kf.split(X,y):
      # Étape 1 : Découper selon les indices des folds
      x_train, x_val = X[train_index], X[val_index]
      y_train, y_val = y[train_index].astype(int), y[val_index].astype(int)
      train_weights, val_weights = sample_weights[train_index], sample_weights[val_index]

      # Étape 2 : Entraîner le modèle sur les données d'entraînement
      if hasattr(model, "sample_weight"):
        model.fit(x_train, sample_weight=train_weights)
      else:
        model.fit(x_train)

      # Étape 3 : Prédire les clusters pour les données de validation
      train_cluster_labels = model.predict(x_train)
      val_cluster_labels = model.predict(x_val)

      # Étape 4 : Encoder les étiquettes réelles et prédites
      le = LabelEncoder()
      train_true_encoded = le.fit_transform(y_train)
      val_true_encoded = le.transform(y_val)

      train_cluster_encoded = le.transform(train_cluster_labels)
      val_cluster_encoded = le.transform(val_cluster_labels)
      # Étape 5 : Trouver la meilleure correspondance entre clusters et classes réelles
      train_mapped_labels = find_best_mapping(train_true_encoded, train_cluster_encoded)
      val_mapped_labels = find_best_mapping(val_true_encoded, val_cluster_encoded)
      # Etape 6 : Calculer la métrique
      score = scoring(train_true_encoded, train_mapped_labels, average='weighted', sample_weight=train_weights)
      train_scores.append(score)
      score = scoring(val_true_encoded, val_mapped_labels, average='weighted', sample_weight=val_weights)
      val_scores.append(score)

  # Étape 7 : Calculer la moyenne et l'écart-type des scores de silhouette
  train_mean_score = np.mean(train_scores)
  train_std_score = np.std(train_scores)
  val_mean_score = np.mean(val_scores)
  val_std_score = np.std(val_scores)
  return train_mean_score, train_std_score, val_mean_score, val_std_score

In [16]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Définir le nombre de clusters et le nombre de plis pour la validation croisée
n_clusters = 2
n_splits = 5

# Initialiser le modèle de clustering (KMeans dans cet exemple)
weighted_kmeans = WeightedKMeans(n_clusters=n_clusters, random_state=0)

# Créer l'objet KFold pour la validation croisée
kf = StratifiedKFold(n_splits=n_splits)

start = time()
train_mean_score, train_std_score, val_mean_score, val_std_score = validation(weighted_kmeans, X_train_scaled, y_train, sample_weights=train_sample_weights, kf=kf, scoring=f1_score)
end = time()

print(f"F1-Score moyen (train): {train_mean_score:.3f} (+/- {train_std_score:.3f})")
print(f"F1-Score moyen (val): {val_mean_score:.3f} (+/- {val_std_score:.3f})")
print(f"Temps de calcul: {end-start}")

F1-Score moyen (train): 0.337 (+/- 0.002)
F1-Score moyen (val): 0.337 (+/- 0.002)
Temps de calcul: 31.046574115753174


#### Métriques de classification

In [17]:
# Création et entraînement du modèle
n_clusters = 2  # Ajustez selon vos besoins
weighted_kmeans = WeightedKMeans(n_clusters=n_clusters, random_state=0)
weighted_kmeans.fit(X_train_scaled, sample_weight=train_sample_weights)

In [18]:
cluster_labels = weighted_kmeans.predict(X_train_scaled)
le = LabelEncoder()
true_encoded = le.fit_transform(y_train.astype(int))
cluster_encoded = le.transform(cluster_labels)

mapped_labels = find_best_mapping(true_encoded, cluster_encoded)
print(classification_report(true_encoded, mapped_labels, sample_weight=train_sample_weights))

accuracy = accuracy_score(true_encoded, mapped_labels, sample_weight=train_sample_weights)
precision = precision_score(true_encoded, mapped_labels, average='weighted', sample_weight=train_sample_weights)
recall = recall_score(true_encoded, mapped_labels, average='weighted', sample_weight=train_sample_weights)
f1 = f1_score(true_encoded, mapped_labels, average='weighted', sample_weight=train_sample_weights)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

              precision    recall  f1-score   support

           0       0.50      1.00      0.67 1944234.0001054064
           1       1.00      0.00      0.01 1944234.0000000414

    accuracy                           0.50 3888468.000105448
   macro avg       0.75      0.50      0.34 3888468.000105448
weighted avg       0.75      0.50      0.34 3888468.000105448

Accuracy: 0.5022
Precision: 0.7499
Recall: 0.5022
F1-score: 0.3382


In [19]:
cluster_labels = weighted_kmeans.predict(X_test_scaled)
le = LabelEncoder()
true_encoded = le.fit_transform(y_test.astype(int))
cluster_encoded = le.transform(cluster_labels)

mapped_labels = find_best_mapping(true_encoded, cluster_encoded)
print(classification_report(true_encoded, mapped_labels, sample_weight=test_sample_weights))

accuracy = accuracy_score(true_encoded, mapped_labels, sample_weight=test_sample_weights)
precision = precision_score(true_encoded, mapped_labels, average='weighted', sample_weight=test_sample_weights)
recall = recall_score(true_encoded, mapped_labels, average='weighted', sample_weight=test_sample_weights)
f1 = f1_score(true_encoded, mapped_labels, average='weighted', sample_weight=test_sample_weights)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

              precision    recall  f1-score   support

           0       0.50      1.00      0.67 379152.5000034354
           1       1.00      0.00      0.00 379152.50000005076

    accuracy                           0.50 758305.0000034862
   macro avg       0.75      0.50      0.34 758305.0000034862
weighted avg       0.75      0.50      0.34 758305.0000034862

Accuracy: 0.5007
Precision: 0.7480
Recall: 0.5007
F1-score: 0.3350


In [20]:
# Obtenir les étiquettes de cluster pour chaque point de données
cluster_labels = weighted_kmeans.predict(X_train_scaled)

# Analyser les clusters pour identifier les comportements frauduleux potentiels
def class_cluster_ratio(y_true, clust_preds, cluster):
    cluster_true_labels = y_train.astype(int)[cluster_labels == cluster]
    clus_size = len(cluster_true_labels)
    fraud_freq = np.sum(cluster_true_labels)
    fraud_ratio = np.sum(cluster_true_labels) / len(cluster_true_labels)
    return clus_size, fraud_freq, fraud_ratio

for cluster in range(n_clusters):
  res = class_cluster_ratio(y_train, clust_preds=cluster_labels, cluster=cluster)
  print(f"Cluster {cluster}: Cluster size = {res[0]} - Fraud_flag = {res[1]} - Fraud ratio = {res[2]:.2%}")

Cluster 0: Cluster size = 125 - Fraud_flag = 102 - Fraud ratio = 81.60%
Cluster 1: Cluster size = 3888343 - Fraud_flag = 23244 - Fraud ratio = 0.60%


### DBSCAN

In [21]:
from sklearn.cluster import DBSCAN

In [22]:
# Adaptation de DBSCAN pour prendre en compte les poids
class WeightedDBSCAN(DBSCAN):
    def fit(self, X, sample_weight=None):
        self.sample_weight_ = sample_weight
        return super().fit(X, sample_weight=sample_weight)

    def _weighted_core_samples(self, neighbors, working_memory):
        n_samples = neighbors.shape[0]
        n_neighbors = np.zeros(n_samples, dtype=int)
        for i in range(n_samples):
            n_neighbors[i] = np.sum(self.sample_weight_[neighbors[i]])
        return n_neighbors >= self.min_samples

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Définir le nombre de clusters et le nombre de plis pour la validation croisée
eps_ = 0.5
min_samples_ = 5
n_splits = 5

# Initialiser le modèle de clustering (KMeans dans cet exemple)
weighted_dbscan = WeightedDBSCAN(eps=eps_, min_samples=min_samples_)

# Créer l'objet KFold pour la validation croisée
kf = StratifiedKFold(n_splits=n_splits)

start = time()
train_mean_score, train_std_score, val_mean_score, val_std_score = validation(weighted_dbscan, X_train_scaled, y_train, sample_weights=train_sample_weights, kf=kf, scoring=f1_score)
end = time()

print(f"F1-Score moyen (train): {train_mean_score:.3f} (+/- {train_std_score:.3f})")
print(f"F1-Score moyen (val): {val_mean_score:.3f} (+/- {val_std_score:.3f})")
print(f"Temps de calcul: {end-start}")

#### Métriques de classification

In [None]:
# Création et entraînement du modèle
eps_ = 0.5
min_samples_ = 5
weighted_dbscan = WeightedDBSCAN(eps=eps_, min_samples_=5)
weighted_dbscan.fit(X_train_scaled, sample_weight=train_sample_weights)

In [None]:
cluster_labels = weighted_dbscan.predict(X_train_scaled)
le = LabelEncoder()
true_encoded = le.fit_transform(y_train.astype(int))
cluster_encoded = le.transform(cluster_labels)

mapped_labels = find_best_mapping(true_encoded, cluster_encoded)
print(classification_report(true_encoded, mapped_labels, sample_weight=train_sample_weights))

accuracy = accuracy_score(true_encoded, mapped_labels, sample_weight=train_sample_weights)
precision = precision_score(true_encoded, mapped_labels, average='weighted', sample_weight=train_sample_weights)
recall = recall_score(true_encoded, mapped_labels, average='weighted', sample_weight=train_sample_weights)
f1 = f1_score(true_encoded, mapped_labels, average='weighted', sample_weight=train_sample_weights)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

              precision    recall  f1-score   support

           0       0.50      1.00      0.67 23205.832787552674
           1       1.00      0.00      0.01 23205.832788652417

    accuracy                           0.50 46411.665576205094
   macro avg       0.75      0.50      0.34 46411.665576205094
weighted avg       0.75      0.50      0.34 46411.665576205094

Accuracy: 0.5022
Precision: 0.7499
Recall: 0.5022
F1-score: 0.3382


In [None]:
cluster_labels = weighted_dbscan.predict(X_test_scaled)
le = LabelEncoder()
true_encoded = le.fit_transform(y_test.astype(int))
cluster_encoded = le.transform(cluster_labels)

mapped_labels = find_best_mapping(true_encoded, cluster_encoded)
print(classification_report(true_encoded, mapped_labels, sample_weight=test_sample_weights))

accuracy = accuracy_score(true_encoded, mapped_labels, sample_weight=test_sample_weights)
precision = precision_score(true_encoded, mapped_labels, average='weighted', sample_weight=test_sample_weights)
recall = recall_score(true_encoded, mapped_labels, average='weighted', sample_weight=test_sample_weights)
f1 = f1_score(true_encoded, mapped_labels, average='weighted', sample_weight=test_sample_weights)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

              precision    recall  f1-score   support

           0       0.50      1.00      0.67 6590.699974284719
           1       1.00      0.00      0.00 6590.699974284754

    accuracy                           0.50 13181.399948569473
   macro avg       0.75      0.50      0.34 13181.399948569473
weighted avg       0.75      0.50      0.34 13181.399948569473

Accuracy: 0.5007
Precision: 0.7480
Recall: 0.5007
F1-score: 0.3350


In [None]:
# Obtenir les étiquettes de cluster pour chaque point de données
cluster_labels = weighted_dbscan.predict(X_train_scaled)

# Analyser les clusters pour identifier les comportements frauduleux potentiels
def class_cluster_ratio(y_true, clust_preds, cluster):
    cluster_true_labels = y_train.astype(int)[cluster_labels == cluster]
    clus_size = len(cluster_true_labels)
    fraud_freq = np.sum(cluster_true_labels)
    fraud_ratio = np.sum(cluster_true_labels) / len(cluster_true_labels)
    return clus_size, fraud_freq, fraud_ratio

for cluster in range(n_clusters):
  res = class_cluster_ratio(y_train, clust_preds=cluster_labels, cluster=cluster)
  print(f"Cluster {cluster}: Cluster size = {res[0]} - Fraud_flag = {res[1]} - Fraud ratio = {res[2]:.2%}")

Cluster 0: Cluster size = 125 - Fraud_flag = 102 - Fraud ratio = 81.60%
Cluster 1: Cluster size = 3888343 - Fraud_flag = 23244 - Fraud ratio = 0.60%


### HDBSCAN

In [None]:
!pip instal hdbscan

In [None]:
import hdbscan

# Adaptation de HDBSCAN pour prendre en compte les poids
class WeightedHDBSCAN(hdbscan.HDBSCAN):
    def fit(self, X, sample_weight=None):
        self._raw_data = X
        self._raw_data = self._validate_data(X, accept_sparse='csr')
        self.sample_weight_ = sample_weight
        self._min_samples = max(self.min_samples, self.min_cluster_size)
        self._condensed_tree_ = self._raw_tree = None
        self._single_linkage_tree_ = self._mst = None
        self._cluster_tree = self._condensed_tree = None
        self._allow_single_cluster = False
        self._small_cluster_threshold = 0

        self._clusterer = hdbscan.hdbscan_.HDBSCAN_CLUSTERING(
            self._raw_data,
            self.min_cluster_size,
            self._min_samples,
            self.alpha,
            self.cluster_selection_method,
            self._allow_single_cluster,
            self.metric,
            self.p,
            self._small_cluster_threshold,
            self.algorithm,
            self.leaf_size,
            self.approx_min_span_tree,
            self.gen_min_span_tree,
            self.core_dist_n_jobs,
            self.sample_weight_,
        )
        self._clusterer.run()
        return self

In [None]:
# # Application de HDBSCAN pondéré
# weighted_hdbscan = WeightedHDBSCAN(min_cluster_size=5, min_samples=1)
# clusters = weighted_hdbscan.fit_predict(X_train_scaled, sample_weight=train_sample_weights)

# # Identifier les clusters d'anomalies (potentiellement frauduleux)
# unique_clusters, counts = np.unique(clusters, return_counts=True)
# anomaly_clusters = unique_clusters[counts < np.mean(counts)]

# # Marquer les points comme potentiellement frauduleux
# potential_fraud = np.isin(clusters, anomaly_clusters)

# # Évaluation (exemple simplifié)
# from sklearn.metrics import classification_report
# print(classification_report(y, potential_fraud))


In [None]:
# Application de HDBSCAN pondéré
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Définir le nombre de clusters et le nombre de plis pour la validation croisée
min_cluster_size_ = 5
min_samples_ = 1
n_splits = 5

weighted_hdbscan = WeightedHDBSCAN(min_cluster_size=5, min_samples=1)

# Créer l'objet KFold pour la validation croisée
kf = StratifiedKFold(n_splits=n_splits)

start = time()
train_mean_score, train_std_score, val_mean_score, val_std_score = validation(weighted_hdbscan, X_train_scaled, y_train, sample_weights=train_sample_weights, kf=kf, scoring=f1_score)
end = time()

print(f"F1-Score moyen (train): {train_mean_score:.3f} (+/- {train_std_score:.3f})")
print(f"F1-Score moyen (val): {val_mean_score:.3f} (+/- {val_std_score:.3f})")
print(f"Temps de calcul: {end-start}")

F1-Score moyen (train): 0.337 (+/- 0.002)
F1-Score moyen (val): 0.337 (+/- 0.002)


#### Métriques de classification

In [None]:
# Création et entraînement du modèle
min_cluster_size_ = 5
min_samples_ = 5
weighted_hdbscan = WeightedHDBSCAN(min_cluster_size=5, min_samples=1)
weighted_hdbscan.fit(X_train_scaled, sample_weight=train_sample_weights)

In [None]:
cluster_labels = weighted_hdbscan.predict(X_train_scaled)
le = LabelEncoder()
true_encoded = le.fit_transform(y_train.astype(int))
cluster_encoded = le.transform(cluster_labels)

mapped_labels = find_best_mapping(true_encoded, cluster_encoded)
print(classification_report(true_encoded, mapped_labels, sample_weight=train_sample_weights))

accuracy = accuracy_score(true_encoded, mapped_labels, sample_weight=train_sample_weights)
precision = precision_score(true_encoded, mapped_labels, average='weighted', sample_weight=train_sample_weights)
recall = recall_score(true_encoded, mapped_labels, average='weighted', sample_weight=train_sample_weights)
f1 = f1_score(true_encoded, mapped_labels, average='weighted', sample_weight=train_sample_weights)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

              precision    recall  f1-score   support

           0       0.50      1.00      0.67 23205.832787552674
           1       1.00      0.00      0.01 23205.832788652417

    accuracy                           0.50 46411.665576205094
   macro avg       0.75      0.50      0.34 46411.665576205094
weighted avg       0.75      0.50      0.34 46411.665576205094

Accuracy: 0.5022
Precision: 0.7499
Recall: 0.5022
F1-score: 0.3382


In [None]:
cluster_labels = weighted_hdbscan.predict(X_test_scaled)
le = LabelEncoder()
true_encoded = le.fit_transform(y_test.astype(int))
cluster_encoded = le.transform(cluster_labels)

mapped_labels = find_best_mapping(true_encoded, cluster_encoded)
print(classification_report(true_encoded, mapped_labels, sample_weight=test_sample_weights))

accuracy = accuracy_score(true_encoded, mapped_labels, sample_weight=test_sample_weights)
precision = precision_score(true_encoded, mapped_labels, average='weighted', sample_weight=test_sample_weights)
recall = recall_score(true_encoded, mapped_labels, average='weighted', sample_weight=test_sample_weights)
f1 = f1_score(true_encoded, mapped_labels, average='weighted', sample_weight=test_sample_weights)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

              precision    recall  f1-score   support

           0       0.50      1.00      0.67 6590.699974284719
           1       1.00      0.00      0.00 6590.699974284754

    accuracy                           0.50 13181.399948569473
   macro avg       0.75      0.50      0.34 13181.399948569473
weighted avg       0.75      0.50      0.34 13181.399948569473

Accuracy: 0.5007
Precision: 0.7480
Recall: 0.5007
F1-score: 0.3350


In [None]:
# Obtenir les étiquettes de cluster pour chaque point de données
cluster_labels = weighted_hdbscan.predict(X_train_scaled)

# Analyser les clusters pour identifier les comportements frauduleux potentiels
def class_cluster_ratio(y_true, clust_preds, cluster):
    cluster_true_labels = y_train.astype(int)[cluster_labels == cluster]
    clus_size = len(cluster_true_labels)
    fraud_freq = np.sum(cluster_true_labels)
    fraud_ratio = np.sum(cluster_true_labels) / len(cluster_true_labels)
    return clus_size, fraud_freq, fraud_ratio

for cluster in range(n_clusters):
  res = class_cluster_ratio(y_train, clust_preds=cluster_labels, cluster=cluster)
  print(f"Cluster {cluster}: Cluster size = {res[0]} - Fraud_flag = {res[1]} - Fraud ratio = {res[2]:.2%}")

Cluster 0: Cluster size = 125 - Fraud_flag = 102 - Fraud ratio = 81.60%
Cluster 1: Cluster size = 3888343 - Fraud_flag = 23244 - Fraud ratio = 0.60%


### Clustering spectral

In [None]:
from sklearn.cluster import SpectralClustering

# Création de la matrice d'affinité pondérée
def weighted_rbf_kernel(X, sample_weights, gamma=1.0):
    pairwise_sq_dists = np.sum((X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2, axis=-1)
    K = np.exp(-gamma * pairwise_sq_dists)
    return K * sample_weights[:, np.newaxis] * sample_weights[np.newaxis, :]

train_affinity_matrix = weighted_rbf_kernel(X_train_scaled, sample_weights)
test_affinity_matrix = weighted_rbf_kernel(X_test_scaled, sample_weights)

In [None]:
from sklearn.preprocessing import LabelEncoder
def spectral_validation(model, X, y, sample_weights, kf, scoring):
  # Liste pour stocker les scores de silhouette
  train_scores = []
  val_scores = []

  # Effectuer la validation croisée
  for train_index, val_index in kf.split(X,y):
      # Étape 1 : Découper selon les indices des folds
      x_train, x_val = X[train_index], X[val_index]
      y_train, y_val = y[train_index].astype(int), y[val_index].astype(int)
      train_weights, val_weights = sample_weights[train_index], sample_weights[val_index]
      train_aff_mat, val_aff_mat = weighted_rbf_kernel(x_train, train_weights), weighted_rbf_kernel(x_val, val_weights)

      # Étape 2 : Entraîner le modèle sur les données d'entraînement
      model.fit(train_affinity_matrix)

      # Étape 3 : Prédire les clusters pour les données de validation
      train_cluster_labels = model.predict(train_aff_mat)
      val_cluster_labels = model.predict(val_aff_mat)

      # Étape 4 : Encoder les étiquettes réelles et prédites
      le = LabelEncoder()
      train_true_encoded = le.fit_transform(y_train)
      val_true_encoded = le.transform(y_val)

      train_cluster_encoded = le.transform(train_cluster_labels)
      val_cluster_encoded = le.transform(val_cluster_labels)
      # Étape 5 : Trouver la meilleure correspondance entre clusters et classes réelles
      train_mapped_labels = find_best_mapping(train_true_encoded, train_cluster_encoded)
      val_mapped_labels = find_best_mapping(val_true_encoded, val_cluster_encoded)
      # Etape 6 : Calculer la métrique
      score = scoring(train_true_encoded, train_mapped_labels, average='weighted', sample_weight=train_weights)
      train_scores.append(score)
      score = scoring(val_true_encoded, val_mapped_labels, average='weighted', sample_weight=val_weights)
      val_scores.append(score)

  # Étape 7 : Calculer la moyenne et l'écart-type des scores de silhouette
  train_mean_score = np.mean(train_scores)
  train_std_score = np.std(train_scores)
  val_mean_score = np.mean(val_scores)
  val_std_score = np.std(val_scores)
  return train_mean_score, train_std_score, val_mean_score, val_std_score

In [None]:
# Application de HDBSCAN pondéré
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Définir le nombre de clusters et le nombre de plis pour la validation croisée
n_cluster_ = 2
n_splits = 5

weighted_spectral = SpectralClustering(n_clusters=n_clusters, affinity='precomputed', random_state=42)

# Créer l'objet KFold pour la validation croisée
kf = StratifiedKFold(n_splits=n_splits)

start = time()
train_mean_score, train_std_score, val_mean_score, val_std_score = validation(weighted_spectral, X_train_scaled, y_train, sample_weights=train_sample_weights, kf=kf, scoring=f1_score)
end = time()

print(f"F1-Score moyen (train): {train_mean_score:.3f} (+/- {train_std_score:.3f})")
print(f"F1-Score moyen (val): {val_mean_score:.3f} (+/- {val_std_score:.3f})")
print(f"Temps de calcul: {end-start}")

F1-Score moyen (train): 0.337 (+/- 0.002)
F1-Score moyen (val): 0.337 (+/- 0.002)


#### Métriques de classification

In [None]:
# Création et entraînement du modèle
n_cluster_ = 5
min_samples_ = 1
weighted_spectral = SpectralClustering(n_cluster=n_cluster_, min_samples=min_samples_)
weighted_spectral.fit(train_affinity_matrix)

In [None]:
cluster_labels = weighted_spectral.predict(train_affinity_matrix)
le = LabelEncoder()
true_encoded = le.fit_transform(y_train.astype(int))
cluster_encoded = le.transform(cluster_labels)

mapped_labels = find_best_mapping(true_encoded, cluster_encoded)
print(classification_report(true_encoded, mapped_labels, sample_weight=train_sample_weights))

accuracy = accuracy_score(true_encoded, mapped_labels, sample_weight=train_sample_weights)
precision = precision_score(true_encoded, mapped_labels, average='weighted', sample_weight=train_sample_weights)
recall = recall_score(true_encoded, mapped_labels, average='weighted', sample_weight=train_sample_weights)
f1 = f1_score(true_encoded, mapped_labels, average='weighted', sample_weight=train_sample_weights)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

              precision    recall  f1-score   support

           0       0.50      1.00      0.67 23205.832787552674
           1       1.00      0.00      0.01 23205.832788652417

    accuracy                           0.50 46411.665576205094
   macro avg       0.75      0.50      0.34 46411.665576205094
weighted avg       0.75      0.50      0.34 46411.665576205094

Accuracy: 0.5022
Precision: 0.7499
Recall: 0.5022
F1-score: 0.3382


In [None]:
cluster_labels = weighted_spectral.predict(test_affinity_matrix)
le = LabelEncoder()
true_encoded = le.fit_transform(y_test.astype(int))
cluster_encoded = le.transform(cluster_labels)

mapped_labels = find_best_mapping(true_encoded, cluster_encoded)
print(classification_report(true_encoded, mapped_labels, sample_weight=test_sample_weights))

accuracy = accuracy_score(true_encoded, mapped_labels, sample_weight=test_sample_weights)
precision = precision_score(true_encoded, mapped_labels, average='weighted', sample_weight=test_sample_weights)
recall = recall_score(true_encoded, mapped_labels, average='weighted', sample_weight=test_sample_weights)
f1 = f1_score(true_encoded, mapped_labels, average='weighted', sample_weight=test_sample_weights)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

              precision    recall  f1-score   support

           0       0.50      1.00      0.67 6590.699974284719
           1       1.00      0.00      0.00 6590.699974284754

    accuracy                           0.50 13181.399948569473
   macro avg       0.75      0.50      0.34 13181.399948569473
weighted avg       0.75      0.50      0.34 13181.399948569473

Accuracy: 0.5007
Precision: 0.7480
Recall: 0.5007
F1-score: 0.3350


In [None]:
# Obtenir les étiquettes de cluster pour chaque point de données
cluster_labels = weighted_spectral.predict(train_affinity_matrix)

# Analyser les clusters pour identifier les comportements frauduleux potentiels
def class_cluster_ratio(y_true, clust_preds, cluster):
    cluster_true_labels = y_train.astype(int)[cluster_labels == cluster]
    clus_size = len(cluster_true_labels)
    fraud_freq = np.sum(cluster_true_labels)
    fraud_ratio = np.sum(cluster_true_labels) / len(cluster_true_labels)
    return clus_size, fraud_freq, fraud_ratio

for cluster in range(n_clusters):
  res = class_cluster_ratio(y_train, clust_preds=cluster_labels, cluster=cluster)
  print(f"Cluster {cluster}: Cluster size = {res[0]} - Fraud_flag = {res[1]} - Fraud ratio = {res[2]:.2%}")

Cluster 0: Cluster size = 125 - Fraud_flag = 102 - Fraud ratio = 81.60%
Cluster 1: Cluster size = 3888343 - Fraud_flag = 23244 - Fraud ratio = 0.60%


### One Class SVM

In [None]:
from sklearn.preprocessing import LabelEncoder
def OneClass_SVM_validation(model, X, y, sample_weights, kf, scoring):
  # Liste pour stocker les scores de silhouette
  train_scores = []
  val_scores = []

  # Effectuer la validation croisée
  for train_index, val_index in kf.split(X,y):
      # Étape 1 : Découper selon les indices des folds
      x_train, x_val = X[train_index], X[val_index]
      y_train, y_val = y[train_index].astype(int), y[val_index].astype(int)

      # Étape 2 : Entraîner le modèle sur les données d'entraînement
      model.fit(x_train, y_train)

      # Étape 3 : Prédire les clusters pour les données de validation
      train_cluster_labels = model.predict(x_train)
      val_cluster_labels = model.predict(x_val)

      # Étape 4 : Encoder les étiquettes réelles et prédites
      le = LabelEncoder()
      train_true_encoded = le.fit_transform(y_train)
      val_true_encoded = le.transform(y_val)

      train_cluster_encoded = le.transform(train_cluster_labels)
      val_cluster_encoded = le.transform(val_cluster_labels)
      # Étape 5 : Trouver la meilleure correspondance entre clusters et classes réelles
      train_mapped_labels = find_best_mapping(train_true_encoded, train_cluster_encoded)
      val_mapped_labels = find_best_mapping(val_true_encoded, val_cluster_encoded)
      # Etape 6 : Calculer la métrique
      score = scoring(train_true_encoded, train_mapped_labels, average='weighted', sample_weight=train_weights)
      train_scores.append(score)
      score = scoring(val_true_encoded, val_mapped_labels, average='weighted', sample_weight=val_weights)
      val_scores.append(score)

  # Étape 7 : Calculer la moyenne et l'écart-type des scores de silhouette
  train_mean_score = np.mean(train_scores)
  train_std_score = np.std(train_scores)
  val_mean_score = np.mean(val_scores)
  val_std_score = np.std(val_scores)
  return train_mean_score, train_std_score, val_mean_score, val_std_score

In [None]:
# Application de HDBSCAN pondéré
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

weighted_spectral = SpectralClustering(n_clusters=n_clusters, affinity='precomputed', random_state=42)

# Créer l'objet KFold pour la validation croisée
kf = StratifiedKFold(n_splits=n_splits)

start = time()
train_mean_score, train_std_score, val_mean_score, val_std_score = validation(weighted_spectral, X_train_scaled, y_train, sample_weights=train_sample_weights, kf=kf, scoring=f1_score)
end = time()

print(f"F1-Score moyen (train): {train_mean_score:.3f} (+/- {train_std_score:.3f})")
print(f"F1-Score moyen (val): {val_mean_score:.3f} (+/- {val_std_score:.3f})")
print(f"Temps de calcul: {end-start}")

F1-Score moyen (train): 0.337 (+/- 0.002)
F1-Score moyen (val): 0.337 (+/- 0.002)


#### Métriques de classification

In [None]:
# Création et entraînement du modèle
n_cluster_ = 5
min_samples_ = 1
weighted_spectral = SpectralClustering(n_cluster=n_cluster_, min_samples=min_samples_)
weighted_spectral.fit(train_affinity_matrix)

In [None]:
cluster_labels = weighted_spectral.predict(train_affinity_matrix)
le = LabelEncoder()
true_encoded = le.fit_transform(y_train.astype(int))
cluster_encoded = le.transform(cluster_labels)

mapped_labels = find_best_mapping(true_encoded, cluster_encoded)
print(classification_report(true_encoded, mapped_labels, sample_weight=train_sample_weights))

accuracy = accuracy_score(true_encoded, mapped_labels, sample_weight=train_sample_weights)
precision = precision_score(true_encoded, mapped_labels, average='weighted', sample_weight=train_sample_weights)
recall = recall_score(true_encoded, mapped_labels, average='weighted', sample_weight=train_sample_weights)
f1 = f1_score(true_encoded, mapped_labels, average='weighted', sample_weight=train_sample_weights)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

              precision    recall  f1-score   support

           0       0.50      1.00      0.67 23205.832787552674
           1       1.00      0.00      0.01 23205.832788652417

    accuracy                           0.50 46411.665576205094
   macro avg       0.75      0.50      0.34 46411.665576205094
weighted avg       0.75      0.50      0.34 46411.665576205094

Accuracy: 0.5022
Precision: 0.7499
Recall: 0.5022
F1-score: 0.3382


In [None]:
cluster_labels = weighted_spectral.predict(test_affinity_matrix)
le = LabelEncoder()
true_encoded = le.fit_transform(y_test.astype(int))
cluster_encoded = le.transform(cluster_labels)

mapped_labels = find_best_mapping(true_encoded, cluster_encoded)
print(classification_report(true_encoded, mapped_labels, sample_weight=test_sample_weights))

accuracy = accuracy_score(true_encoded, mapped_labels, sample_weight=test_sample_weights)
precision = precision_score(true_encoded, mapped_labels, average='weighted', sample_weight=test_sample_weights)
recall = recall_score(true_encoded, mapped_labels, average='weighted', sample_weight=test_sample_weights)
f1 = f1_score(true_encoded, mapped_labels, average='weighted', sample_weight=test_sample_weights)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

              precision    recall  f1-score   support

           0       0.50      1.00      0.67 6590.699974284719
           1       1.00      0.00      0.00 6590.699974284754

    accuracy                           0.50 13181.399948569473
   macro avg       0.75      0.50      0.34 13181.399948569473
weighted avg       0.75      0.50      0.34 13181.399948569473

Accuracy: 0.5007
Precision: 0.7480
Recall: 0.5007
F1-score: 0.3350


In [None]:
from sklearn.svm import OneClassSVM

# Définition des paramètres du modèle
nu = pd.Series(y_train).value_counts(normalize=True).iloc[1]  # Fraction d'outliers attendue
gamma = 'scale'  # Paramètre du noyau RBF

# Création et entraînement du modèle One-Class SVM
oc_svm = OneClassSVM(kernel='rbf', nu=nu, gamma=gamma)
oc_svm.fit(X_train_scaled)

# Prédiction
y_pred = oc_svm.predict(X_train_scaled)

# Les valeurs -1 indiquent les anomalies (classe minoritaire)
anomalies = X_train_scaled[y_pred == -1]


### Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest

# Définition des paramètres du modèle
contamination = pd.Series(y_train).value_counts(normalize=True).iloc[1]  # Fraction d'outliers attendue
n_estimators = 100
max_samples = 'auto'

# Création et entraînement du modèle Isolation Forest
iso_forest = IsolationForest(
    contamination=contamination,
    n_estimators=n_estimators,
    max_samples=max_samples,
    random_state=42
)
iso_forest.fit(X_train_scaled)

# Prédiction
y_pred = iso_forest.predict(X_train_scaled)

# Les valeurs -1 indiquent les anomalies (classe minoritaire)
anomalies = X_train_scaled[y_pred == -1]
