In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from imblearn.pipeline import Pipeline
import pickle

In [2]:
#Chargement des données
data = pd.read_csv("creditcard.csv")

#Normalisation des colonnes
data["Amount"] = StandardScaler().fit_transform(data[["Amount"]])
data["Time"] = StandardScaler().fit_transform(data[["Time"]])

In [3]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

In [4]:
cv = StratifiedKFold(n_splits=10)

# Algorithmes de ré-échantillonage

In [5]:
from sklearn.tree import DecisionTreeClassifier

decision_tree_clf = DecisionTreeClassifier(random_state = 0)

## Sur-échantillonage

### RandomOverSampler

In [6]:
from imblearn.over_sampling import RandomOverSampler
pipeline_ros = Pipeline(steps=[('over', RandomOverSampler(random_state=0)), ('model', decision_tree_clf)])

In [7]:
decision_tree_clf_ros_results = cross_validate(pipeline_ros, X, y, cv=cv, n_jobs = -1, scoring = "f1", return_train_score=True, return_estimator=True)

In [8]:
print("Mean F1-score : {:.3f}".format(np.mean(decision_tree_clf_ros_results['test_score'])))

Mean F1-score : 0.567


In [12]:
#save best model
best_model = decision_tree_clf_ros_results['estimator'][np.argmax(decision_tree_clf_ros_results['test_score'])]
filename = './modeles/decision_tree_clf_ros_results.sav'
pickle.dump(best_model, open(filename, 'wb'))

### SMOTE

In [14]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

param_grid = {
    'smote__sampling_strategy': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
}

pipeline_smote = Pipeline(steps=[('smote', SMOTE(random_state=0)), ('model', decision_tree_clf)])


In [103]:

# Define the GridSearchCV object
grid_search = GridSearchCV(
    pipeline_smote,
    param_grid=param_grid,
    cv=cv,
    scoring="f1_micro",
    n_jobs=-1,
    error_score='raise'  # raise an error if any fit fails
)

# Fit the GridSearchCV object on the data
grid_search.fit(X, y)

# Print the best mean score and the corresponding parameters
print("Best mean F1-score : {:.3f}".format(grid_search.best_score_))
print("Best parameters : ", grid_search.best_params_)

Best mean F1-score : 0.995
Best parameters :  {'smote__sampling_strategy': 1}


In [15]:
pipeline_smote_results = cross_validate(pipeline_smote, X, y, cv=cv, n_jobs = -1, scoring = "f1_micro", return_train_score=True, return_estimator=True)

In [16]:
print("Mean F1-score : {:.3f}".format(np.mean(pipeline_smote_results['test_score'])))

Mean F1-score : 0.995


In [18]:
#save best model
best_model = pipeline_smote_results['estimator'][np.argmax(pipeline_smote_results['test_score'])]
filename = './modeles/decision_tree_clf_smote_results.sav'
pickle.dump(best_model, open(filename, 'wb'))

### Bordeline SMOTE

In [19]:
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.model_selection import GridSearchCV

param_grid = {
    'smote__sampling_strategy': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
}

pipeline_bordeline_smote = Pipeline(steps=[('smote', BorderlineSMOTE(random_state=0)), ('model', decision_tree_clf)])


In [107]:

# Define the GridSearchCV object
grid_search = GridSearchCV(
    pipeline_bordeline_smote,
    param_grid=param_grid,
    cv=cv,
    scoring="f1_micro",
    n_jobs=-1,
    error_score='raise'  # raise an error if any fit fails
)

# Fit the GridSearchCV object on the data
grid_search.fit(X, y)

# Print the best mean score and the corresponding parameters
print("Best mean F1-score : {:.3f}".format(grid_search.best_score_))
print("Best parameters : ", grid_search.best_params_)

Best mean F1-score : 0.916
Best parameters :  {'smote__sampling_strategy': 0.5}


In [20]:
pipeline_bordeline_smote_results = cross_validate(pipeline_bordeline_smote, X, y, cv=cv, n_jobs = -1, scoring = "f1_micro", return_train_score=True, return_estimator=True)

In [21]:
print("Mean F1-score : {:.3f}".format(np.mean(pipeline_bordeline_smote_results['test_score'])))

Mean F1-score : 0.909


In [22]:
#save best model
best_model = pipeline_bordeline_smote_results['estimator'][np.argmax(pipeline_bordeline_smote_results['test_score'])]
filename = './modeles/decision_tree_clf_bordeline_smote_results.sav'
pickle.dump(best_model, open(filename, 'wb'))

### K-Means SMOTE

In [25]:
from imblearn.over_sampling import KMeansSMOTE
from sklearn.model_selection import GridSearchCV

param_grid = {
    'smote__sampling_strategy': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
}

pipeline_kmeans_smote = Pipeline(steps=[('smote', KMeansSMOTE(random_state=0, cluster_balance_threshold=0.001)), ('model', decision_tree_clf)])

In [9]:


# Define the GridSearchCV object
grid_search = GridSearchCV(
    pipeline_kmeans_smote,
    param_grid=param_grid,
    cv=cv,
    scoring="f1_micro",
    n_jobs=-1,
    error_score='raise'  # raise an error if any fit fails
)

# Fit the GridSearchCV object on the data
grid_search.fit(X, y)

# Print the best mean score and the corresponding parameters
print("Best mean F1-score : {:.3f}".format(grid_search.best_score_))
print("Best parameters : ", grid_search.best_params_)


Best mean F1-score : 0.901
Best parameters :  {'smote__sampling_strategy': 0.6}


In [26]:
pipeline_kmeans_smote_results = cross_validate(pipeline_kmeans_smote, X, y, cv=cv, n_jobs = -1, scoring = "f1_micro", return_train_score=True, return_estimator=True)

In [27]:
print("Mean F1-score : {:.3f}".format(np.mean(pipeline_kmeans_smote_results['test_score'])))

Mean F1-score : 0.896


In [28]:
#save best model
best_model = pipeline_kmeans_smote_results['estimator'][np.argmax(pipeline_kmeans_smote_results['test_score'])]
filename = './modeles/decision_tree_clf_kmeans_smote_results.sav'
pickle.dump(best_model, open(filename, 'wb'))

## Sous-échantillonage

In [30]:
from imblearn.under_sampling import RandomUnderSampler
pipeline_rus = Pipeline(steps=[('under', RandomUnderSampler(random_state=0)), ('model', decision_tree_clf)])

In [31]:
decision_tree_clf_rus_results = cross_validate(pipeline_rus, X, y, cv=cv, n_jobs = -1, scoring = "f1_micro", return_train_score=True, return_estimator=True)

In [32]:
print("Mean F1-score : {:.3f}".format(np.mean(decision_tree_clf_rus_results['test_score'])))

Mean F1-score : 0.888


In [33]:
#save best model
best_model = decision_tree_clf_rus_results['estimator'][np.argmax(decision_tree_clf_rus_results['test_score'])]
filename = './modeles/decision_tree_clf_rus_results.sav'
pickle.dump(best_model, open(filename, 'wb'))

### Edited Nearest Neighbour

In [7]:
from imblearn.under_sampling import EditedNearestNeighbours

pipeline_enn = Pipeline(steps=[('enn', EditedNearestNeighbours()), ('model', decision_tree_clf)])

In [8]:
decision_tree_clf_enn_results = cross_validate(pipeline_enn, X, y, cv=cv, n_jobs = -1, scoring = "f1_micro", return_train_score=True, return_estimator=True)

In [9]:
print("Mean F1-score : {:.3f}".format(np.mean(decision_tree_clf_enn_results['test_score'])))

Mean F1-score : 0.899


In [10]:
#save best model
best_model = decision_tree_clf_enn_results['estimator'][np.argmax(decision_tree_clf_enn_results['test_score'])]
filename = './modeles/decision_tree_clf_enn_results.sav'
pickle.dump(best_model, open(filename, 'wb'))

### Tomek Links

In [6]:
from imblearn.under_sampling import TomekLinks

pipeline_tomek = Pipeline(steps=[('tomek', TomekLinks()), ('model', decision_tree_clf)])

In [7]:
decision_tree_clf_tomek_results = cross_validate(pipeline_tomek, X, y, cv=cv, n_jobs = -1, scoring = "f1_micro", return_train_score=True, return_estimator=True)

In [8]:
print("Mean F1-score : {:.3f}".format(np.mean(decision_tree_clf_tomek_results['test_score'])))

Mean F1-score : 0.901


In [9]:
#save best model
best_model = decision_tree_clf_tomek_results['estimator'][np.argmax(decision_tree_clf_tomek_results['test_score'])]
filename = './modeles/decision_tree_clf_tomek_results.sav'
pickle.dump(best_model, open(filename, 'wb'))

## Combinaison

### RandomOverSampler + RandomUnderSampler

In [10]:
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline

# Define the pipeline
pipeline_ros_rus = Pipeline(steps=[
    ('over', RandomOverSampler(random_state=0)),
    ('under', RandomUnderSampler(random_state=0)),
    ('model', DecisionTreeClassifier(random_state=0))
])

In [94]:

# Define the parameter grid for the samplers
param_grid = {
    'over__sampling_strategy': [0.1, 0.2, 0.3, 0.4, 0.5],
    'under__sampling_strategy': [0.6, 0.7, 0.8, 0.9, 1]
}

# Define the GridSearchCV object
grid_search = GridSearchCV(
    pipeline_ros_rus,
    param_grid=param_grid,
    cv=cv,
    scoring="f1_micro",
    n_jobs=-1,
    error_score='raise'  # raise an error if any fit fails
)

# Fit the GridSearchCV object on the data
grid_search.fit(X, y)

# Print the best mean score and the corresponding parameters
print("Best mean F1-score : {:.3f}".format(grid_search.best_score_))
print("Best parameters : ", grid_search.best_params_)

Best mean F1-score : 0.998
Best parameters :  {'over__sampling_strategy': 0.5, 'under__sampling_strategy': 0.6}


In [11]:
pipeline_ros_rus_results = cross_validate(pipeline_ros_rus, X, y, cv=cv, n_jobs = -1, scoring = "f1_micro", return_train_score=True, return_estimator=True)

In [12]:
print("Mean F1-score : {:.3f}".format(np.mean(pipeline_ros_rus_results['test_score'])))

Mean F1-score : 0.990


In [13]:
#save best model
best_model = pipeline_ros_rus_results['estimator'][np.argmax(pipeline_ros_rus_results['test_score'])]
filename = './modeles/pipeline_ros_rus_results.sav'
pickle.dump(best_model, open(filename, 'wb'))

### Smote + RandomUnderSampler

In [14]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Define the pipeline
pipeline_smote_rus = Pipeline(steps=[
    ('smote', SMOTE(random_state=0)),
    ('under', RandomUnderSampler(random_state=0)),
    ('model', DecisionTreeClassifier(random_state=0))
])

In [10]:

# Define the parameter grid for the samplers
param_grid = {
    'smote__sampling_strategy': [0.1, 0.2, 0.3, 0.4, 0.5],
    'under__sampling_strategy': [0.6, 0.7, 0.8, 0.9, 1]
}

# Define the GridSearchCV object
grid_search = GridSearchCV(
    pipeline_smote_rus,
    param_grid=param_grid,
    cv=cv,
    scoring="f1_micro",
    n_jobs=-1,
    error_score='raise'  # raise an error if any fit fails
)

# Fit the GridSearchCV object on the data
grid_search.fit(X, y)

# Print the best mean score and the corresponding parameters
print("Best mean F1-score : {:.3f}".format(grid_search.best_score_))
print("Best parameters : ", grid_search.best_params_)

Best mean F1-score : 0.995
Best parameters :  {'smote__sampling_strategy': 0.5, 'under__sampling_strategy': 0.8}


In [15]:
pipeline_smote_rus_results = cross_validate(pipeline_smote_rus, X, y, cv=cv, n_jobs = -1, scoring = "f1_micro", return_train_score=True, return_estimator=True)

In [16]:
print("Mean F1-score : {:.3f}".format(np.mean(pipeline_smote_rus_results['test_score'])))

Mean F1-score : 0.995


In [17]:
#save best model
best_model = pipeline_smote_rus_results['estimator'][np.argmax(pipeline_smote_rus_results['test_score'])]
filename = './modeles/pipeline_smote_rus_results.sav'
pickle.dump(best_model, open(filename, 'wb'))

### SMOTEEEN

In [21]:
from imblearn.combine import SMOTEENN

pipeline_smoteenn = Pipeline(steps=[('smoteenn', SMOTEENN(random_state=0)), ('model', decision_tree_clf)])

decision_tree_clf_smoteenn_results = cross_validate(pipeline_smoteenn, X, y, cv=cv, scoring='f1_micro', n_jobs=-1, return_train_score=True, return_estimator=True)

print("Mean F1-score on the test set : {:.3f}".format(np.mean(decision_tree_clf_smoteenn_results['test_score'])))

Mean F1-score on the test set : 0.994


In [22]:
#save best model
best_model = decision_tree_clf_smoteenn_results['estimator'][np.argmax(decision_tree_clf_smoteenn_results['test_score'])]
filename = './modeles/decision_tree_clf_smoteenn_results.sav'
pickle.dump(best_model, open(filename, 'wb'))

### SMOTETomek

In [23]:
from imblearn.combine import SMOTETomek

pipeline_smotetomek = Pipeline(steps=[('smotetomek', SMOTETomek(random_state=0)), ('model', decision_tree_clf)])

decision_tree_clf_smotetomek_results = cross_validate(pipeline_smotetomek, X, y, cv=cv, scoring='f1_micro', n_jobs=-1, return_train_score=True, return_estimator=True)

print("Mean F1-score on the test set : {:.3f}".format(np.mean(decision_tree_clf_smotetomek_results['test_score'])))

Mean F1-score on the test set : 0.995


In [24]:
#save best model
best_model = decision_tree_clf_smotetomek_results['estimator'][np.argmax(decision_tree_clf_smotetomek_results['test_score'])]
filename = './modeles/decision_tree_clf_smotetomek_results.sav'
pickle.dump(best_model, open(filename, 'wb'))

# Algorithmes sensibles au coût

## Non linéaires

### SVC with class_weight = balanced

In [6]:
#SVC with class_weight = balanced
from sklearn.svm import SVC

svc_clf_balanced = SVC(class_weight='balanced', random_state=0)

In [7]:
svc_clf_balanced_results = cross_validate(svc_clf_balanced, X, y, cv=cv, scoring='f1_micro', n_jobs=-1, return_train_score=True, return_estimator=True)

print("Mean F1-score on the test set : {:.3f}".format(np.mean(svc_clf_balanced_results['test_score'])))

Mean F1-score on the test set : 0.995


In [8]:
#save best model
best_model = svc_clf_balanced_results['estimator'][np.argmax(svc_clf_balanced_results['test_score'])]
filename = './modeles/svc_clf_balanced_results.sav'
pickle.dump(best_model, open(filename, 'wb'))

### Decision Tree with class_weight = balanced

In [9]:
from sklearn.tree import DecisionTreeClassifier
decision_tree_balanced_clf = DecisionTreeClassifier(class_weight='balanced', random_state=0)

In [10]:
decision_tree_balanced_clf_results = cross_validate(decision_tree_balanced_clf, X, y, cv=cv, scoring='f1_micro', n_jobs=-1, return_train_score=True, return_estimator=True)

print("Mean F1-score on the test set : {:.3f}".format(np.mean(decision_tree_balanced_clf_results['test_score'])))

Mean F1-score on the test set : 0.990


In [12]:
#save best model
best_model = decision_tree_balanced_clf_results['estimator'][np.argmax(decision_tree_balanced_clf_results['test_score'])]
filename = './modeles/decision_tree_balanced_clf_results.sav'
pickle.dump(best_model, open(filename, 'wb'))

## Algorithmes linéaires

### Logistic Regression with class_weight = balanced

In [7]:
from sklearn.linear_model import LogisticRegression

logistic_regression_balanced_clf = LogisticRegression(class_weight='balanced', random_state=0)

In [8]:
logistic_regression_balanced_clf_results = cross_validate(logistic_regression_balanced_clf, X, y, cv=cv, scoring='f1_micro', n_jobs=-1, return_train_score=True, return_estimator=True)

print("Mean F1-score on the test set : {:.3f}".format(np.mean(logistic_regression_balanced_clf_results['test_score'])))

Mean F1-score on the test set : 0.973


In [9]:
#save best model
best_model = logistic_regression_balanced_clf_results['estimator'][np.argmax(logistic_regression_balanced_clf_results['test_score'])]
filename = './modeles/logistic_regression_balanced_clf_results.sav'
pickle.dump(best_model, open(filename, 'wb'))

## Algorithmes ensemblistes

### Bagged Decision Tree with class_weight = balanced

In [14]:
from sklearn.ensemble import BaggingClassifier

bagging_balanced_clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight='balanced', random_state=0), random_state=0)

In [15]:
bagging_balanced_clf_results = cross_validate(bagging_balanced_clf, X, y, cv=cv, scoring='f1_micro', n_jobs=-1, return_train_score=True, return_estimator=True)

print("Mean F1-score on the test set : {:.3f}".format(np.mean(bagging_balanced_clf_results['test_score'])))

Mean F1-score on the test set : 0.999


In [16]:
#save best model
best_model = bagging_balanced_clf_results['estimator'][np.argmax(bagging_balanced_clf_results['test_score'])]
filename = './modeles/bagging_balanced_clf_results.sav'
pickle.dump(best_model, open(filename, 'wb'))

### Random Forest with class_weight = balanced

In [17]:
from sklearn.ensemble import RandomForestClassifier

random_forest_baalanced__clf = RandomForestClassifier(class_weight='balanced', random_state=0)

In [18]:
random_forest_baalanced__clf_results = cross_validate(random_forest_baalanced__clf, X, y, cv=cv, scoring='f1_micro', n_jobs=-1, return_train_score=True, return_estimator=True)

print("Mean F1-score on the test set : {:.3f}".format(np.mean(random_forest_baalanced__clf_results['test_score'])))

Mean F1-score on the test set : 0.999


In [19]:
#save best model
best_model = random_forest_baalanced__clf_results['estimator'][np.argmax(random_forest_baalanced__clf_results['test_score'])]
filename = './modeles/random_forest_baalanced_clf_results.sav'
pickle.dump(best_model, open(filename, 'wb'))

### SGB with class_weight = balanced

In [20]:
from sklearn.ensemble import GradientBoostingClassifier

sgb_balanced_clf = GradientBoostingClassifier(random_state=0)

In [21]:
sgb_balanced_clf_results = cross_validate(sgb_balanced_clf, X, y, cv=cv, scoring='f1_micro', n_jobs=-1, return_train_score=True, return_estimator=True)

print("Mean F1-score on the test set : {:.3f}".format(np.mean(sgb_balanced_clf_results['test_score'])))

Mean F1-score on the test set : 0.901


In [22]:
#save best model
best_model = sgb_balanced_clf_results['estimator'][np.argmax(sgb_balanced_clf_results['test_score'])]
filename = './modeles/sgb_balanced_clf_results.sav'
pickle.dump(best_model, open(filename, 'wb'))

# Algorithmes à classe unique

## OneClassSVM

In [None]:
from sklearn.svm import OneClassSVM

one_class_svm_clf = OneClassSVM()

one_class_svm_clf_results = cross_validate(one_class_svm_clf, X, y, cv=cv, scoring='f1_micro', n_jobs=-1, return_train_score=True, return_estimator=True)

print("Mean F1-score on the test set : {:.3f}".format(np.mean(one_class_svm_clf_results['test_score'])))

In [None]:
#save best model
best_model = one_class_svm_clf_results['estimator'][np.argmax(one_class_svm_clf_results['test_score'])]
filename = './modeles/one_class_svm_clf_results.sav'
pickle.dump(best_model, open(filename, 'wb'))

## Isolation Forest

In [6]:
from sklearn.ensemble import IsolationForest

isolation_forest_clf = IsolationForest(random_state=0)

isolation_forest_clf_results = cross_validate(isolation_forest_clf, X, y, cv=cv, scoring='f1_micro', n_jobs=-1, return_train_score=True, return_estimator=True)

print("Mean F1-score on the test set : {:.3f}".format(np.mean(isolation_forest_clf_results['test_score'])))

Mean F1-score on the test set : 0.000


## Minimum Covariance Determinant

In [7]:
from sklearn.covariance import EllipticEnvelope

minimum_covariance_determinant_clf = EllipticEnvelope(random_state=0)

minimum_covariance_determinant_clf_results = cross_validate(minimum_covariance_determinant_clf, X, y, cv=cv, scoring='f1_micro', n_jobs=-1, return_train_score=True, return_estimator=True)

print("Mean F1-score on the test set : {:.3f}".format(np.mean(minimum_covariance_determinant_clf_results['test_score'])))

Mean F1-score on the test set : 0.000


## Local Outlier Factor

In [8]:
from sklearn.neighbors import LocalOutlierFactor

local_outlier_factor_clf = LocalOutlierFactor()

local_outlier_factor_clf_results = cross_validate(local_outlier_factor_clf, X, y, cv=cv, scoring='f1_micro', n_jobs=-1, return_train_score=True, return_estimator=True)

print("Mean F1-score on the test set : {:.3f}".format(np.mean(local_outlier_factor_clf_results['test_score'])))

Mean F1-score on the test set : nan


# Threshold tuning