In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_validate

In [4]:
# Load the credit-card dataset from the current working directory.
data = pd.read_csv("creditcard.csv")

# Standardize the "Amount" and "Time" columns, each with its own freshly
# fitted StandardScaler.
# NOTE(review): the scalers are fit on the full dataset, before the
# cross-validation splits are made, so held-out folds contribute to the
# scaling statistics. Consider moving the scaling into a Pipeline.
for column in ("Amount", "Time"):
    data[column] = StandardScaler().fit_transform(data[[column]])

In [11]:
# Preview the first rows whose "Class" label equals 1.
data.loc[data["Class"] == 1].head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
541,-1.988034,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.17784,0.261145,-0.143276,-0.353229,1
623,-1.986644,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,1.761758,1
4920,-1.902623,-2.30335,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.56232,-0.399147,-0.238253,...,-0.294166,-0.932391,0.172726,-0.08733,-0.156114,-0.542628,0.039566,-0.153029,0.606031,1
6108,-1.849472,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,...,0.573574,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573,-0.117342,1
6329,-1.838248,1.234235,3.01974,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,...,-0.379068,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793,-0.349231,1


In [5]:
# Features are every column except the last one; the target is the last column.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [6]:
# 10-fold stratified splitter passed to the cross_validate calls in this notebook.
# shuffle=False is the library default, spelled out to make the ordering explicit.
cv = StratifiedKFold(n_splits=10, shuffle=False)

## Algorithme Naïf

In [8]:
from sklearn.dummy import DummyClassifier

# Baseline that always predicts the constant label 1.
dummy_clf = DummyClassifier(strategy="constant", constant=1, random_state=0)
# Cross-validate with F1 scoring, keeping the train scores and the
# estimator fitted on each fold.
dummy_clf_results = cross_validate(
    dummy_clf,
    X,
    y,
    cv=cv,
    scoring="f1",
    n_jobs=-1,
    return_train_score=True,
    return_estimator=True,
)

In [9]:
# Report the test F1-score averaged over the folds (two decimals).
print("Mean F1-score : {:.2f}".format(dummy_clf_results["test_score"].mean()))

Mean F1-score : 0.00


In [10]:
# Save the fold estimator with the highest test F1-score to disk.
import pickle

# One fitted estimator is kept per fold; argmax over the per-fold test
# scores gives the index of the best-scoring one.
dummy_clf_best = dummy_clf_results['estimator'][np.argmax(dummy_clf_results['test_score'])]
filename = './modeles/dummy_clf_best.sav'
# Context manager: the handle is flushed and closed as soon as the dump
# finishes, instead of being left open by a bare open() call.
# open() does not create the target directory; it must already exist.
with open(filename, 'wb') as model_file:
    pickle.dump(dummy_clf_best, model_file)

## Algorithmes linéaires

### Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default hyper-parameters and a fixed seed.
log_reg_clf = LogisticRegression(random_state=0)
# Cross-validate with F1 scoring, keeping the train scores and the
# estimator fitted on each fold.
log_reg_clf_results = cross_validate(
    log_reg_clf,
    X,
    y,
    cv=cv,
    scoring="f1",
    n_jobs=-1,
    return_train_score=True,
    return_estimator=True,
)

In [12]:
# Report the test F1-score averaged over the folds (three decimals).
print("Mean F1-score : {:.3f}".format(log_reg_clf_results["test_score"].mean()))

Mean F1-score : 0.680


In [46]:
# Save the fold estimator with the highest test F1-score to disk.
import pickle

# One fitted estimator is kept per fold; argmax over the per-fold test
# scores gives the index of the best-scoring one.
log_reg_clf_best = log_reg_clf_results['estimator'][np.argmax(log_reg_clf_results['test_score'])]
filename = './modeles/log_reg_clf_best.sav'
# Context manager: the handle is flushed and closed as soon as the dump
# finishes, instead of being left open by a bare open() call.
# open() does not create the target directory; it must already exist.
with open(filename, 'wb') as model_file:
    pickle.dump(log_reg_clf_best, model_file)

### Naive Bayes

In [23]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default hyper-parameters.
nb_clf = GaussianNB()
# Cross-validate with F1 scoring, keeping the train scores and the
# estimator fitted on each fold.
nb_clf_results = cross_validate(
    nb_clf,
    X,
    y,
    cv=cv,
    scoring="f1",
    n_jobs=-1,
    return_train_score=True,
    return_estimator=True,
)

In [13]:
# Report the test F1-score averaged over the folds (three decimals).
print("Mean F1-score : {:.3f}".format(nb_clf_results["test_score"].mean()))

Mean F1-score : 0.115


In [48]:
# Pick the fold estimator with the highest test F1-score and save it to disk.
# Relies on `pickle` having been imported by an earlier cell.
nb_clf_best = nb_clf_results['estimator'][np.argmax(nb_clf_results['test_score'])]
filename = './modeles/nb_clf_best.sav'
# Context manager: the handle is flushed and closed as soon as the dump
# finishes, instead of being left open by a bare open() call.
# open() does not create the target directory; it must already exist.
with open(filename, 'wb') as model_file:
    pickle.dump(nb_clf_best, model_file)

### Linear Discriminant Analysis

In [25]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Linear discriminant analysis with default hyper-parameters.
lda_clf = LinearDiscriminantAnalysis()
# Cross-validate with F1 scoring, keeping the train scores and the
# estimator fitted on each fold.
lda_clf_results = cross_validate(
    lda_clf,
    X,
    y,
    cv=cv,
    scoring="f1",
    n_jobs=-1,
    return_train_score=True,
    return_estimator=True,
)

In [26]:
# Report the test F1-score averaged over the folds (three decimals).
print("Mean F1-score : {:.3f}".format(lda_clf_results["test_score"].mean()))

Mean F1-score : 0.785


In [49]:
# Pick the fold estimator with the highest test F1-score and save it to disk.
# Relies on `pickle` having been imported by an earlier cell.
lda_clf_best = lda_clf_results['estimator'][np.argmax(lda_clf_results['test_score'])]
filename = './modeles/lda_clf_best.sav'
# Context manager: the handle is flushed and closed as soon as the dump
# finishes, instead of being left open by a bare open() call.
# open() does not create the target directory; it must already exist.
with open(filename, 'wb') as model_file:
    pickle.dump(lda_clf_best, model_file)

## Algorithmes Non-linéaires

### Decision Tree

In [28]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with default hyper-parameters and a fixed seed.
decision_tree_clf = DecisionTreeClassifier(random_state=0)
# Cross-validate with F1 scoring, keeping the train scores and the
# estimator fitted on each fold.
decision_tree_clf_results = cross_validate(
    decision_tree_clf,
    X,
    y,
    cv=cv,
    scoring="f1",
    n_jobs=-1,
    return_train_score=True,
    return_estimator=True,
)

In [29]:
# Report the test F1-score averaged over the folds (three decimals).
print("Mean F1-score : {:.3f}".format(decision_tree_clf_results["test_score"].mean()))

Mean F1-score : 0.590


In [50]:
# Pick the fold estimator with the highest test F1-score and save it to disk.
# Relies on `pickle` having been imported by an earlier cell.
decision_tree_clf_best = decision_tree_clf_results['estimator'][np.argmax(decision_tree_clf_results['test_score'])]
filename = './modeles/decision_tree_clf_best.sav'
# Context manager: the handle is flushed and closed as soon as the dump
# finishes, instead of being left open by a bare open() call.
# open() does not create the target directory; it must already exist.
with open(filename, 'wb') as model_file:
    pickle.dump(decision_tree_clf_best, model_file)

### SVM

In [31]:
from sklearn.svm import SVC

# Support vector classifier with default hyper-parameters and a fixed seed.
svc_clf = SVC(random_state=0)
# Cross-validate with F1 scoring, keeping the train scores and the
# estimator fitted on each fold.
svc_clf_results = cross_validate(
    svc_clf,
    X,
    y,
    cv=cv,
    scoring="f1",
    n_jobs=-1,
    return_train_score=True,
    return_estimator=True,
)

In [32]:
# Report the test F1-score averaged over the folds (three decimals).
print("Mean F1-score : {:.3f}".format(svc_clf_results["test_score"].mean()))

Mean F1-score : 0.634


In [51]:
# Pick the fold estimator with the highest test F1-score and save it to disk.
# Relies on `pickle` having been imported by an earlier cell.
svc_clf_best = svc_clf_results['estimator'][np.argmax(svc_clf_results['test_score'])]
filename = './modeles/svc_clf_best.sav'
# Context manager: the handle is flushed and closed as soon as the dump
# finishes, instead of being left open by a bare open() call.
# open() does not create the target directory; it must already exist.
with open(filename, 'wb') as model_file:
    pickle.dump(svc_clf_best, model_file)

### KNN

In [34]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with default hyper-parameters.
knn_clf = KNeighborsClassifier()
# Cross-validate with F1 scoring, keeping the train scores and the
# estimator fitted on each fold.
knn_clf_results = cross_validate(
    knn_clf,
    X,
    y,
    cv=cv,
    scoring="f1",
    n_jobs=-1,
    return_train_score=True,
    return_estimator=True,
)

In [35]:
# Report the test F1-score averaged over the folds (three decimals).
print("Mean F1-score : {:.3f}".format(knn_clf_results["test_score"].mean()))

Mean F1-score : 0.763


In [52]:
# Pick the fold estimator with the highest test F1-score and save it to disk.
# Relies on `pickle` having been imported by an earlier cell.
knn_clf_best = knn_clf_results['estimator'][np.argmax(knn_clf_results['test_score'])]
filename = './modeles/knn_clf_best.sav'
# Context manager: the handle is flushed and closed as soon as the dump
# finishes, instead of being left open by a bare open() call.
# open() does not create the target directory; it must already exist.
with open(filename, 'wb') as model_file:
    pickle.dump(knn_clf_best, model_file)

## Algorithmes Ensemblistes

### Bagged Trees

In [37]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Base learner for the ensemble: a seeded decision tree.
tree_clf = DecisionTreeClassifier(random_state=0)
# Bag 100 trees, each trained on a bootstrap sample of 80% of the rows
# and on all of the features.
bag_clf = BaggingClassifier(
    tree_clf,
    n_estimators=100,
    max_samples=0.8,
    max_features=1.0,
    bootstrap=True,
    random_state=0,
)
# Cross-validate with F1 scoring, keeping the train scores and the
# estimator fitted on each fold.
bag_clf_results = cross_validate(
    bag_clf,
    X,
    y,
    cv=cv,
    scoring="f1",
    n_jobs=-1,
    return_train_score=True,
    return_estimator=True,
)

In [38]:
# Report the test F1-score averaged over the folds (three decimals).
print("Mean F1-score : {:.3f}".format(bag_clf_results["test_score"].mean()))

Mean F1-score : 0.725


In [53]:
# Pick the fold estimator with the highest test F1-score and save it to disk.
# Relies on `pickle` having been imported by an earlier cell.
bag_clf_best = bag_clf_results['estimator'][np.argmax(bag_clf_results['test_score'])]
filename = './modeles/bag_clf_best.sav'
# Context manager: the handle is flushed and closed as soon as the dump
# finishes, instead of being left open by a bare open() call.
# open() does not create the target directory; it must already exist.
with open(filename, 'wb') as model_file:
    pickle.dump(bag_clf_best, model_file)

### Random Forest

In [40]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters and a fixed seed.
rf_clf = RandomForestClassifier(random_state=0)
# Cross-validate with F1 scoring, keeping the train scores and the
# estimator fitted on each fold.
rf_clf_results = cross_validate(
    rf_clf,
    X,
    y,
    cv=cv,
    scoring="f1",
    n_jobs=-1,
    return_train_score=True,
    return_estimator=True,
)

In [41]:
# Report the test F1-score averaged over the folds (three decimals).
print("Mean F1-score : {:.3f}".format(rf_clf_results["test_score"].mean()))

Mean F1-score : 0.731


In [54]:
# Pick the fold estimator with the highest test F1-score and save it to disk.
# Relies on `pickle` having been imported by an earlier cell.
rf_clf_best = rf_clf_results['estimator'][np.argmax(rf_clf_results['test_score'])]
filename = './modeles/rf_clf_best.sav'
# Context manager: the handle is flushed and closed as soon as the dump
# finishes, instead of being left open by a bare open() call.
# open() does not create the target directory; it must already exist.
with open(filename, 'wb') as model_file:
    pickle.dump(rf_clf_best, model_file)

### SGB

In [43]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting classifier with default hyper-parameters and a fixed seed.
sgb_clf = GradientBoostingClassifier(random_state=0)
# Cross-validate with F1 scoring, keeping the train scores and the
# estimator fitted on each fold.
sgb_clf_results = cross_validate(
    sgb_clf,
    X,
    y,
    cv=cv,
    scoring="f1",
    n_jobs=-1,
    return_train_score=True,
    return_estimator=True,
)

In [44]:
# Report the test F1-score averaged over the folds (three decimals).
print("Mean F1-score : {:.3f}".format(sgb_clf_results["test_score"].mean()))

Mean F1-score : 0.460


In [55]:
# Pick the fold estimator with the highest test F1-score and save it to disk.
# Relies on `pickle` having been imported by an earlier cell.
sgb_clf_best = sgb_clf_results['estimator'][np.argmax(sgb_clf_results['test_score'])]
filename = './modeles/sgb_clf_best.sav'
# Context manager: the handle is flushed and closed as soon as the dump
# finishes, instead of being left open by a bare open() call.
# open() does not create the target directory; it must already exist.
with open(filename, 'wb') as model_file:
    pickle.dump(sgb_clf_best, model_file)