In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import  StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, RocCurveDisplay, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load fraudTest.csv from its S3 URL; the first CSV column is used as the index.
FRAUD_DATA_URL = "https://lead-program-assets.s3.eu-west-3.amazonaws.com/M05-Projects/fraudTest.csv"
df = pd.read_csv(FRAUD_DATA_URL, index_col=0)

In [None]:
# Feature engineering on the raw transactions frame (adds four columns to df).

# Kilometres per degree of arc (60 nautical miles x 1.852 km).
KM_PER_DEGREE = 111.12
# Year against which ages are computed.
REFERENCE_YEAR = 2025

# Distance in km between the (lat, long) point and the merchant's
# (merch_lat, merch_long) point, using the equirectangular approximation:
# the longitude difference is scaled by cos(mean latitude), then combined with the
# latitude difference via Pythagoras.
# Assumes coordinates are in decimal degrees (hence np.radians) -- TODO confirm.
# Two fixes compared with the previous formula:
#   * `x**1/2` parses as `(x**1)/2`, so no square root was taken; np.sqrt is used now.
#   * the cosine correction was applied to the latitude difference using the mean
#     longitude; it belongs on the longitude difference using the mean latitude.
mean_lat_rad = np.radians((df['lat'] + df['merch_lat']) / 2)
delta_lat_deg = df['lat'] - df['merch_lat']
delta_long_deg = (df['long'] - df['merch_long']) * np.cos(mean_lat_rad)
df['distance'] = np.sqrt(delta_lat_deg**2 + delta_long_deg**2) * KM_PER_DEGREE

# Age as a difference of calendar years (the day and month of birth are ignored).
df['age'] = REFERENCE_YEAR - pd.to_datetime(df['dob']).dt.year

# Parse the transaction timestamp once and derive both calendar features from it.
trans_datetime = pd.to_datetime(df['trans_date_trans_time'])
df['trans_dayofweek'] = trans_datetime.dt.day_name()
df['trans_month'] = trans_datetime.dt.month_name()
# df['group_merchant'] = df['merchant'].str.removeprefix("fraud_").str.split().str[0].str.split("-").str[0].str.removesuffix(",")

In [None]:
# Overview of the raw frame: row count, first rows, summary statistics, missing-value rates.
raw_row_count = df.shape[0]
print(f"Number of rows : {raw_row_count}", end="\n\n")

print("Display of dataset: ")
display(df.head())
print()

# include='all' makes describe() cover non-numeric columns as well.
print("Basics statistics: ")
data_desc = df.describe(include='all')
display(data_desc)
print()

# Share of null entries per column, expressed as a percentage of the row count.
print("Percentage of missing values: ")
missing_counts = df.isnull().sum()
display(100 * missing_counts / raw_row_count)

In [None]:
# Columns removed before modelling. The timestamp, date of birth and coordinate columns
# have already been turned into derived features (distance, age, trans_dayofweek,
# trans_month); the remaining ones are presumably identifiers or free text -- verify.
columns_to_drop = [
    'trans_date_trans_time', 'unix_time',
    'first', 'last', 'street', 'city',
    'lat', 'long',
    'job', 'dob', 'merchant',
    'merch_lat', 'merch_long',
    'trans_num',
]
df_prep = df.drop(columns=columns_to_drop)

In [None]:
# Overview of the prepared frame: row count, first rows and summary statistics.
prep_row_count = df_prep.shape[0]
print(f"Number of rows : {prep_row_count}", end="\n\n")

print("Display of dataset: ")
display(df_prep.head())
print()

# include='all' makes describe() cover non-numeric columns as well.
print("Basics statistics: ")
data_desc = df_prep.describe(include='all')
display(data_desc)
print()

In [None]:
# Split the prepared frame into the target vector Y and the feature matrix X.
print("Separating labels from features...")

target_variable = 'is_fraud'
# Every column other than the target is used as a feature.
features_list = df_prep.columns.drop(target_variable).tolist()
Y = df_prep.loc[:, target_variable]
X = df_prep.loc[:, features_list]

print("...Done.", end="\n\n")

# Preview the first rows of each.
print('Y : ')
print(Y.head(), end="\n\n")
print('X :')
print(X.head())

In [None]:
# Y.value_counts(normalize=True)
# # Visualize class distribution
# Y.value_counts(normalize=True).plot(kind='pie', title='Class Distribution', legend=True)
# plt.show()

# Pie chart of the class balance of the target.
# value_counts() orders classes from most to least frequent, so the slices (and
# therefore the colours and the explode offsets) follow that order.
class_counts = Y.value_counts()
colors = ['#4ecdc4', '#ff6b6b']  # turquoise for the first slice, red for the second
explode = (0.05, 0)  # pull the first slice slightly away from the centre

fig, ax = plt.subplots(figsize=(4, 3))
ax.pie(
    class_counts.values,
    labels=['Fraude' if val else 'Non-fraude' for val in class_counts.index],
    autopct='%1.1f%%',  # show percentages with one decimal
    startangle=0,
    colors=colors,
    explode=explode,
)
ax.axis('equal')  # keep the pie circular
fig.tight_layout()
plt.show()

In [None]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class proportions
# the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)

In [None]:
# Route each column of X (expected to be a DataFrame) to the numeric or the categorical
# pipeline based on its dtype.
# Booleans are treated as numeric here; they could instead be sent to the categorical
# list depending on the desired treatment.
numeric_like_dtypes = ['int64', 'float64', 'int32', 'float32', 'bool']

numeric_features = []
categorical_features = []
for col_name, col_dtype in X.dtypes.items():
    is_numeric_like = any(col_dtype == dtype_name for dtype_name in numeric_like_dtypes)
    # Anything else ('object' and other non-numeric dtypes) is categorical.
    target_list = numeric_features if is_numeric_like else categorical_features
    target_list.append(col_name)

print('Found numeric features:', numeric_features)
print('Found categorical features:', categorical_features)

In [None]:
# Preprocessing: build the column-wise transformers, fit them on the train set only,
# then apply the fitted transformers to the test set.
# NOTE: X_train / X_test are overwritten with their transformed versions, so this cell
# is meant to be run once after the train/test split.

# Numeric features: fill missing values with KNNImputer, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('scaler', StandardScaler())
])

# Categorical features: fill missing values with the most frequent value, then one-hot encode.
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # drop='first' removes one column per feature to avoid perfectly correlated dummies.
    # handle_unknown='ignore' prevents transform() from raising when the test set (or
    # later data) contains a category that was not present in the train set; such a
    # value is encoded as all zeros for that feature.
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ])

# ColumnTransformer applies each pipeline to its own list of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit on the train set and transform it.
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5])
print()

# Transform the test set with the statistics learnt on the train set (no refit).
print("Performing preprocessings on test set...")
print(X_test.head())
X_test = preprocessor.transform(X_test)
print('...Done.')
print(X_test[0:5,:])

In [None]:
# Logistic regression with a class weight of 9 on label 1 versus 1 on label 0.
weights = {1: 9, 0: 1}
logreg_settings = dict(class_weight=weights, C=15, max_iter=100)
classifier = LogisticRegression(**logreg_settings)
# Unweighted alternative kept for reference: LogisticRegression(max_iter=500)
classifier.fit(X_train, Y_train)

In [None]:
# For LogisticRegression, .score() is the accuracy on the given data.
train_accuracy = classifier.score(X_train, Y_train)
test_accuracy = classifier.score(X_test, Y_test)
print(f'Train accuracy : {train_accuracy}\nTest accuracy : {test_accuracy}')

In [None]:
# 10-fold cross-validation of the logistic regression on the (preprocessed) train set,
# using the estimator's default score.
scores = cross_val_score(estimator=classifier, X=X_train, y=Y_train, cv=10)
avg, std = scores.mean(), scores.std()
print(f'Cross-validated accuracy : {avg}\nstandard deviation : {std}')

In [None]:
# Predicted labels and estimated class probabilities from the logistic regression,
# first on the training set and then on the test set; the first five of each are printed.
logreg_labels = {}
logreg_probas = {}
for split_label, split_features in (("training", X_train), ("test", X_test)):
    print(f"Predictions on {split_label} set...")
    logreg_labels[split_label] = classifier.predict(split_features)
    print("...Done.")
    print(logreg_labels[split_label][:5], end="\n\n")

    # predict_proba returns the probabilities estimated by the model.
    print(f"Probabilities on {split_label} set...")
    logreg_probas[split_label] = classifier.predict_proba(split_features)
    print("...Done.")
    print(logreg_probas[split_label][:5], end="\n\n")

# Expose the results under the names used by the rest of the notebook.
Y_train_pred, Y_train_proba = logreg_labels["training"], logreg_probas["training"]
Y_test_pred, Y_test_proba = logreg_labels["test"], logreg_probas["test"]

In [None]:
# Row-normalised confusion matrices of the logistic regression, one figure per split.
logreg_eval_sets = (
    ("Train", X_train, Y_train),
    ("Test", X_test, Y_test),
)
for split_title, split_features, split_target in logreg_eval_sets:
    fig, ax = plt.subplots()
    # The title is set on the axes that ConfusionMatrixDisplay then draws into.
    ax.set(title=f"Confusion Matrix on {split_title} set")
    ConfusionMatrixDisplay.from_estimator(
        classifier, split_features, split_target, ax=ax, normalize='true'
    )
    plt.show()

In [None]:
# The dataset is imbalanced: few fraud cases are present in the data.
# To improve performance on the fraud class, class weights are used and the
# hyperparameters are searched with a grid search.
# Candidates are ranked on the F1 score of the positive class (label 1) rather than on
# the default accuracy, which is uninformative on imbalanced data.

print("Grid search...")
weights = {1:9,0:1}
# NOTE: this rebinds `classifier` to a new, unfitted estimator; GridSearchCV fits clones of it.
classifier = LogisticRegression(class_weight=weights)

# Grid of values to be tested
params = {
    'C': [1, 5, 10, 15, 20],
    'max_iter' : [100, 500, 1000]
}
# cv: number of folds used for cross-validation
gridsearch = GridSearchCV(classifier, param_grid = params, scoring='f1', cv = 3, verbose=2)
gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best validation F1 score : ", gridsearch.best_score_)

In [None]:
# Row-normalised confusion matrices of the grid-searched model, one figure per split.
gridsearch_eval_sets = (
    ("Train", X_train, Y_train),
    ("Test", X_test, Y_test),
)
for split_title, split_features, split_target in gridsearch_eval_sets:
    fig, ax = plt.subplots()
    # The title is set on the axes that ConfusionMatrixDisplay then draws into.
    ax.set(title=f"Confusion Matrix on {split_title} set")
    ConfusionMatrixDisplay.from_estimator(
        gridsearch, split_features, split_target, ax=ax, normalize='true'
    )
    plt.show()

In [None]:
# Random forest with 10 trees, fitted on the preprocessed train set.
# random_state is fixed (same seed as the train/test split) so that the fitted forest,
# and every metric computed from it, is identical from one run to the next.
forest_classifier = RandomForestClassifier(n_estimators=10, min_samples_split=2, random_state=42)
forest_classifier.fit(X_train,Y_train)

In [None]:
# 10-fold cross-validation of the random forest on the (preprocessed) train set,
# using the estimator's default score.
scores = cross_val_score(estimator=forest_classifier, X=X_train, y=Y_train, cv=10)
avg, std = scores.mean(), scores.std()
print(f'Cross-validated accuracy : {avg}\nstandard deviation : {std}')

In [None]:
# Predicted labels and estimated class probabilities from the random forest,
# first on the training set and then on the test set; the first five of each are printed.
forest_labels = {}
forest_probas = {}
for split_label, split_features in (("training", X_train), ("test", X_test)):
    print(f"Predictions on {split_label} set...")
    forest_labels[split_label] = forest_classifier.predict(split_features)
    print("...Done.")
    print(forest_labels[split_label][:5], end="\n\n")

    # predict_proba returns the probabilities estimated by the model.
    print(f"Probabilities on {split_label} set...")
    forest_probas[split_label] = forest_classifier.predict_proba(split_features)
    print("...Done.")
    print(forest_probas[split_label][:5], end="\n\n")

# Expose the results under the names used by the rest of the notebook.
Y_train_pred, Y_train_proba = forest_labels["training"], forest_probas["training"]
Y_test_pred, Y_test_proba = forest_labels["test"], forest_probas["test"]

In [None]:
# Row-normalised confusion matrices of the random forest, one figure per split.
forest_eval_sets = (
    ("Train", X_train, Y_train),
    ("Test", X_test, Y_test),
)
for split_title, split_features, split_target in forest_eval_sets:
    fig, ax = plt.subplots()
    # The title is set on the axes that ConfusionMatrixDisplay then draws into.
    ax.set(title=f"Confusion Matrix on {split_title} set")
    ConfusionMatrixDisplay.from_estimator(
        forest_classifier, split_features, split_target, ax=ax, normalize='true'
    )
    plt.show()