In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Read the preprocessed train and test splits from their parquet files.
train_df, test_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "preprocessed_train_data.parquet",
        "preprocessed_test_data.parquet",
    )
)


In [9]:
# Report the (rows, columns) shape of each split.
for split_name, split_df in (("Train", train_df), ("Test", test_df)):
    print(f"{split_name} set shape: {split_df.shape}")

Train set shape: (3888468, 11)
Test set shape: (747411, 11)


In [10]:
# Visualize the class distribution of the training set.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x="FlagImpaye", data=train_df, palette="Set2", ax=ax)
ax.set_title("Class Distribution in Train Set")
ax.set_xlabel("Transaction Status (0 = Non-Fraud, 1 = Fraud)")
# countplot draws raw row counts, so the axis is labelled as a count; each
# class's share of the rows is written above its bar by the loop below.
ax.set_ylabel("Count")

# Annotate every bar with its percentage of all training rows.
total_train = len(train_df)
for bar in ax.patches:
    percentage = f"{100 * bar.get_height() / total_train:.2f}%"
    ax.annotate(
        percentage,
        (bar.get_x() + bar.get_width() / 2.0, bar.get_height()),  # top-centre of the bar
        ha="center",
        va="center",
        fontsize=12,
        color="black",
        xytext=(0, 10),  # lift the label 10 points above the bar
        textcoords="offset points",
    )
plt.show()

🔹 Distribution des classes dans Train:
FlagImpaye
0    0.993996
1    0.006004
Name: proportion, dtype: float64

🔹 Distribution des classes dans Test:
FlagImpaye
0    0.991206
1    0.008794
Name: proportion, dtype: float64


In [None]:
# Visualize the class distribution of the test set.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x="FlagImpaye", data=test_df, palette="Set3", ax=ax)
ax.set_title("Class Distribution in Test Set")
ax.set_xlabel("Transaction Status (0 = Non-Fraud, 1 = Fraud)")
# countplot draws raw row counts, so the axis is labelled as a count; each
# class's share of the rows is written above its bar by the loop below.
ax.set_ylabel("Count")

# Annotate every bar with its percentage of all test rows.
total_test = len(test_df)
for bar in ax.patches:
    percentage = f"{100 * bar.get_height() / total_test:.2f}%"
    ax.annotate(
        percentage,
        (bar.get_x() + bar.get_width() / 2.0, bar.get_height()),  # top-centre of the bar
        ha="center",
        va="center",
        fontsize=12,
        color="black",
        xytext=(0, 10),  # lift the label 10 points above the bar
        textcoords="offset points",
    )
plt.show()


### Data Preparation

In [5]:
# Separate the training features from the FlagImpaye target column.
X_train, y_train = train_df.drop(columns="FlagImpaye"), train_df["FlagImpaye"]

In [6]:
# Separate the test features from the FlagImpaye target column.
X_test, y_test = test_df.drop(columns="FlagImpaye"), test_df["FlagImpaye"]

1 - Undersampling the dataset

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Resample the training data with RandomUnderSampler (library-default
# sampling strategy); the fixed seed makes the draw repeatable.
rus = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = rus.fit_resample(X_train, y_train)

# Print the header, then the normalized class shares of the resampled target.
print(
    "🔹 Class Distribution after Undersampling:",
    pd.Series(y_train_under).value_counts(normalize=True),
    sep="\n",
)

2 - Oversampling with SMOTE

In [None]:
from imblearn.over_sampling import SMOTE, ADASYN

# Resample the training data with SMOTE (library-default settings, fixed seed).
smote = SMOTE(random_state=42)
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)

# Print the header, then the normalized class shares of the resampled target.
print(
    "🔹 Class Distribution after SMOTE Oversampling:",
    pd.Series(y_train_over).value_counts(normalize=True),
    sep="\n",
)


3 - Hybrid Sampling (SMOTE + Tomek Links)

In [None]:
from imblearn.combine import SMOTETomek

# Resample the training data with the combined SMOTETomek sampler
# (library-default settings, fixed seed).
smote_tomek = SMOTETomek(random_state=42)
X_train_hybrid, y_train_hybrid = smote_tomek.fit_resample(X_train, y_train)

# Print the header, then the normalized class shares of the resampled target.
print(
    "🔹 Class Distribution after SMOTE + Tomek:",
    pd.Series(y_train_hybrid).value_counts(normalize=True),
    sep="\n",
)

4 - Oversampling with ADASYN

In [None]:
# Resample the training data with ADASYN (library-default settings, fixed seed).
# ADASYN itself is imported alongside SMOTE in the SMOTE oversampling cell.
adasyn = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)

# Print the header, then the normalized class shares of the resampled target.
print(
    "🔹 Class Distribution after ADASYN Oversampling:",
    pd.Series(y_train_adasyn).value_counts(normalize=True),
    sep="\n",
)


### Model Training and Evaluation

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Training-set variants to compare, keyed by the label printed during evaluation.
# Each value is a (features, target) pair; insertion order is the run order.
datasets = dict(
    [
        ("Original", (X_train, y_train)),
        ("Undersampled", (X_train_under, y_train_under)),
        ("Oversampled (SMOTE)", (X_train_over, y_train_over)),
        ("Hybrid (SMOTE + Tomek)", (X_train_hybrid, y_train_hybrid)),
        ("Oversampled (ADASYN)", (X_train_adasyn, y_train_adasyn)),
        # Same data as "Original": the rebalancing is delegated to the model.
        ("Class Weight Rebalancing", (X_train, y_train)),
    ]
)

In [None]:
# Test set
# Features and FlagImpaye target of the test set used to score every model.
X_test_final, y_test_final = test_df.drop(columns="FlagImpaye"), test_df["FlagImpaye"]

In [None]:
# Loop to train and evaluate on each dataset
for name, (X_train_exp, y_train_exp) in datasets.items():
    print(f"\n🔹 Testing Random Forest Model with {name}")

    # Train the model
    if name == "Class Weight Rebalancing":
        model = RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42)
    else:
        model = RandomForestClassifier(n_estimators=100, random_state=42)

    model.fit(X_train_exp, y_train_exp)

    # Predictions
    y_pred = model.predict(X_test_final)

    # Evaluation
    print(classification_report(y_test_final, y_pred))


### Hyperparameter Tuning with GridSearchCV

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# SMOTE is a pipeline step rather than a preprocessing pass: cross-validating
# on data that was oversampled up front lets synthetic points derived from
# validation-fold rows leak into the training folds and inflates the F1 score
# used for model selection. Inside the pipeline, resampling is fitted on the
# training folds only and validation folds keep the original class balance.
tuning_pipeline = Pipeline(
    steps=[
        ("smote", SMOTE(random_state=42)),
        ("clf", RandomForestClassifier(class_weight="balanced", random_state=42)),
    ]
)

# Hyperparameters to test; the "clf__" prefix routes each one to the
# Random Forest step of the pipeline.
param_grid = {
    "clf__n_estimators": [50, 100, 200],
    "clf__max_depth": [None, 10, 20],
    "clf__min_samples_split": [2, 5, 10],
}

# Grid search over the whole pipeline (3-fold CV, F1 on the positive class).
grid_search = GridSearchCV(
    tuning_pipeline,
    param_grid,
    scoring="f1",
    cv=3,
    verbose=2,
    n_jobs=-1,
)

# Fit on the original (not resampled) training data; SMOTE runs per fold.
grid_search.fit(X_train, y_train)

# Strip the step prefix so the reported keys are plain RandomForestClassifier
# argument names.
best_rf_params = {
    key.split("__", 1)[1]: value for key, value in grid_search.best_params_.items()
}
print("🔹 Best Hyperparameters:", best_rf_params)