In [1]:
import pandas as pd
from sklearn.utils import resample

# Build a class-balanced copy of the fraud dataset by randomly undersampling
# the larger class down to the size of the smaller one, then shuffle the rows
# and write the result to CSV.
#
# NOTE(review): the balanced file no longer reflects the real class ratio.
# If a model is evaluated on a split of this file, its metrics will describe a
# 50/50 world -- consider splitting train/test before undersampling. Confirm
# against how the output file is consumed downstream.

# --- Configuration ---------------------------------------------------------
# Paths are relative to the notebook's working directory.
INPUT_CSV = "dataset_fraud.csv"
OUTPUT_CSV = "balanced_fraud_dataset.csv"
SEED = 42  # shared by the undersampling step and the final shuffle

# Load dataset
df = pd.read_csv(INPUT_CSV)

# Target column
target_col = "isFraud"

# Separate classes. Rows whose label is neither 0 nor 1 (including NaN) match
# neither filter and are therefore left out of the balanced output.
fraud = df[df[target_col] == 1]
non_fraud = df[df[target_col] == 0]

print("Original distribution:")
print(df[target_col].value_counts())

# Size of each class in the balanced output: the smaller of the two classes.
# Sampling without replacement cannot draw more rows than a class contains,
# so using the minimum keeps the cell working whichever class is the majority.
n_per_class = min(len(fraud), len(non_fraud))
if n_per_class == 0:
    raise ValueError(
        f"Cannot balance {INPUT_CSV!r}: column {target_col!r} has "
        f"{len(fraud)} rows equal to 1 and {len(non_fraud)} rows equal to 0; "
        "both classes must be non-empty."
    )

# Undersample the non-fraud class. When it is already the smaller class this
# keeps every row (in permuted order), exactly as resample does for n == len.
non_fraud_downsampled = resample(
    non_fraud,
    replace=False,
    n_samples=n_per_class,
    random_state=SEED,
)

# Undersample the fraud class only if it turned out to be the larger one;
# otherwise keep it untouched so the usual case behaves exactly as before.
if len(fraud) > n_per_class:
    fraud_balanced = resample(
        fraud,
        replace=False,
        n_samples=n_per_class,
        random_state=SEED,
    )
else:
    fraud_balanced = fraud

# Combine, then shuffle so the two classes are interleaved; reset_index drops
# the original row labels so the saved frame has a clean 0..n-1 index.
balanced_df = (
    pd.concat([fraud_balanced, non_fraud_downsampled])
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

# Sanity check: both classes contribute exactly n_per_class rows.
assert len(balanced_df) == 2 * n_per_class, "balanced frame has unexpected size"

print("\nBalanced distribution:")
print(balanced_df[target_col].value_counts())

# Save CSV (index=False: the reset index carries no information)
balanced_df.to_csv(OUTPUT_CSV, index=False)

print(f"\nBalanced dataset saved as {OUTPUT_CSV}")

Original distribution:
isFraud
0    6354407
1       8213
Name: count, dtype: int64

Balanced distribution:
isFraud
0    8213
1    8213
Name: count, dtype: int64

Balanced dataset saved as balanced_fraud_dataset.csv
