**PREPROCESSING**


Drop the columns we don't need (card/transaction identifiers, customer names and street address).


In [None]:
# Columns removed from both frames before any feature engineering.
col_to_drop = ['cc_num', 'trans_num', 'first', 'last', 'street']

# Apply the same drop to train and test so both keep the same schema.
df_train, df_test = [frame.drop(columns=col_to_drop) for frame in (df_train, df_test)]

print(f" Dropped {len(col_to_drop)} columns")
for heading, frame in (
    ("test columns : \n \n", df_test),
    ("\n \n train columns : \n \n", df_train),
):
    print(heading, frame.columns)

Add or modify columns so that the model can work with the data (feature engineering).

**TIME DATA**

In [None]:
# Add / modify columns so that the model can understand the data.

# ----------------------------------------------------
#                           TIME
# ----------------------------------------------------

# The same calendar features are derived for train and test.
for frame in (df_train, df_test):
    # Parse the raw timestamp column into pandas datetimes.
    frame['trans_date_trans_time'] = pd.to_datetime(frame['trans_date_trans_time'])
    stamp = frame['trans_date_trans_time'].dt

    frame['hour'] = stamp.hour                 # 0 to 23
    frame['day_of_week'] = stamp.dayofweek     # 0 = Monday ... 6 = Sunday
    frame['day_of_month'] = stamp.day          # 1 to 31
    frame['month'] = stamp.month               # 1 to 12

    # Weekend flag: day_of_week 5 (Saturday) or 6 (Sunday) -> 1, otherwise 0.
    frame['is_weekend'] = (frame['day_of_week'] >= 5).astype(int)
# Function to categorize the time of day
def get_time_period(hour):
    """
    Map an hour of the day to a coarse time-period code.

    Codes (the risk labels are the notebook author's own annotation):
    0: Morning,   hours 5-11  (low fraud risk)
    1: Afternoon, hours 12-17 (low fraud risk)
    2: Evening,   hours 18-22 (moderate fraud risk)
    3: Night,     hour 23 and hours 0-4 (high fraud risk)

    Any value that matches none of the daytime ranges is reported as 3.
    """
    if hour >= 18:
        # 18-22 is evening; 23 already counts as night.
        return 2 if hour < 23 else 3
    if hour >= 12:
        return 1
    if hour >= 5:
        return 0
    return 3

# Bucket every transaction hour into a time-of-day code (see get_time_period).
df_train['time_period'] = df_train['hour'].apply(get_time_period)
df_test['time_period'] = df_test['hour'].apply(get_time_period)

# Five hours with the most fraudulent transactions (train only).
# This table used to be computed and printed twice under two different
# headings; it is now reported once.
print("\nFraud transactions by hour:")
fraud_by_hour = df_train[df_train['is_fraud'] == 1].groupby('hour').size()
print(fraud_by_hour.sort_values(ascending=False).head(5))

# Fraud rate (in %) inside each time-of-day bucket.
print("\n Fraud rate by time period:")
for period, name in [(0, 'Morning'), (1, 'Afternoon'), (2, 'Evening'), (3, 'Night')]:
    total = (df_train['time_period'] == period).sum()
    frauds = ((df_train['time_period'] == period) & (df_train['is_fraud'] == 1)).sum()
    # Guard against an empty bucket to avoid dividing by zero.
    rate = (frauds / total * 100) if total > 0 else 0
    print(f"   {name}: {frauds:,} frauds / {total:,} transactions = {rate:.3f}%")

**DISTANCE DATA**

In [None]:

geographic_columns = ['lat', 'long', 'merch_lat', 'merch_long']

# Client-to-merchant distance, computed as a straight line in degree space and
# then scaled by ~111 km per degree. This is an approximation: latitude and
# longitude offsets are weighted equally, whatever the latitude.

# --- TRAIN ---
diff_lat_train = df_train['lat'].sub(df_train['merch_lat'])
diff_long_train = df_train['long'].sub(df_train['merch_long'])
dist_degree_train = np.sqrt(diff_lat_train.pow(2) + diff_long_train.pow(2))
dist_km_train = 111 * dist_degree_train
df_train['distance_km'] = dist_km_train

# --- TEST (same formula) ---
diff_lat_test = df_test['lat'].sub(df_test['merch_lat'])
diff_long_test = df_test['long'].sub(df_test['merch_long'])
dist_degree_test = np.sqrt(diff_lat_test.pow(2) + diff_long_test.pow(2))
dist_km_test = 111 * dist_degree_test
df_test['distance_km'] = dist_km_test

# Overall statistics of the engineered distance feature (train only).
print("\n Distance statistics (TRAIN):")
train_distance = df_train['distance_km']
for stat_label, stat_value in (
    ('Mean', train_distance.mean()),
    ('Median', train_distance.median()),
    ('Std Dev', train_distance.std()),
    ('Min', train_distance.min()),
    ('Max', train_distance.max()),
):
    print(f"   {stat_label}: {stat_value:.2f} km")

# Spot-check of the second row. Positional access (.iloc) is used so this
# still means "second row" when the index is not the default 0..n-1 range;
# the previous label lookup `[1]` could raise KeyError or pick another row.
print(f"  ligne 2: {train_distance.iloc[1]}")
print(f" Merchant : {df_train['merchant'].iloc[1]}")

# Percentiles
print("\n Distance percentiles:")
percentiles = [25, 50, 75, 90, 95, 99]
for p in percentiles:
    value = train_distance.quantile(p / 100)
    print(f"   {p}th percentile: {value:.2f} km")

# Distance summary split by the target class (0 = legitimate, 1 = fraud).
print("\n Distance by class:")
for fraud_status, label in ((0, "Legitimate"), (1, "Fraud")):
    subset = df_train.loc[df_train['is_fraud'] == fraud_status, 'distance_km']
    print(f"\n   {label} transactions:")
    print(f"      Mean: {subset.mean():.2f} km")
    print(f"      Median: {subset.median():.2f} km")
    for pct in (75, 95):
        print(f"      {pct}th percentile: {subset.quantile(pct / 100):.2f} km")

# Header for the per-category breakdown printed further down.
print("\n Distance categories distribution:")
def categorize_distance(dist):
    """
    Return the distance-band label for a distance expressed in km.

    Bands are half-open on the upper side: a value equal to a threshold
    belongs to the next (farther) band. Anything that is not below 500,
    including values that compare False against every threshold, is
    labelled "Very Far (>500 km)".
    """
    bands = (
        (10, "Very Close (<10 km)"),
        (50, "Close (10-50 km)"),
        (200, "Regional (50-200 km)"),
        (500, "Far (200-500 km)"),
    )
    for upper_bound, label in bands:
        if dist < upper_bound:
            return label
    return "Very Far (>500 km)"

# Temporary, analysis-only column holding the distance band of each row.
df_train['distance_category'] = df_train['distance_km'].apply(categorize_distance)

# Fraud count and fraud rate (in %) inside each distance band.
for category in (
    "Very Close (<10 km)",
    "Close (10-50 km)",
    "Regional (50-200 km)",
    "Far (200-500 km)",
    "Very Far (>500 km)",
):
    in_category = df_train['distance_category'] == category
    total = in_category.sum()
    frauds = (in_category & (df_train['is_fraud'] == 1)).sum()
    # An empty band reports a rate of 0 instead of dividing by zero.
    fraud_rate = (frauds / total * 100) if total > 0 else 0
    print(f"   {category}: {frauds:,} frauds / {total:,} = {fraud_rate:.3f}%")

# ============================================================================
# 4. VISUALIZATIONS
# ============================================================================

banner = "=" * 80
print("\n" + banner)
print("4. CREATING VISUALIZATIONS")
print(banner)

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax_all, ax_by_class), (ax_box, ax_rate) = axes

AXIS_FONT = {'fontsize': 11, 'fontweight': 'bold'}
TITLE_FONT = {'fontsize': 12, 'fontweight': 'bold'}


def _decorate(ax, xlabel, ylabel, title):
    """Apply the bold axis labels and title used by every panel."""
    ax.set_xlabel(xlabel, **AXIS_FONT)
    ax.set_ylabel(ylabel, **AXIS_FONT)
    ax.set_title(title, **TITLE_FONT)


# The 99th percentile is used as the upper axis limit on the first three
# panels so that extreme outliers do not stretch the plots.
distance_p99 = df_train['distance_km'].quantile(0.99)

# Plot 1: distance distribution (histogram of all transactions)
ax_all.hist(df_train['distance_km'], bins=100, color='steelblue',
            edgecolor='black', alpha=0.7)
_decorate(ax_all, 'Distance (km)', 'Frequency', 'Distance Distribution (All Transactions)')
ax_all.set_xlim(0, distance_p99)
ax_all.grid(True, alpha=0.3)

# Plot 2: distance distribution by fraud status (overlaid histograms)
for fraud_status, label, color in [(0, 'Legitimate', 'green'), (1, 'Fraud', 'red')]:
    subset = df_train.loc[df_train['is_fraud'] == fraud_status, 'distance_km']
    ax_by_class.hist(subset, bins=100, alpha=0.6, label=label, color=color, edgecolor='black')

_decorate(ax_by_class, 'Distance (km)', 'Frequency', 'Distance Distribution by Class')
ax_by_class.legend(fontsize=11)
ax_by_class.set_xlim(0, distance_p99)
ax_by_class.grid(True, alpha=0.3)

# Plot 3: boxplot by fraud status
df_train.boxplot(column='distance_km', by='is_fraud', ax=ax_box)
_decorate(ax_box, 'Class', 'Distance (km)', 'Distance by Class (Boxplot)')
# Clear the figure-level title after the grouped boxplot call.
ax_box.get_figure().suptitle('')
plt.sca(ax_box)
plt.xticks([1, 2], ['Legitimate', 'Fraud'])
ax_box.set_ylim(0, distance_p99)

# Plot 4: fraud rate by distance category
distance_cats = ["Very Close (<10 km)", "Close (10-50 km)", "Regional (50-200 km)",
                 "Far (200-500 km)", "Very Far (>500 km)"]
fraud_rates = []
for cat in distance_cats:
    in_cat = df_train['distance_category'] == cat
    total = in_cat.sum()
    frauds = (in_cat & (df_train['is_fraud'] == 1)).sum()
    fraud_rates.append((frauds / total * 100) if total > 0 else 0)

bar_positions = range(len(distance_cats))
ax_rate.bar(bar_positions, fraud_rates, color='crimson',
            edgecolor='black', linewidth=1.5)
_decorate(ax_rate, 'Distance Category', 'Fraud Rate (%)', 'Fraud Rate by Distance Category')
ax_rate.set_xticks(bar_positions)
ax_rate.set_xticklabels(['<10km', '10-50km', '50-200km', '200-500km', '>500km'],
                        rotation=45, ha='right')
ax_rate.grid(True, alpha=0.3, axis='y')

# Write the rate on top of each bar
for i, rate in enumerate(fraud_rates):
    ax_rate.text(i, rate, f'{rate:.3f}%', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig('geographic_distance_analysis.png', dpi=300, bbox_inches='tight')
print("\n Visualization saved: geographic_distance_analysis.png")
plt.show()

# The distance band was only needed for the analysis above; remove it so it
# does not end up among the model features.
df_train = df_train.drop(columns='distance_category')

# ============================================================================
# 5. SUMMARY
# ============================================================================

print("\n What we created:")
print("   • distance_km: Euclidean distance between customer and merchant")

print(f"\n Key findings:")
for fraud_status, label in ((0, 'Legitimate'), (1, 'Fraud')):
    mean_km = df_train.loc[df_train['is_fraud'] == fraud_status, 'distance_km'].mean()
    print(f"   • Mean distance ({label}): {mean_km:.2f} km")

print(f"\n Current shapes:")
for split_name, frame in (('TRAIN', df_train), ('TEST', df_test)):
    print(f"   {split_name}: {frame.shape}")

**DEMOGRAPHIC DATA (AGE)**

In [None]:

# Build the customer's age (in years) at the time of each transaction.
for frame in (df_train, df_test):
    # Convert date of birth to datetime
    frame['dob_datetime'] = pd.to_datetime(frame['dob'])

    # Age = transaction timestamp - date of birth, converted from days to years.
    # 'trans_date_trans_time' is the column parsed to datetime in the TIME cell;
    # the previous code read a 'trans_datetime' column that this notebook
    # section never creates.
    elapsed = frame['trans_date_trans_time'] - frame['dob_datetime']
    frame['age'] = elapsed.dt.days / 365.25

    # Clip to a plausible range: anything outside [18, 100] is pulled to the bound.
    frame['age'] = frame['age'].clip(lower=18, upper=100)

# The raw string column is no longer needed once the age is computed.
df_train = df_train.drop(columns=['dob'])
df_test = df_test.drop(columns=['dob'])




def categorize_age(age):
    """
    Return the age-band label for an age in years.

    Bands:
        age <= 25        -> "Young (18-25)"
        25 < age <= 50   -> "Adults (25-50)"
        otherwise        -> "Old (>50)"

    Ages below 18 are placed in the youngest band, in line with the notebook
    clipping ages to a minimum of 18 before calling this function. Previously
    such values fell through every test and were labelled "Old (>50)".
    """
    if age <= 25:
        return "Young (18-25)"
    if age <= 50:
        return "Adults (25-50)"
    return "Old (>50)"
    
# Apply the age bands to BOTH frames.
# The encoding cell later target-encodes every object column left in df_train
# and maps the same column on df_test. When 'age_category' existed only in
# df_train, the state-alignment step filled it with 0 in df_test, so the test
# feature ended up as a constant (the global fraud rate).
df_train['age_category'] = df_train['age'].apply(categorize_age)
df_test['age_category'] = df_test['age'].apply(categorize_age)

print("Age statistics :")
print(df_train['age'].describe())

print("\nDistribution :")
print(df_train['age_category'].value_counts())

# Age summary per target class (0 = legitimate, 1 = fraud).
print("\nMean age for each category :")
print(df_train.groupby('is_fraud')['age'].agg(['mean', 'median', 'std']).round(2))

# Share of fraudulent transactions (in %) inside each age band.
print("\nFraud rate by category :")
fraud_rate = df_train.groupby('age_category')['is_fraud'].agg(['mean']) * 100
print(fraud_rate.round(2))

# Drop columns we don't need anymore: the raw timestamp has already been
# turned into hour / day / month features and the age.
columns_to_drop = ['trans_date_trans_time']

df_train = df_train.drop(columns=columns_to_drop)
df_test = df_test.drop(columns=columns_to_drop)

# Bar chart of the fraud rate (in %) per age band.
fraud_by_cat = df_train.groupby('age_category')['is_fraud'].mean() * 100
fraud_by_cat.plot(kind='bar', color=['#ff9999', '#66b3ff', '#99ff99'])
plt.title("Taux de fraude (%) par catégorie d'âge")
plt.ylabel("Taux de fraude (%)")
plt.xticks(rotation=45)
plt.show()


**OTHER DATA ENCODING**

In [None]:


# List the object-dtype (string) columns still present in the train frame,
# together with their number of distinct values.
categorical_columns = df_train.select_dtypes(include=['object']).columns.tolist()

print(f"\n Categorical columns : {len(categorical_columns)}")
cardinalities = {name: df_train[name].nunique() for name in categorical_columns}
for col, nunique in cardinalities.items():
    print(f" •{col}: {nunique:,} unique values")


# Gender -> integer codes (label encoding).
# The encoder is fitted on train only and then reused on test, so both frames
# share one mapping; the mapping actually learned is printed below.
if 'gender' not in df_train.columns:
    print("\n 'gender' column not found")
else:
    le_gender = LabelEncoder()
    df_train['gender'] = le_gender.fit_transform(df_train['gender'])
    df_test['gender'] = le_gender.transform(df_test['gender'])

    print(f"\n Gender encoded:")
    for code in (0, 1):
        print(f"{le_gender.classes_[code]} -> {code}")

    print(f"\n After encoding:")
    print(df_train['gender'].value_counts())


# Target encoding for high-cardinality columns: each category is replaced by
# the fraud rate observed for it in df_train.
# NOTE(review): the rates are computed on the same df_train rows they are
# mapped onto, and before the train/validation split further down, so each
# row's own label contributes to its encoded value.
high_card_cols = ['category', 'merchant', 'job']

for col in high_card_cols:
    if col not in df_train.columns:
        continue

    encoded_name = f'{col}_encoded'
    global_mean = df_train['is_fraud'].mean()
    fraud_rates = df_train.groupby(col)['is_fraud'].mean()

    df_train[encoded_name] = df_train[col].map(fraud_rates)
    # Categories never seen in train have no rate; fall back to the global rate.
    df_test[encoded_name] = df_test[col].map(fraud_rates).fillna(global_mean)

    # Show top 5 encoded values
    print(f"   Top 5 {col} by fraud rate:")
    top_fraud_rates = fraud_rates.sort_values(ascending=False).head(5)
    for cat, rate in top_fraud_rates.items():
        print(f"      {cat}: {rate:.4f}")

    # The original string column is replaced by its encoded version.
    df_train, df_test = df_train.drop(columns=[col]), df_test.drop(columns=[col])


# State encoding: one-hot when there are at most 60 distinct states,
# otherwise target encoding.
if 'state' in df_train.columns:
    n_states = df_train['state'].nunique()

    if n_states > 60:
        # Too many states for dummies: encode each state by its train fraud rate.
        fraud_rates = df_train.groupby('state')['is_fraud'].mean()
        global_mean = df_train['is_fraud'].mean()

        df_train['state_encoded'] = df_train['state'].map(fraud_rates)
        df_test['state_encoded'] = df_test['state'].map(fraud_rates).fillna(global_mean)

        df_train = df_train.drop(columns=['state'])
        df_test = df_test.drop(columns=['state'])
    else:
        # One-hot encoding (every level kept, no reference level dropped)
        df_train = pd.get_dummies(df_train, columns=['state'], prefix='state', drop_first=False)
        df_test = pd.get_dummies(df_test, columns=['state'], prefix='state', drop_first=False)

        # Align test on train: columns missing from test are created and
        # filled with 0, columns absent from train are discarded, and the
        # column order follows train.
        # NOTE(review): this applies to every column, not only the state_*
        # dummies, so any other train-only column is also created in test as 0.
        df_test = df_test.reindex(columns=df_train.columns, fill_value=0)


# Catch-all: any object column still present in df_train is target-encoded
# the same way as the high-cardinality columns.
remaining_cats = df_train.select_dtypes(include=['object']).columns.tolist()

if remaining_cats:
    print(f"\n Found {len(remaining_cats)} remaining categorical columns:")
    for col in remaining_cats:
        print(f"   • {col}: {df_train[col].nunique():,} unique values")

    print("\n Applying Target Encoding to remaining columns...")
    for col in remaining_cats:
        encoded_name = f'{col}_encoded'
        global_mean = df_train['is_fraud'].mean()
        fraud_rates = df_train.groupby(col)['is_fraud'].mean()

        df_train[encoded_name] = df_train[col].map(fraud_rates)
        # Values unseen in train get the overall fraud rate.
        df_test[encoded_name] = df_test[col].map(fraud_rates).fillna(global_mean)

        df_train, df_test = df_train.drop(columns=[col]), df_test.drop(columns=[col])


**VERIFICATION**

In [None]:

# Check data types
print(f"\n Data types after encoding:")
print(df_train.dtypes.value_counts())

# Verify no object columns remain. Only object columns are tested here;
# datetime and boolean columns are handled in the CORRECTIONS cell.
object_cols = df_train.select_dtypes(include=['object']).columns.tolist()
if object_cols:
    print(f"\n WARNING: {len(object_cols)} object columns still present: {object_cols}")
else:
    # Was "\All ...": an invalid escape sequence that printed a literal
    # backslash instead of the intended blank line.
    print("\n All columns are numeric!")

print(f"\n Current shapes:")
print(f"   TRAIN: {df_train.shape}")
print(f"   TEST: {df_test.shape}")

# Show at most the first 20 column names to keep the output short.
print(f"\n Current columns ({len(df_train.columns)}):")
print(df_train.columns.tolist()[:20])  # Show first 20
if len(df_train.columns) > 20:
    print(f"   ... and {len(df_train.columns) - 20} more")


**CORRECTIONS**

In [None]:
# 1. Remove remaining datetime columns (same list applied to train and test)
datetime_cols = df_train.select_dtypes(include=['datetime64']).columns.tolist()
if not datetime_cols:
    print("\n No datetime columns to remove")
else:
    print(f"\n Removing {len(datetime_cols)} datetime columns:")
    for col in datetime_cols:
        print(f"    {col}")
    df_train, df_test = [frame.drop(columns=datetime_cols) for frame in (df_train, df_test)]
    print("    Datetime columns removed")

# 2. Convert bool columns to int (for state_* columns)
bool_cols = df_train.select_dtypes(include=['bool']).columns.tolist()
if not bool_cols:
    print("\n No boolean columns to convert")
else:
    print(f"\n Converting {len(bool_cols)} boolean columns to int:")
    for frame in (df_train, df_test):
        frame[bool_cols] = frame[bool_cols].astype(int)
    print(f"    {len(bool_cols)} columns converted (bool -> int)")

# 3. Verify all columns are numeric

print(f"\n Final data types:")
print(df_train.dtypes.value_counts())

# Check for non-numeric columns.
# 'number' covers every numeric dtype. The previous hard-coded list
# (int64 / int32 / float64) reported valid numeric columns such as uint8,
# int8 or float32 as non-numeric. Booleans are still flagged.
non_numeric = df_train.select_dtypes(exclude=['number']).columns.tolist()
if non_numeric:
    print(f"\n WARNING: {len(non_numeric)} non-numeric columns found:")
    for col in non_numeric:
        print(f"   • {col}: {df_train[col].dtype}")
else:
    print(f"\n SUCCESS: All columns are numeric!")

print(f"\n Final shapes:")
print(f"   TRAIN: {df_train.shape}")
print(f"   TEST: {df_test.shape}")

# Show column summary (all columns except the target count as features)
print(f"\n Column breakdown:")
print(f"   • Target variable: is_fraud")
print(f"   • Numeric features: {df_train.shape[1] - 1}")
print(f"   • Total columns: {df_train.shape[1]}")


**CHECK IF THERE IS DATA LEAKAGE**

In [None]:
# Look for merchant names containing the word "fraud", which would leak the label.
# The encoding cell replaces 'merchant' with 'merchant_encoded' and drops the
# original column, so at this point the check normally cannot run. Report
# that explicitly instead of printing nothing.
if 'merchant' in df_train.columns:
    fraud_merchants = df_train['merchant'].str.contains('fraud', case=False, na=False).sum()
    print(f"   Merchants with 'fraud' in name: {fraud_merchants}")
    if fraud_merchants > 0:
        print("    WARNING: Potential data leakage!")
else:
    print("   Leakage check skipped: 'merchant' column is no longer in df_train")

**FEATURE SCALING**

In [None]:
# Split each frame into the feature matrix (X) and the target vector (y).
X_train, y_train = df_train.drop(columns='is_fraud'), df_train['is_fraud']
X_test, y_test = df_test.drop(columns='is_fraud'), df_test['is_fraud']

print(f"\n Separated features and target:")
for part_name, part in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print(f"   {part_name}: {part.shape}")


# All feature columns
all_cols = X_train.columns.tolist()

# Binary indicator columns are left unscaled: a column qualifies when it has
# exactly two distinct values and all of its values are 0 or 1.
binary_cols = [
    col
    for col in all_cols
    if X_train[col].nunique() == 2
    and set(X_train[col].unique()).issubset({0, 1, 0.0, 1.0})
]

print(f"   Binary columns (no scaling needed): {len(binary_cols)}")
print(f"   Examples: {binary_cols[:5]}")

# Every other column goes through the scaler.
binary_lookup = set(binary_cols)
cols_to_scale = [col for col in all_cols if col not in binary_lookup]
print(f"\n   Columns to scale: {len(cols_to_scale)}")
print(f"   Examples: {cols_to_scale[:10]}")

# Preview a few well-known columns before scaling (only those that exist).
print(f"\n Before scaling (sample):")
sample_cols = ['amt', 'lat', 'age', 'distance_km']
existing_sample_cols = [col for col in sample_cols if col in X_train.columns]
print(X_train[existing_sample_cols].head(3))

# The scaler learns mean / std from TRAIN only; the same transform is then
# applied to train and test.
# NOTE(review): X_train is split into train/validation in a later cell, so the
# validation rows are part of the data this scaler is fitted on.
print("\n Fitting StandardScaler on TRAIN data...")
scaler = StandardScaler()
scaler.fit(X_train[cols_to_scale])

for features in (X_train, X_test):
    features[cols_to_scale] = scaler.transform(features[cols_to_scale])


# Same preview after scaling
print(f"\n After scaling (same sample):")
print(X_train[existing_sample_cols].head(3))

print("\n Verify scaling (should be ~0 mean, ~1 std):")
for col in existing_sample_cols:
    if col not in cols_to_scale:
        continue
    print(f"   {col}: mean={X_train[col].mean():.6f}, std={X_train[col].std():.6f}")

**TRAIN / VALIDATION SPLIT**

In [None]:
# Hold out 20% of the training data as a validation set. The split is
# stratified on the target so both parts keep the same fraud share.
split_options = dict(test_size=0.2, random_state=42, stratify=y_train)
X_train_final, X_val, y_train_final, y_val = train_test_split(X_train, y_train, **split_options)

print(f"\n Split completed:")
print(f"   Train (final): {X_train_final.shape} - for training models")
print(f"   Validation:    {X_val.shape} - for hyperparameter tuning")
print(f"   Test:          {X_test.shape} - for final evaluation (DON'T TOUCH!)")

# Fraud share (in %) of each split
print(f"\n Class distribution verification:")
train_fraud_rate, val_fraud_rate, test_fraud_rate = (
    target.mean() * 100 for target in (y_train_final, y_val, y_test)
)

print(f"   Train:      {y_train_final.sum():,} frauds / {len(y_train_final):,} = {train_fraud_rate:.3f}%")
print(f"   Validation: {y_val.sum():,} frauds / {len(y_val):,} = {val_fraud_rate:.3f}%")
print(f"   Test:       {y_test.sum():,} frauds / {len(y_test):,} = {test_fraud_rate:.3f}%")

# "Consistent" means validation and test are each within 0.01 percentage
# points of the train fraud rate.
rates_match = all(
    abs(train_fraud_rate - other_rate) < 0.01
    for other_rate in (val_fraud_rate, test_fraud_rate)
)
if rates_match:
    print("    Distributions are consistent!")
else:
    print("    Warning: Distributions differ slightly")

**HANDLING OF CLASS IMBALANCE (USING SMOTE)**

In [None]:
# Class counts of the final training split, used for the figures printed below.
# The ratio used to be derived from the fraud percentage (total:fraud rather
# than legitimate:fraud) and the baseline accuracy was a hard-coded 99.4%;
# both are now computed from the data.
n_legit = int((y_train_final == 0).sum())
n_fraud = int((y_train_final == 1).sum())
n_total = n_legit + n_fraud
imbalance_ratio = round(n_legit / n_fraud) if n_fraud else float('inf')
# Accuracy of a model that labels every transaction as legitimate.
baseline_accuracy = (n_legit / n_total * 100) if n_total else 0.0

print("\n The Problem:")
print(f"   Current ratio: ~{imbalance_ratio}:1 (legitimate:fraud)")
print("   Without balancing:")
print("   • Model will predict 'legitimate' for everything")
print(f"   • Accuracy = {baseline_accuracy:.1f}% but 0 frauds detected!")
print("   • Useless in production")

print("\n Solution: SMOTE (Synthetic Minority Over-sampling Technique)")
print("   • Creates synthetic fraud examples by interpolation")
print("   • Interpolates between existing fraud transactions")
print("   • sampling_strategy=0.3 → target ratio 3:1 (not 1:1, too artificial)")

print("\n CRITICAL RULES:")
print("   1. Apply SMOTE ONLY on train_final (NOT on validation/test!)")
print("   2. Never touch validation/test sets (data leakage!)")
print("   3. Validation/test must reflect real-world distribution")

def _report_class_counts(heading, labels):
    """Print the number of rows per class and the legitimate:fraud ratio."""
    legit_count = (labels == 0).sum()
    fraud_count = (labels == 1).sum()
    print(f"\n {heading}:")
    print(f"   Class 0 (legitimate): {legit_count:,}")
    print(f"   Class 1 (fraud):      {fraud_count:,}")
    print(f"   Ratio: {legit_count / fraud_count:.1f}:1")


# Class balance before resampling
_report_class_counts("BEFORE SMOTE", y_train_final)

# Oversample the fraud class on the training split only.
print("\n Applying SMOTE...")
smote = SMOTE(
    # Requested fraud-to-legitimate ratio after resampling: 0.3 means roughly
    # one fraud row for every 3.3 legitimate rows (about 23% of all rows).
    sampling_strategy=0.3,
    random_state=42,
    k_neighbors=5
)

X_train_resampled, y_train_resampled = smote.fit_resample(X_train_final, y_train_final)


# Class balance after resampling
_report_class_counts("AFTER SMOTE", y_train_resampled)

rows_before = X_train_final.shape[0]
rows_after = X_train_resampled.shape[0]
print(f"\n Size changes:")
print(f"   Before: {rows_before:,} samples")
print(f"   After:  {rows_after:,} samples")
print(f"   Added:  {rows_after - rows_before:,} synthetic frauds")

# Make sure the outputs are pandas objects with the original column names.
# NOTE(review): depending on the imbalanced-learn version, fit_resample may
# return either numpy arrays or pandas objects — confirm.
X_train_resampled = pd.DataFrame(X_train_resampled, columns=X_train_final.columns)
y_train_resampled = pd.Series(y_train_resampled, name='is_fraud')

