In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Location of the processed fraud dataset, relative to this notebook.
PROCESSED_CSV = '../data/processed/fraud_data_processed.csv'

# Read the processed transactions into the frame the following cells work on.
df = pd.read_csv(PROCESSED_CSV)

In [4]:
# Build the model-ready train/test sets from `df`:
#   features/target -> one-hot encoding -> NaN guard -> stratified split
#   -> scaling (fit on train only) -> SMOTE (train only).

# 1. Define Features and Target
# 'class' is the target. Identifier columns, the raw timestamps, the country
# label and the two IP-range bound columns are kept out of the feature matrix.
# NOTE(review): the IP bound columns look like leftovers of an IP-to-country
# lookup -- confirm against the preprocessing step that produced this CSV.
X = df.drop(columns=[
    'class', 'user_id', 'device_id', 'signup_time',
    'purchase_time', 'country', 'lower_bound_ip_address', 'upper_bound_ip_address'
])
y = df['class']

# 2. One-Hot Encoding (for categorical variables)
# drop_first=True drops one dummy level per encoded variable.
X = pd.get_dummies(X, columns=['source', 'browser', 'sex'], drop_first=True)

# 3. Final Safety Net: drop any rows that still contain NaNs (none expected)
# A single row mask is computed once and reused for both the check and the drop.
rows_with_nan = X.isna().any(axis=1)
if rows_with_nan.any():
    print("Found NaNs! Dropping rows...")
    X = X.loc[~rows_with_nan]
    # Realign the target by index *label*; .loc makes that explicit instead of
    # relying on how Series.__getitem__ interprets an integer key.
    y = y.loc[X.index]

# Features and target must describe the same rows, in the same order,
# before they are split together.
assert X.index.equals(y.index), "X and y are misaligned after NaN handling"

# 4. Split Data
# stratify=y keeps the class proportions the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Scaling
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6. Apply SMOTE to Training set
# Only the training split is oversampled; the test split keeps its original
# class distribution.
# NOTE(review): SMOTE interpolates between neighbouring samples, so synthetic
# rows can carry in-between values in the one-hot columns. SMOTENC may be a
# better fit for this feature mix -- evaluate before changing.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print(f"Final training set size: {X_train_resampled.shape}")
print("Class distribution after SMOTE:", pd.Series(y_train_resampled).value_counts())

Final training set size: (219136, 15)
Class distribution after SMOTE: class
0    109568
1    109568
Name: count, dtype: int64


In [5]:
# --- Final Task 1 Documentation: Class Imbalance Analysis ---
# Gather the figures first, then print the before/after report.
counts_before = y_train.value_counts()
fraud_pct = y_train.mean() * 100  # share of rows labelled 1, as a percentage
counts_after = pd.Series(y_train_resampled).value_counts()

print("--- Class Distribution Before Resampling ---")
print(counts_before)
print(f"Fraud Percentage: {fraud_pct:.2f}%")

print("\n--- Class Distribution After SMOTE ---")
print(counts_after)



--- Class Distribution Before Resampling ---
class
0    109568
1     11321
Name: count, dtype: int64
Fraud Percentage: 9.36%

--- Class Distribution After SMOTE ---
class
0    109568
1    109568
Name: count, dtype: int64


# Justification for SMOTE
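I applied SMOTE to the training data only, after the train/test split, to address the
class imbalance: fraud makes up only about 9.4% of the training transactions. The test
set keeps its original class distribution so that evaluation stays realistic.
SMOTE is preferred over simple undersampling here because undersampling would discard
valuable legitimate-transaction patterns, whereas SMOTE synthesizes new fraud examples
to help the model learn a clearer decision boundary.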