In [7]:
import os
import sys

import joblib
import pandas as pd
from sklearn.model_selection import train_test_split

sys.path.append("../")  # go one directory up (from notebooks/ to project root)
from src.preprocessing import feature_engineer, load_and_split_data, handle_imbalance_smote, scale_data

Load the dataset

In [8]:
# Location of the raw CSV, given relative to the directory the notebook runs from.
data_path = '../data/creditcard.csv'

# Read the full file into a DataFrame; every later cell derives its data from `df`.
df = pd.read_csv(filepath_or_buffer=data_path)

Feature Engineering

In [9]:
# Hand the project's feature_engineer helper a copy of the raw frame, so `df`
# itself is never the object the helper receives.
# NOTE(review): which columns the helper adds or changes is defined in
# src/preprocessing, not visible here — confirm that 'Class' is preserved.
df_engineered = feature_engineer(df.copy())

Splitting the data into training and test sets

In [10]:
# Separate the target column from the predictors.
y = df_engineered['Class']
X = df_engineered.drop(columns='Class')

# Hold out 20% of the rows for testing. Stratifying on `y` keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

Scaling the data

In [11]:
# scale_data (from src.preprocessing) receives the train and test features and
# returns three values: the scaled train set, the scaled test set, and the
# scaler object itself, which is kept so it can be persisted further down.
# NOTE(review): presumably the scaler is fit on X_train only and then applied
# to X_test (to avoid leakage) — confirm in src/preprocessing.
X_train_scaled, X_test_scaled, scaler = scale_data(X_train, X_test)

Handling class imbalance using SMOTE (applied to the training set only)

In [12]:
# Only the scaled training split and its labels are passed to the resampling
# helper; the test split is never resampled.
# NOTE(review): the helper's name indicates SMOTE oversampling, but its
# parameters (e.g. random seed, sampling ratio) live in src/preprocessing —
# confirm it is seeded so this cell is reproducible.
X_train_resampled, y_train_resampled = handle_imbalance_smote(X_train_scaled, y_train)

Saving the processed data and scaler

In [15]:
# Persist the resampled training set, the scaled test set with its labels, and
# the scaler object so later steps can load them without re-running this notebook.

# joblib.dump does not create missing parent directories, so make sure both
# output folders exist first (a no-op when they are already present). Without
# this the cell fails with FileNotFoundError on a fresh checkout.
# NOTE(review): the raw CSV is read from '../data/' while artifacts are written
# to '../Data/'. On a case-sensitive filesystem these are two different
# directories — confirm which spelling downstream code expects.
os.makedirs('../Data', exist_ok=True)
os.makedirs('../Models', exist_ok=True)

joblib.dump(X_train_resampled, '../Data/X_train_resampled.joblib')
joblib.dump(y_train_resampled, '../Data/y_train_resampled.joblib')
joblib.dump(X_test_scaled, '../Data/X_test_scaled.joblib')
joblib.dump(y_test, '../Data/y_test.joblib')
joblib.dump(scaler, '../Models/scaler.joblib')

print("Processed training data saved to ../Data/")
print("Processed testing data saved to ../Data/")
print("Scaler saved to ../Models/")

Processed training data saved to ../Data/
Processed testing data saved to ../Data/
Scaler saved to ../Models/
