In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

Load data

In [2]:
# Read the raw transaction table; the path is relative to the notebook's folder.
full_df = pd.read_csv(filepath_or_buffer="../data/Fraud.csv")

Drop columns that are identifiers or not useful for modeling

In [3]:
# Remove the identifier / non-feature columns named in the section heading.
# errors='ignore' keeps this cell idempotent: because it rebinds `full_df`,
# re-running it after the columns are already gone would otherwise raise KeyError.
full_df = full_df.drop(
    columns=['nameOrig', 'nameDest', 'isFlaggedFraud'],
    errors='ignore',
)

Identify features and target

In [4]:
# Split the table into the label series and the predictor frame.
y = full_df['isFraud']
X = full_df.drop(columns='isFraud')

Feature Engineering

In [5]:
# Balance-consistency features. Each one is zero when the recorded balance
# change matches the transaction amount exactly:
#   origin:      old balance - new balance - amount
#   destination: new balance - old balance - amount
X['errorBalanceOrig'] = X['oldbalanceOrg'].sub(X['newbalanceOrig']).sub(X['amount'])
X['errorBalanceDest'] = X['newbalanceDest'].sub(X['oldbalanceDest']).sub(X['amount'])

Categorical and numerical columns

In [6]:
# Columns routed to the one-hot encoder.
cat_cols = ['type']

# Every remaining column is treated as numeric. Dropping the labels from the
# column Index (rather than from the DataFrame) yields the same list in the
# same order, still raises KeyError if a categorical column is missing, and
# avoids building a second copy of the whole frame just to read its header.
# NOTE(review): nothing here checks dtypes; this assumes all non-categorical
# columns are numeric, which the mean imputer applied to them requires.
num_cols = X.columns.drop(cat_cols).tolist()

Preprocessing pipeline

In [7]:
# Numeric branch: replace missing values with the column mean.
num_pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))])

# Categorical branch: one-hot encode. handle_unknown='ignore' means a category
# not seen during fit does not raise at transform time.
cat_pipeline = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group through its branch; the numeric block comes first
# in the combined output, followed by the one-hot columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
    ]
)

Fit and transform

In [8]:
# Learn the imputation and encoding parameters from X and apply them in one pass.
# NOTE(review): the fit uses every row of X; no train/test split is visible in
# this notebook, so the learned statistics come from the full dataset. Confirm
# that is intended before evaluating a model on data drawn from the same file.
X_processed = preprocessor.fit_transform(X)

Save transformed data and preprocessor

In [9]:
# Save the fitted preprocessor so the same transformation can be reapplied at
# inference time. joblib.dump does not create missing parent directories, so
# make sure the output folder exists first (a fresh checkout may lack ../models).
Path('../models').mkdir(parents=True, exist_ok=True)
joblib.dump(preprocessor, '../models/preprocessor.pkl')

# Save transformed features and target for training.
# The transformer output may be a sparse matrix; densify it when it is.
# The frame is built from a bare array, so the CSV header is positional
# (0, 1, 2, ...) rather than the original feature names.
features_dense = X_processed.toarray() if hasattr(X_processed, 'toarray') else X_processed
pd.DataFrame(features_dense).to_csv('../data/processed_features.csv', index=False)
y.to_csv('../data/processed_target.csv', index=False)