# 3. Feature Engineering

This notebook focuses on creating meaningful features for our fraud detection models. We'll process both the fraud and credit card datasets using the feature logic implemented in `src/features/`, together with the loading and preprocessing helpers from `src/data/`.

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from src.data.loading import load_fraud_data, load_ip_country_data, load_creditcard_data, save_processed_data
from src.features.engineering import create_all_features, create_time_features, create_time_since_signup
from src.features.geolocation import merge_ip_country, create_country_features
from src.data.preprocessing import create_preprocessing_pipeline

## 1. Fraud Dataset Feature Engineering

In [None]:
# Read the two raw inputs through the project loaders.
fraud_df = load_fraud_data()
ip_df = load_ip_country_data()

# Steps 1-2: merge the fraud records with the IP/country data, then create the
# categorical country-risk features on the merged result.
merged_df = create_country_features(
    merge_ip_country(fraud_df, ip_df)
)

# Step 3: time-based and velocity features.
# create_all_features() is expected to handle time formats and numeric velocity
# itself (it lives in src.features.engineering and is not shown here).
fraud_featured = create_all_features(merged_df)

print(f"Final Fraud Features Shape: {fraud_featured.shape}")
fraud_featured.head()

## 2. Credit Card Dataset Feature Engineering

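The credit card dataset is mostly pre-processed (PCA), so the candidates for feature engineering are the 'Time' and 'Amount' features. For now both are kept as they are, and the dataset is carried forward as an unchanged copy.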

In [None]:
cc_df = load_creditcard_data()

# 'Time' holds the seconds elapsed since the first transaction in the dataset.
# Converting it to hours/days would need a reference date, so it stays numeric
# for now and the featured frame is simply a deep copy of the loaded data.
cc_featured = cc_df.copy(deep=True)

print(f"Credit Card Features Shape: {cc_featured.shape}")
cc_featured.head()

## 3. Preprocessing Pipelines

Here we define the preprocessing pipeline (scaling and encoding) for the fraud dataset. The cell below only constructs the pipeline; it is not yet fitted to any data.

In [None]:
# Example column selection for the fraud data.
numeric_cols = [
    'purchase_value',
    'age',
    'time_since_signup',
    'user_txn_count',
    'user_avg_amount',
]
categorical_cols = [
    'source',
    'browser',
    'sex',
    'country_risk_level',
]

# Only builds the pipeline object; nothing is fitted or transformed in this cell.
pipeline = create_preprocessing_pipeline(
    numeric_cols,
    categorical_cols,
    scale_strategy='standard',
    encoding='onehot',
)
print("Preprocessing pipeline created successfully.")

## 4. Save Processed Data

In [None]:
# Persist both featured datasets, in the same order as before, so the training
# step can load them.
for frame, filename in (
    (fraud_featured, "fraud_featured.csv"),
    (cc_featured, "creditcard_featured.csv"),
):
    save_processed_data(frame, filename)

print("Datasets saved successfully to data/processed/")