# Task 1.3: Feature Engineering for Fraud Detection

## Objective
Create meaningful features that help identify fraud patterns:
1. **Time-based features**: hour_of_day, day_of_week, time_since_signup
2. **Velocity features**: transaction frequency per user in time windows
3. **Data transformations**: scaling and encoding for modeling
4. **Class-imbalance handling**: SMOTE/undersampling applied to the training data only

## Why These Features Matter
- **Time features**: Fraudsters often operate at unusual hours or create accounts shortly before committing fraud
- **Velocity features**: Rapid-fire transactions from one user signal automated fraud
- **Proper encoding/scaling**: Required for many ML algorithms to work correctly

In [None]:
# Standard imports
import sys
from pathlib import Path

# Make the repository root importable so the `src.*` project imports resolve.
# Assumes the notebook's working directory sits one level below the root
# (e.g. a notebooks/ folder) — TODO confirm.
project_root = Path.cwd().parent
_root_entry = str(project_root)
if _root_entry not in sys.path:
    # Inserted at the front so this entry is searched before the others.
    sys.path.insert(0, _root_entry)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Project imports
from src.features.time_features import add_time_features, add_time_period_features
from src.features.velocity import add_velocity_features, add_user_transaction_count

# Sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE

# Display settings
# Show every column when a DataFrame is rendered instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# NOTE(review): the 'seaborn-v0_8-*' style names are only shipped by newer
# Matplotlib releases — confirm the pinned version provides this style.
plt.style.use('seaborn-v0_8-whitegrid')
# IPython magic: render figures inline in the notebook output.
%matplotlib inline

## 1. Load Data with Country

In [None]:
# Input artifact produced by the previous notebook (fraud data with a country column).
DATA_PATH = project_root / "data" / "processed" / "fraud_with_country.parquet"

# Fail fast with an actionable message when the upstream artifact is missing.
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Please run notebook 02 first to create: {DATA_PATH}")

df = pd.read_parquet(DATA_PATH)
print(f"Loaded data with country: {df.shape}")

# Preview the first rows (rendered as the cell output).
df.head()

In [None]:
# Check current columns and types
# df.info() writes its summary to stdout itself, so it is called bare
# rather than wrapped in print().
print("Current columns:")
df.info()

## 2. Add Time-Based Features

In [None]:
# Add time features
# add_time_features is a project helper (src.features.time_features); its
# return value replaces df.
df = add_time_features(df)

print("New time features added:")
# Columns this notebook expects the helper to have created — presumably
# exactly these names; verify against the helper's implementation.
time_cols = ['hour_of_day', 'day_of_week', 'is_weekend', 'time_since_signup']
# Preview the first 10 rows of those columns (rendered as the cell output).
df[time_cols].head(10)

In [None]:
# Add time period feature (morning/afternoon/evening/night)
# add_time_period_features is a project helper; presumably it adds the
# 'purchase_time_period' column read below — verify against the helper.
df = add_time_period_features(df)

# Row count per time period, as a quick sanity check of the bucketing.
print("Time period distribution:")
print(df['purchase_time_period'].value_counts())

In [None]:
# Summarise the raw time_since_signup column (labelled as seconds in the output).
print("\ntime_since_signup statistics (in seconds):")
print(df['time_since_signup'].describe())

# Derive easier-to-read versions of the same quantity for inspection and plotting.
SECONDS_PER_HOUR = 3600
SECONDS_PER_DAY = 86400
signup_gap = df['time_since_signup']
df['time_since_signup_hours'] = signup_gap / SECONDS_PER_HOUR
df['time_since_signup_days'] = signup_gap / SECONDS_PER_DAY

print("\ntime_since_signup (in hours):")
print(df['time_since_signup_hours'].describe())

In [None]:
# Visualize time features vs fraud
# 2x2 grid: fraud rate by hour, by weekday and by time period, plus the
# distribution of time-since-signup for each class.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Overall fraud rate (%), drawn as the dashed reference line on each rate panel.
overall_fraud_pct = df['class'].mean() * 100

# Hour of day fraud rate
fraud_by_hour = df.groupby('hour_of_day')['class'].mean() * 100
axes[0, 0].bar(fraud_by_hour.index, fraud_by_hour.values, color='#e74c3c')
axes[0, 0].axhline(y=overall_fraud_pct, color='black', linestyle='--', label='Overall')
axes[0, 0].set_xlabel('Hour of Day')
axes[0, 0].set_ylabel('Fraud Rate (%)')
axes[0, 0].set_title('Fraud Rate by Hour of Day')
axes[0, 0].legend()

# Day of week fraud rate
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
# Align the grouped rates to all seven weekday codes so the bar heights always
# match the seven labels, even when a weekday is absent from the data (a
# missing day becomes NaN and is simply not drawn).
# Assumes day_of_week is coded 0=Monday..6=Sunday — TODO confirm against
# add_time_features.
fraud_by_dow = (df.groupby('day_of_week')['class'].mean() * 100).reindex(range(len(days)))
axes[0, 1].bar(days, fraud_by_dow.values, color='#9b59b6')
axes[0, 1].axhline(y=overall_fraud_pct, color='black', linestyle='--', label='Overall')
axes[0, 1].set_xlabel('Day of Week')
axes[0, 1].set_ylabel('Fraud Rate (%)')
axes[0, 1].set_title('Fraud Rate by Day of Week')
axes[0, 1].legend()

# Time since signup distribution by class
# Plot a reproducible sample of at most 10,000 rows.
df_sample = df.sample(min(10000, len(df)), random_state=42)
signup_hours_by_class = [
    df_sample.loc[df_sample['class'] == class_label, 'time_since_signup_hours'].dropna()
    for class_label in (0, 1)
]
axes[1, 0].boxplot(signup_hours_by_class)
# Tick labels are set on the axis instead of through boxplot's `labels=`
# keyword, which newer Matplotlib releases deprecate in favour of `tick_labels=`.
axes[1, 0].set_xticklabels(['Non-Fraud', 'Fraud'])
axes[1, 0].set_ylabel('Time Since Signup (hours)')
axes[1, 0].set_title('Time Since Signup by Class')

# Time period fraud rate
fraud_by_period = df.groupby('purchase_time_period')['class'].mean() * 100
# Fixed chronological order for the bars; periods missing from the data become NaN.
period_order = ['morning', 'afternoon', 'evening', 'night']
fraud_by_period = fraud_by_period.reindex(period_order)
axes[1, 1].bar(fraud_by_period.index, fraud_by_period.values, color='#3498db')
axes[1, 1].axhline(y=overall_fraud_pct, color='black', linestyle='--', label='Overall')
axes[1, 1].set_xlabel('Time Period')
axes[1, 1].set_ylabel('Fraud Rate (%)')
axes[1, 1].set_title('Fraud Rate by Time Period')
axes[1, 1].legend()

plt.tight_layout()
plt.show()

### Interpretation: Time Features

*TODO: After running, describe:*
- Are there hours with higher fraud rates?
- Does day of week affect fraud rate?
- Is time_since_signup different for fraud vs non-fraud? (This is often a strong signal!)

## 3. Add Velocity Features

In [None]:
# Add user transaction velocity (1 hour and 24 hour windows)
print("Computing velocity features (this may take a moment)...")
# add_velocity_features is a project helper (src.features.velocity); its
# return value replaces df.
df = add_velocity_features(df, user_col='user_id', time_col='purchase_time', windows_hours=[1, 24])

print("\nVelocity features added:")
# Column names this notebook expects for the two windows — presumably the
# helper names them tx_count_<user_col>_<N>h; verify against the helper.
velocity_cols = ['tx_count_user_id_1h', 'tx_count_user_id_24h']
# Summary statistics of the new columns (rendered as the cell output).
df[velocity_cols].describe()

In [None]:
# Add total user transaction count
# add_user_transaction_count is a project helper; presumably it adds the
# 'user_total_transactions' column read below — verify against the helper.
# NOTE(review): if the count covers every row for a user, it is computed
# before the train/test split and can carry information from test rows —
# confirm this is acceptable.
df = add_user_transaction_count(df, user_col='user_id')

print("User total transactions:")
print(df['user_total_transactions'].describe())

In [None]:
# Visualize velocity features vs fraud
# One boxplot panel per velocity feature, each comparing non-fraud and fraud rows.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Plot a reproducible sample of at most 10,000 rows.
df_sample = df.sample(min(10000, len(df)), random_state=42)

# (column, y-axis label, title) for each panel, left to right.
velocity_panels = [
    ('tx_count_user_id_1h', 'Transactions in 1 Hour', 'User Transaction Velocity (1h) by Class'),
    ('tx_count_user_id_24h', 'Transactions in 24 Hours', 'User Transaction Velocity (24h) by Class'),
    ('user_total_transactions', 'Total User Transactions', 'Total User Transactions by Class'),
]

# Row masks are computed once and reused by every panel.
is_non_fraud = df_sample['class'] == 0
is_fraud = df_sample['class'] == 1

for ax, (column, ylabel, title) in zip(axes, velocity_panels):
    ax.boxplot([df_sample.loc[is_non_fraud, column], df_sample.loc[is_fraud, column]])
    # Tick labels are set on the axis instead of through boxplot's `labels=`
    # keyword, which newer Matplotlib releases deprecate in favour of `tick_labels=`.
    ax.set_xticklabels(['Non-Fraud', 'Fraud'])
    ax.set_ylabel(ylabel)
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Statistical comparison
# Mean of each velocity feature per class (one row for each value of 'class').
print("Velocity Features by Class:")
print(df.groupby('class')[velocity_cols + ['user_total_transactions']].mean())

### Interpretation: Velocity Features

*TODO: After running, describe:*
- Do fraudulent users have higher transaction velocity?
- Is there a difference in total transactions per user?
- What velocity thresholds might indicate fraud?

## 4. Prepare Feature Matrix

In [None]:
# Review all columns
# Print the column names as a plain list so the feature groups defined next
# can be checked against what is actually in df.
print("All columns after feature engineering:")
print(df.columns.tolist())

In [None]:
# Feature groups used to build the model matrix.

# Numeric inputs; these are the columns the StandardScaler step is applied to.
NUMERIC_FEATURES = [
    # raw transaction / user attributes
    'purchase_value', 'age',
    # time-derived features
    'hour_of_day', 'day_of_week', 'is_weekend', 'time_since_signup',
    # velocity features
    'tx_count_user_id_1h', 'tx_count_user_id_24h', 'user_total_transactions',
]

# Categorical inputs; these are the columns the OneHotEncoder step is applied to.
CATEGORICAL_FEATURES = ['source', 'browser', 'sex', 'country']

# Label column.
TARGET = 'class'

# Columns to exclude from features (identifiers, raw timestamps)
EXCLUDE_COLS = [
    # identifiers
    'user_id', 'device_id', 'ip_address',
    # raw timestamps
    'signup_time', 'purchase_time',
    # unit-converted copies of time_since_signup
    'time_since_signup_hours', 'time_since_signup_days',
    # hour_of_day is used instead
    'purchase_time_period',
    # the target itself
    'class',
]

print(f"Numeric features ({len(NUMERIC_FEATURES)}): {NUMERIC_FEATURES}")
print(f"Categorical features ({len(CATEGORICAL_FEATURES)}): {CATEGORICAL_FEATURES}")

In [None]:
# Count nulls per selected feature and report only the columns that have any.
feature_cols = NUMERIC_FEATURES + CATEGORICAL_FEATURES
missing = df[feature_cols].isnull().sum()
incomplete = missing[missing > 0]
if incomplete.empty:
    print("No missing values in selected features.")
else:
    print("Missing values in features:")
    print(incomplete)

In [None]:
# Handle any remaining missing values
# NOTE(review): the medians below are computed on the full DataFrame, i.e.
# before the train/test split performed later in this notebook, so test rows
# influence the fill value — consider imputing inside the preprocessing
# pipeline (fit on training data only) instead.

# Fill numeric with median
for col in NUMERIC_FEATURES:
    if df[col].isnull().any():
        median_val = df[col].median()
        df[col] = df[col].fillna(median_val)
        print(f"Filled {col} missing with median: {median_val}")

# Fill categorical with 'Unknown'
for col in CATEGORICAL_FEATURES:
    if df[col].isnull().any():
        # A pandas Categorical column rejects fill values that are not already
        # one of its categories, so register 'Unknown' first when needed.
        if isinstance(df[col].dtype, pd.CategoricalDtype) and 'Unknown' not in df[col].cat.categories:
            df[col] = df[col].cat.add_categories('Unknown')
        df[col] = df[col].fillna('Unknown')
        print(f"Filled {col} missing with 'Unknown'")

## 5. Train-Test Split (Stratified)

In [None]:
# Build the model inputs: X holds the selected feature columns, y the label.
# Both are copies, so later edits to them do not touch df.
feature_columns = [*NUMERIC_FEATURES, *CATEGORICAL_FEATURES]
X = df[feature_columns].copy()
y = df[TARGET].copy()

print(f"Feature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Class distribution: {y.value_counts().to_dict()}")

In [None]:
# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# the same in both parts, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Per-class counts for each part, to confirm the stratification.
for heading, labels in (("Training", y_train), ("Test", y_test)):
    print(f"\n{heading} class distribution:")
    print(labels.value_counts())

## 6. Data Transformation (Scaling & Encoding)

In [None]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones.
numeric_scaler = StandardScaler()
# handle_unknown='ignore': do not raise on categories that were not seen during fit.
# sparse_output=False: return a dense array.
categorical_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_scaler, NUMERIC_FEATURES),
        ('cat', categorical_encoder, CATEGORICAL_FEATURES),
    ]
)

# Fit on the training data only; the test data is transformed with the
# already-fitted preprocessor.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

print(f"Transformed training shape: {X_train_transformed.shape}")
print(f"Transformed test shape: {X_test_transformed.shape}")

In [None]:
# Recover the column names of the transformed matrix: the numeric names first,
# then the names generated by the fitted one-hot encoder.
fitted_encoder = preprocessor.named_transformers_['cat']
cat_feature_names = fitted_encoder.get_feature_names_out(CATEGORICAL_FEATURES)
all_feature_names = [*NUMERIC_FEATURES, *cat_feature_names]

print(f"Total features after encoding: {len(all_feature_names)}")
print(f"\nFirst 20 feature names:")
print(all_feature_names[:20])

## 7. Handle Class Imbalance with SMOTE

In [None]:
# Training-set class balance before any resampling.
n_non_fraud = (y_train == 0).sum()
n_fraud = (y_train == 1).sum()

print("Class distribution BEFORE SMOTE:")
print(f"  Non-Fraud (0): {n_non_fraud:,}")
print(f"  Fraud (1): {n_fraud:,}")
print(f"  Ratio: 1:{n_non_fraud / n_fraud:.1f}")

In [None]:
# Oversample with SMOTE on the training split only; the test split is left untouched.
# NOTE(review): the input here already contains one-hot columns; synthetic rows
# built by interpolation may get fractional values in those columns — confirm
# this is acceptable for the downstream models.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_transformed, y_train)

# Class balance after resampling.
n_non_fraud_after = (y_train_resampled == 0).sum()
n_fraud_after = (y_train_resampled == 1).sum()

print("\nClass distribution AFTER SMOTE:")
print(f"  Non-Fraud (0): {n_non_fraud_after:,}")
print(f"  Fraud (1): {n_fraud_after:,}")
print(f"  Ratio: 1:{n_non_fraud_after / n_fraud_after:.1f}")

In [None]:
# Visualize class distribution before and after
# Two bar charts side by side: training-set class counts before and after SMOTE.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Vectorised .sum() on the boolean mask instead of the builtin sum(), which
# iterates element by element in Python; int() gives plain ints for plotting
# and formatting.
before_counts = [int((y_train == 0).sum()), int((y_train == 1).sum())]
after_counts = [int((y_train_resampled == 0).sum()), int((y_train_resampled == 1).sum())]

panels = [('Before SMOTE', before_counts), ('After SMOTE', after_counts)]
for ax, (title, counts) in zip(axes, panels):
    ax.bar(['Non-Fraud', 'Fraud'], counts, color=['#2ecc71', '#e74c3c'])
    ax.set_ylabel('Count')
    ax.set_title(title)
    # Write each count just above its bar.
    for i, v in enumerate(counts):
        ax.text(i, v + 100, f'{v:,}', ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

### Interpretation: Class Imbalance Handling

*TODO: After running, describe:*
- What was the original imbalance ratio?
- How many synthetic samples were created by SMOTE?
- Why is it important to apply SMOTE only on training data?

## 8. Save Processed Data

In [None]:
# Save the full feature-engineered dataset (before train/test split)
# This writes df as it stands at this point, so it includes the helper
# unit columns and any missing-value fills applied earlier in the notebook.
output_path = project_root / "data" / "processed" / "fraud_featured.parquet"
# index=False: do not write the DataFrame index to the file.
df.to_parquet(output_path, index=False)
print(f"Feature-engineered data saved to: {output_path}")

In [None]:
# Persist everything the modeling step needs: the fitted preprocessor, the
# resampled training arrays, the transformed test arrays and the feature names.
import joblib

# Output directory (created on first run; an existing directory is fine).
models_dir = project_root / "models"
models_dir.mkdir(exist_ok=True)

# Fitted preprocessor.
joblib.dump(preprocessor, models_dir / "preprocessor.joblib")

# Data splits, written as .npy files: (destination, array) pairs.
array_outputs = [
    (models_dir / "X_train_resampled.npy", X_train_resampled),
    (models_dir / "y_train_resampled.npy", y_train_resampled),
    (models_dir / "X_test.npy", X_test_transformed),
    (models_dir / "y_test.npy", y_test.values),
]
for destination, array in array_outputs:
    np.save(destination, array)

# Column names matching the transformed feature matrices.
joblib.dump(all_feature_names, models_dir / "feature_names.joblib")

print(f"Saved preprocessor and data splits to: {models_dir}")

## 9. Summary: Task 1 Complete

*TODO: Fill in after completing the analysis*

### Features Created
1. **Time features**: hour_of_day, day_of_week, is_weekend, time_since_signup
2. **Velocity features**: tx_count_user_id_1h, tx_count_user_id_24h, user_total_transactions
3. **Geographic feature**: country (from IP mapping)

### Data Transformations
- Numeric features scaled with StandardScaler
- Categorical features encoded with OneHotEncoder
- Total features after encoding: [number]

### Class Imbalance
- Original ratio: [ratio]
- Applied SMOTE to training data only
- After SMOTE ratio: 1:1

### Key Insights
1. [Insight about time features]
2. [Insight about velocity features]
3. [Insight about geographic patterns]

### Files Saved
- `data/processed/fraud_featured.parquet`: Full feature-engineered dataset
- `models/preprocessor.joblib`: Fitted preprocessor for inference
- `models/X_train_resampled.npy`, `models/y_train_resampled.npy`: Resampled training data
- `models/X_test.npy`, `models/y_test.npy`: Test data
- `models/feature_names.joblib`: Feature names after encoding

### Next Steps
- Proceed to Task 2: Model Building and Training
- Train baseline (Logistic Regression) and ensemble models
- Evaluate using AUC-PR, F1-Score, Confusion Matrix