# Feature Engineering for Fraud Detection

This notebook focuses on:
- Geolocation integration (IP to Country mapping)
- Time-based feature engineering
- Transaction velocity features
- Device features
- Data transformation (scaling, encoding)
- Class imbalance handling (SMOTE)

**Author**: Adey Innovations Inc. Data Science Team  
**Date**: December 2025


## 1. Setup and Data Loading


In [None]:
# Import required libraries
# Third-party stack: pandas/numpy for data handling, matplotlib/seaborn for
# plotting, scikit-learn for preprocessing, imbalanced-learn for SMOTE.
# NOTE(review): sns, LabelEncoder, clean_fraud_data, clean_creditcard_data,
# get_class_distribution and scale_numerical_features are not referenced in
# the cells visible in this review - confirm whether they are still needed.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
import warnings
# Silences every warning category for the rest of the kernel session,
# including deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

# Add parent directory to path
# '..' is resolved against the kernel's working directory, so the `src`
# imports below resolve when the kernel runs one level beneath the directory
# that contains the `src` package. This must stay ahead of those imports.
import sys
sys.path.append('..')
from src.data_loader import (
    load_fraud_data, load_ip_to_country, load_creditcard_data,
    map_ip_to_country, clean_fraud_data, clean_creditcard_data,
    get_class_distribution
)
from src.feature_engineering import (
    create_time_features, create_transaction_velocity_features,
    create_device_features, encode_categorical_features,
    scale_numerical_features, prepare_features_for_modeling
)

# Simple confirmation that every import above succeeded.
print("Libraries imported successfully!")


In [None]:
# Read the three raw CSV inputs through the project loaders.
fraud_df = load_fraud_data('../data/raw/Fraud_Data.csv')
ip_country_df = load_ip_to_country('../data/raw/IpAddress_to_Country.csv')
cc_df = load_creditcard_data('../data/raw/creditcard.csv')

# Report (rows, columns) for each frame so a bad or partial load is obvious.
print("Datasets loaded:")
for dataset_label, dataset_frame in (
    ("Fraud_Data", fraud_df),
    ("IP to Country", ip_country_df),
    ("Credit Card", cc_df),
):
    print(f"  - {dataset_label}: {dataset_frame.shape}")


## 2. Geolocation Integration (IP to Country)


In [None]:
# Inspect the raw IP values and the lookup table they will be matched against.
ip_values = fraud_df['ip_address']

print("IP Address Sample (Fraud Data):")
print(ip_values.head())
ip_low = ip_values.min()
ip_high = ip_values.max()
print(f"\nIP range: {ip_low} - {ip_high}")

print("\nIP to Country Sample:")
print(ip_country_df.head())


In [None]:
# Attach a `country` column to the fraud data using the IP lookup table.
print("Mapping IP addresses to countries...")
fraud_df_geo = map_ip_to_country(fraud_df, ip_country_df)

# Summarise the mapping: how many distinct countries, and which dominate.
mapped_countries = fraud_df_geo['country']
print("\nMapping complete!")
print(f"Countries found: {mapped_countries.nunique()}")
print("\nTop 10 countries by transaction count:")
print(mapped_countries.value_counts().head(10))


In [None]:
# Per-country totals, fraud counts and fraud rate, highest rate first.
fraud_by_country = (
    fraud_df_geo
    .groupby('country')['class']
    .agg(Total='count', Fraud_Count='sum', Fraud_Rate='mean')
    .round(4)
    .sort_values('Fraud_Rate', ascending=False)
)

# Keep only countries with at least 100 transactions so that tiny samples
# do not dominate the ranking.
has_enough_volume = fraud_by_country['Total'] >= 100
significant_countries = fraud_by_country[has_enough_volume]

print("Fraud Rates by Country (min 100 transactions):")
print(significant_countries.head(15))

# Horizontal bar chart of the 15 highest-rate countries, with the overall
# fraud rate drawn as a dashed reference line.
top_fraud_countries = significant_countries.head(15)
n_bars = len(top_fraud_countries)
bar_positions = range(n_bars)
colors = plt.cm.Reds(np.linspace(0.3, 0.9, n_bars))
overall_rate = fraud_df_geo['class'].mean()

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(bar_positions, top_fraud_countries['Fraud_Rate'].values, color=colors)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_fraud_countries.index)
ax.invert_yaxis()  # highest rate at the top
ax.axvline(overall_rate, color='blue', linestyle='--', label='Overall Rate')
ax.set_xlabel('Fraud Rate')
ax.set_title('Top 15 Countries by Fraud Rate')
ax.legend()
plt.tight_layout()
plt.show()


## 3. Time-Based Feature Engineering


In [None]:
# Derive the time-based columns from the geo-enriched frame.
fraud_df_fe = create_time_features(fraud_df_geo)

# Columns this cell expects the helper to have added.
time_feature_cols = ['hour_of_day', 'day_of_week', 'is_weekend', 'time_since_signup']

print("Time features created:")
print(fraud_df_fe[time_feature_cols].head())
print("\nNew columns: " + ", ".join(time_feature_cols))


## 4. Transaction Velocity and Device Features


In [None]:
# Step 1: per-user velocity features, then a quick look at their distribution.
velocity_cols = ['user_total_transactions', 'user_transaction_number']
fraud_df_fe = create_transaction_velocity_features(fraud_df_fe)
print("Velocity features created:")
print(fraud_df_fe[velocity_cols].describe())

# Step 2: per-device features, summarised the same way.
device_cols = ['device_total_transactions', 'device_unique_users']
fraud_df_fe = create_device_features(fraud_df_fe)
print("\nDevice features created:")
print(fraud_df_fe[device_cols].describe())


## 5. Encoding and Scaling


In [None]:
# Encode the categorical columns through the project helper.
categorical_cols = ['source', 'browser', 'sex', 'country']
fraud_df_encoded, encoding_info = encode_categorical_features(fraud_df_fe, categorical_cols)

print(f"Original shape: {fraud_df_fe.shape}")
print(f"After encoding: {fraud_df_encoded.shape}")

# Collect every output column whose name contains one of the categorical
# column names (substring match), and show the first ten.
print("\nEncoded columns sample:")
encoded_cols = []
for column_name in fraud_df_encoded.columns:
    if any(category in column_name for category in categorical_cols):
        encoded_cols.append(column_name)
print(encoded_cols[:10])


In [None]:
# Split the encoded frame into a feature matrix and the `class` target.
# The columns listed here are handed to the helper as `drop_cols`.
columns_to_drop = ['user_id', 'signup_time', 'purchase_time', 'device_id', 'ip_address']
X_fraud, y_fraud = prepare_features_for_modeling(
    fraud_df_encoded,
    target_col='class',
    drop_cols=columns_to_drop,
)

print(f"Features shape: {X_fraud.shape}")
print(f"Target shape: {y_fraud.shape}")
print("\nFeature columns:")
print(X_fraud.columns.tolist())


## 6. Handling Class Imbalance (SMOTE)


In [None]:
# Baseline class balance on the full target, before any resampling.
print("Class Distribution BEFORE SMOTE:")
print("=" * 50)
print(y_fraud.value_counts())

fraud_rate_pct = y_fraud.mean() * 100
print(f"\nFraud Rate: {fraud_rate_pct:.2f}%")


In [None]:
# Hold out a stratified 20% test set BEFORE any resampling, so that SMOTE
# (applied afterwards) is fitted on training rows only.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_fraud,
    y_fraud,
    test_size=0.2,
    random_state=42,
    stratify=y_fraud,
)

for split_label, split_features in (("Training set", X_train), ("Test set", X_test)):
    print(f"{split_label}: {split_features.shape}")

print("\nTraining class distribution:")
print(y_train.value_counts())


In [None]:
# Oversample the minority class on the training split only; the test split
# is left untouched.
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Class Distribution AFTER SMOTE:")
print("=" * 50)
print(pd.Series(y_train_resampled).value_counts())
resampled_fraud_pct = y_train_resampled.mean() * 100
print(f"\nFraud Rate: {resampled_fraud_pct:.2f}%")

# Side-by-side class counts: original training labels vs. resampled labels.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

class_names = ['Legitimate', 'Fraud']
class_colors = ['#2ecc71', '#e74c3c']
panels = (
    ('Before SMOTE', y_train),
    ('After SMOTE', y_train_resampled),
)
for panel_ax, (panel_title, panel_labels) in zip(axes, panels):
    class_counts = [
        len(panel_labels[panel_labels == 0]),
        len(panel_labels[panel_labels == 1]),
    ]
    panel_ax.bar(class_names, class_counts, color=class_colors)
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel('Count')

plt.tight_layout()
plt.show()


## 7. Credit Card Data Preprocessing


In [None]:
# Prepare credit card data.
# V1-V28 are already PCA-transformed, so only Amount and Time are standardised.
# StandardScaler comes from the notebook's top import cell.

cc_df_processed = cc_df.copy()

# Fit ONE scaler on both columns in a single call. StandardScaler standardises
# each column independently, so the scaled values match those from fitting the
# columns separately. Previously the same scaler object was re-fitted on Time
# after Amount, which discarded the Amount statistics and left `scaler`
# unusable for transforming Amount on new data.
# NOTE(review): the scaler is fitted on the full dataset here. When this data
# is split for modelling, fit on the training portion only and reuse
# `scaler.transform` on the held-out portion to avoid leakage.
columns_to_scale = ['Amount', 'Time']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(cc_df_processed[columns_to_scale])
cc_df_processed['Amount_scaled'] = scaled_values[:, 0]
cc_df_processed['Time_scaled'] = scaled_values[:, 1]

# Features: everything except the target and the unscaled originals.
X_cc = cc_df_processed.drop(columns=['Class', 'Amount', 'Time'])
y_cc = cc_df_processed['Class']

print(f"Credit Card Features: {X_cc.shape}")
print(f"Columns: {X_cc.columns.tolist()}")


## 8. Save Processed Data


In [None]:
# Persisting the processed frames is opt-in: uncomment the two lines below
# to write them to ../data/processed/.
# fraud_df_encoded.to_csv('../data/processed/fraud_data_processed.csv', index=False)
# cc_df_processed.to_csv('../data/processed/creditcard_processed.csv', index=False)

print("Feature Engineering Complete!")
print("=" * 50)

# Build the summary line by line; the leading and trailing empty entries and
# the two-space separator lines reproduce the original layout exactly.
n_countries = fraud_df_geo['country'].nunique()
n_fraud_features = X_fraud.shape[1]
n_cc_features = X_cc.shape[1]

summary_lines = [
    "",
    "Summary:",
    "- E-commerce Fraud Data:",
    f"  * IP to Country mapping: {n_countries} countries",
    "  * Time features: hour_of_day, day_of_week, time_since_signup",
    "  * Velocity features: user_total_transactions, device_unique_users",
    f"  * Final features: {n_fraud_features}",
    "  ",
    "- Credit Card Data:",
    "  * Amount and Time scaled",
    "  * V1-V28 PCA features preserved",
    f"  * Final features: {n_cc_features}",
    "  ",
    "- SMOTE applied to training data to handle class imbalance",
    "",
]
print("\n".join(summary_lines))
