# Feature Engineering & Preprocessing

> **Task 1 · Parts B–F**: Geolocation merge, time features, transaction velocity, scaling, and SMOTE.

In [None]:
import sys
sys.path.insert(0, '..')
import warnings; warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from src.eda.eda import load_fraud_data, load_ip_country, fix_dtypes, handle_missing_values, remove_duplicates, analyze_class_distribution
from src.eda.featuring.preprocess import merge_with_geolocation, scale_features, apply_smote
from src.eda.featuring.custom_feature import add_time_since_signup, add_time_features, add_transaction_velocity, build_feature_matrix

# Apply seaborn's 'whitegrid' theme to every figure drawn later in the notebook.
sns.set_theme(style='whitegrid')
# Sanity marker: only reached when every import above resolved.
print("Imports OK")

## 1. Load & Clean

In [None]:
# Read the raw transaction table and the IP-to-country lookup table.
fraud_df = load_fraud_data('../data/Fraud_Data.csv')
ip_df = load_ip_country('../data/IpAddress_to_Country.csv')

# Run the cleaning helpers in order; each receives the frame and hands back
# the cleaned result for the next step.
for cleaning_step in (fix_dtypes, handle_missing_values, remove_duplicates):
    fraud_df = cleaning_step(fraud_df)

print(f"Shape: {fraud_df.shape}")

## 2. Geolocation Integration

Convert IP addresses to integers and merge with country ranges using `pd.merge_asof`.

In [None]:
from src.eda.featuring.preprocess import ip_to_int

# Demo: show the first stored IP value next to its integer conversion.
sample_ip = fraud_df['ip_address'].iloc[0]
print(f"Sample IP (float repr): {sample_ip}")
converted_ip = ip_to_int(sample_ip)
print(f"Converted to int:       {converted_ip}")
print()

# Merge the transactions with the IP lookup; the result is expected to carry
# a 'country' column, which the summary below reads.
fraud_geo = merge_with_geolocation(fraud_df, ip_df)
preview_columns = ['user_id', 'ip_address', 'country', 'class']
country_count = fraud_geo['country'].nunique()
print(f"After geo-merge shape: {fraud_geo.shape}")
print(f"Unique countries: {country_count}")
print(fraud_geo[preview_columns].head())

## 3. Time-Based Features

In [None]:
from pathlib import Path

# Add the time-derived columns read by the plots below:
# 'time_since_signup', 'hour_of_day' and 'day_of_week'.
df = add_time_since_signup(fraud_geo)
df = add_time_features(df)

fig, axes = plt.subplots(1, 3, figsize=(16, 4))

# Panel 1: time_since_signup distribution per class.
# Dividing by 3600 presumably converts seconds to hours (matches the axis
# label) — confirm against add_time_since_signup.
# density=True makes the two very differently sized classes comparable.
for cls, color, label in [(0, 'steelblue', 'Legit'), (1, 'crimson', 'Fraud')]:
    hours_since_signup = df.loc[df['class'] == cls, 'time_since_signup'] / 3600
    axes[0].hist(hours_since_signup, bins=50,
                 alpha=0.6, color=color, label=label, density=True)
axes[0].set_xlabel('Hours since signup')
axes[0].set_title('Time Since Signup by Class')
axes[0].legend()

# Panel 2: fraud rate per hour of day (the mean of the 0/1 class column).
hourly = df.groupby('hour_of_day')['class'].mean()
hourly.plot(kind='bar', ax=axes[1], color='mediumpurple', edgecolor='black')
axes[1].set_title('Fraud Rate by Hour of Day')
axes[1].yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:.1%}'))
axes[1].set_xlabel('Hour')

# Panel 3: fraud rate per day of week.
dow = df.groupby('day_of_week')['class'].mean()
dow.plot(kind='bar', ax=axes[2], color='coral', edgecolor='black')
axes[2].set_title('Fraud Rate by Day of Week')
# Label each bar from the weekday it actually represents rather than by
# position: a fixed 7-label list mislabels bars (or fails on a label/tick
# count mismatch) whenever a weekday is absent from the data.
# Assumes day_of_week is encoded 0=Mon … 6=Sun, as the original fixed
# labels implied — confirm against add_time_features.
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
axes[2].set_xticklabels([day_names[int(day)] for day in dow.index], rotation=0)
axes[2].yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:.1%}'))

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
plot_file = '../models/plots/time_features.png'
Path(plot_file).parent.mkdir(parents=True, exist_ok=True)
plt.savefig(plot_file, dpi=150, bbox_inches='tight')
plt.show()

## 4. Transaction Velocity

In [None]:
from pathlib import Path

# Work on a reproducible random subset. The size is capped at the number of
# available rows because DataFrame.sample raises when n exceeds the frame
# length (sampling without replacement).
# NOTE(review): velocity is computed on this subset only, so per-window counts
# presumably understate the full-data values — confirm against
# add_transaction_velocity.
sample_size = min(5000, len(fraud_geo))
sample = fraud_geo.sample(n=sample_size, random_state=42).copy()
sample = add_time_since_signup(sample)
sample = add_time_features(sample)
sample = add_transaction_velocity(sample, window_hours=24)

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: raw histogram of the 24h transaction count for each class.
for cls, color, label in [(0, 'steelblue', 'Legit'), (1, 'crimson', 'Fraud')]:
    class_velocity = sample.loc[sample['class'] == cls, 'txn_count_24h']
    axes[0].hist(class_velocity, bins=20, alpha=0.7, color=color, label=label)
axes[0].set_xlabel('Transactions in past 24h')
axes[0].set_title('Transaction Velocity by Class')
axes[0].legend()

# Right panel: fraud rate for the ten lowest velocity values
# (groupby sorts its keys, so head(10) keeps the smallest counts).
vel_rate = sample.groupby('txn_count_24h')['class'].mean().head(10)
vel_rate.plot(kind='bar', ax=axes[1], color='teal', edgecolor='black')
axes[1].set_title('Fraud Rate by Transaction Velocity')
axes[1].yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:.1%}'))

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
plot_file = '../models/plots/velocity.png'
Path(plot_file).parent.mkdir(parents=True, exist_ok=True)
plt.savefig(plot_file, dpi=150, bbox_inches='tight')
plt.show()

## 5. Build Feature Matrix

In [None]:
# Build the model inputs (X) and the target (y) from the geo-merged frame.
X, y = build_feature_matrix(fraud_geo)

# Report both shapes, then preview the leading column names.
leading_columns = list(X.columns[:20])
print(f"Feature matrix: {X.shape}")
print(f"Target:         {y.shape}")
print("\nFeature names (first 20):")
print(leading_columns)

## 6. Train/Test Split & Scaling

In [None]:
# Hold out 20% for testing. stratify=y preserves the class ratio in both
# partitions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print(f"Train: {X_train.shape}  |  Test: {X_test.shape}")
print(f"Train fraud rate: {y_train.mean():.3%}")
print(f"Test  fraud rate: {y_test.mean():.3%}")

# Scale both partitions; the returned scaler is kept for later reuse.
X_train_sc, X_test_sc, scaler = scale_features(X_train, X_test)

# Summarise through a plain ndarray so a single scalar is formatted whatever
# container scale_features returns. With a DataFrame, .mean() would yield a
# per-column Series, which the ':.4f' format spec rejects. For ndarray input
# this is identical to calling .mean()/.std() directly.
# NOTE(review): the return type of scale_features is not visible here.
train_scaled = np.asarray(X_train_sc)
print(f"\nTrain mean after scaling:  {train_scaled.mean():.4f}")
print(f"Train std  after scaling:  {train_scaled.std():.4f}")

## 7. Handle Class Imbalance — SMOTE

In [None]:
import collections
from pathlib import Path

# Class balance of the training labels before resampling.
# analyze_class_distribution is called with a frame and a column name, so the
# labels are wrapped in a one-column frame named 'class'.
train_labels = pd.DataFrame({'class': y_train.values})
dist_before = analyze_class_distribution(train_labels, 'class')
print(f"Before SMOTE: {dist_before['counts']}")

# Resample the scaled training partition only; the test split is not touched.
X_res, y_res = apply_smote(X_train_sc, y_train.values)

after = collections.Counter(y_res)
print(f"After SMOTE:  {dict(after)}")

fig, axes = plt.subplots(1, 2, figsize=(10, 4))
panels = [(axes[0], y_train.values, 'Before SMOTE'),
          (axes[1], y_res, 'After SMOTE')]
for ax, labels, title in panels:
    # value_counts() orders by frequency, so with tied counts (the expected
    # state after SMOTE) the bar order is not guaranteed to be class 0 then
    # class 1. Sorting by label keeps the fixed tick labels and colours
    # attached to the right class.
    class_counts = pd.Series(labels).value_counts().sort_index()
    class_counts.plot(kind='bar', ax=ax, color=['steelblue', 'crimson'],
                      edgecolor='black', title=title)
    ax.set_xticklabels(['Legitimate', 'Fraud'], rotation=0)
    ax.set_ylabel('Count')
plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
plot_file = '../models/plots/smote_comparison.png'
Path(plot_file).parent.mkdir(parents=True, exist_ok=True)
plt.savefig(plot_file, dpi=150, bbox_inches='tight')
plt.show()

# NOTE(review): nothing in this cell writes to data/processed/ — confirm the
# message below still reflects where the processed data is produced.
print("\n✅ Preprocessing complete. Processed data already saved in data/processed/")