# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Read back the dataset that Part A wrote to disk.
df = pd.read_csv('../data/processed/fraud_data_with_country.csv')

# A CSV round trip stores timestamps as plain text, so both datetime
# columns are rebuilt before any time arithmetic is attempted.
df = df.assign(
    signup_time=pd.to_datetime(df['signup_time']),
    purchase_time=pd.to_datetime(df['purchase_time']),
)

# --- 1. Feature Engineering ---

# A. Time-based features
# Seconds elapsed between account signup and the purchase.
# Rationale: automated fraud accounts tend to buy right after signing up.
df['time_since_signup'] = (
    df['purchase_time'].sub(df['signup_time']).dt.total_seconds()
)

# Calendar position of the purchase: hour of day, and weekday number
# (pandas numbers Monday as 0 through Sunday as 6).
df['hour_of_day'] = df['purchase_time'].dt.hour
df['day_of_week'] = df['purchase_time'].dt.weekday

# B. Frequency Features
# How many times has this device been used?
# Logic: A device used 100 times is likely a bot farm.
# transform('count') computes the per-group count of non-null user_id values
# and broadcasts it back onto every row, so no intermediate lookup frame or
# full-frame merge is needed to attach the result.
df['device_freq'] = df.groupby('device_id')['user_id'].transform('count')

# How many times has this IP been used? (same per-row count, keyed by IP)
df['ip_freq'] = df.groupby('ip_address')['user_id'].transform('count')

# --- 2. Encoding Categorical Variables ---

# 'source', 'browser', 'sex' and 'country' are expanded into indicator
# columns via One-Hot Encoding (pd.get_dummies).

# Keep only the columns used downstream: the engineered/numeric inputs,
# the categorical inputs, and 'class' (presumably the fraud label - confirm).
numeric_cols = ['time_since_signup', 'hour_of_day', 'day_of_week',
                'device_freq', 'ip_freq', 'purchase_value', 'age']
categorical_cols = ['source', 'browser', 'sex', 'country']
features = df[numeric_cols + categorical_cols + ['class']]

# drop_first=True omits the first level of each variable; that level is
# implied when all remaining indicators are off, so nothing is lost.
df_encoded = pd.get_dummies(
    features,
    columns=categorical_cols,
    drop_first=True,
)

# --- 3. Scaling / Normalization ---

# Standardize the continuous and count-based columns with StandardScaler.
# 'hour_of_day' and 'day_of_week' are not in the list and stay unscaled.
# NOTE(review): the scaler is fitted on the full dataset here; if a
# train/test split happens later, confirm that this is intended.
cols_to_scale = ['time_since_signup', 'purchase_value', 'age', 'device_freq', 'ip_freq']

scaler = StandardScaler()
scaler.fit(df_encoded[cols_to_scale])
df_encoded[cols_to_scale] = scaler.transform(df_encoded[cols_to_scale])

# --- 4. Final Save ---
# Persist the fully processed ("Math-Ready") table; the DataFrame index is
# not written to the file.
df_encoded.to_csv('../data/processed/fraud_data_final_processed.csv', index=False)

# Report completion together with the (rows, columns) shape of the result.
print(
    "Fraud Data Feature Engineering Complete.",
    f"Final Shape: {df_encoded.shape}",
    sep="\n",
)