In [None]:
# ----------------------------------------
# Phase 5: Feature Engineering + Modeling Prep
# ----------------------------------------

import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the fraud dataset. This reads the raw CSV; if the merged/enriched file
# from Phase 3 was saved under a different name, point the path at that instead.
# NOTE(review): the column selection further down expects a 'country' column -
# confirm that the file loaded here actually carries it.
fraud_df = pd.read_csv("data/raw/fraud_data.csv")

# Parse both timestamp columns so datetime arithmetic and the .dt accessor work
for ts_col in ('signup_time', 'purchase_time'):
    fraud_df[ts_col] = pd.to_datetime(fraud_df[ts_col])

# Derive the time-based features in one pass:
#   time_diff     - hours elapsed between signup and purchase
#   purchase_hour - hour of day of the purchase
#   purchase_day  - weekday name of the purchase
fraud_df = fraud_df.assign(
    time_diff=lambda df: (df['purchase_time'] - df['signup_time']).dt.total_seconds().div(3600),
    purchase_hour=lambda df: df['purchase_time'].dt.hour,
    purchase_day=lambda df: df['purchase_time'].dt.day_name(),
)

# ----------------------------------------
# 1️⃣ Select meaningful columns
# ----------------------------------------
# Keep only the raw attributes, the derived time features and the 'class'
# label; .copy() detaches the result so later edits never touch fraud_df.
model_df = fraud_df.loc[:, [
    'purchase_value', 'device_id', 'source', 'browser', 'sex', 'age',
    'ip_address', 'country',
    'time_diff', 'purchase_hour', 'purchase_day',
    'class',
]].copy()

# ----------------------------------------
# 2️⃣ Handle missing values
# ----------------------------------------
# Impute the two columns that have a sensible default - the median age and an
# explicit "Unknown" country - then drop any row that is still missing a value.
model_df = model_df.fillna({
    'age': model_df['age'].median(),
    'country': "Unknown",
}).dropna()

# ----------------------------------------
# 3️⃣ Feature Engineering
# ----------------------------------------
# Every flag below is stored as a 0/1 integer column.

# A) Very fast signup → purchase: less than one hour between the two events
model_df['instant_purchase'] = (model_df['time_diff'] < 1).astype(int)

# B) Night transaction flag: purchase hour is 23, or anywhere in 0-5 inclusive.
#    Computed as a vectorised comparison rather than a per-row Python lambda.
model_df['is_night'] = (
    (model_df['purchase_hour'] >= 23) | (model_df['purchase_hour'] <= 5)
).astype(int)

# C) High purchase value: strictly above the median purchase value
median_purchase = model_df['purchase_value'].median()
model_df['high_value'] = (model_df['purchase_value'] > median_purchase).astype(int)

# D) Risk country flag: the three countries with the most fraud rows (class == 1)
# NOTE(review): this ranking is derived from the target over the whole dataset,
# i.e. before any train/test split, so label information leaks into the
# feature. A placeholder country value imputed for missing data can also end
# up in the list. Consider computing the ranking on training rows only.
risky_countries = model_df.loc[model_df['class'] == 1, 'country'].value_counts().head(3).index
model_df['risk_country'] = model_df['country'].isin(risky_countries).astype(int)

# ----------------------------------------
# 4️⃣ Encode categorical features
# ----------------------------------------

# Columns that get one-hot encoded
cat_cols = ['source', 'browser', 'sex', 'country', 'purchase_day']

# device_id and ip_address are identifier-like columns; both are removed
# rather than encoded (presumably because of their high cardinality).
# drop_first=True keeps k-1 dummy columns for each k-level categorical.
model_df = pd.get_dummies(
    model_df.drop(['device_id', 'ip_address'], axis=1),
    columns=cat_cols,
    drop_first=True,
)

# ----------------------------------------
# 5️⃣ Scale numeric features
# ----------------------------------------
# Standardise the listed numeric columns with StandardScaler; columns not in
# num_cols are left as they are.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later in this script, so test-set statistics influence the scaling.
num_cols = ['purchase_value', 'age', 'time_diff', 'purchase_hour']
scaler = StandardScaler().fit(model_df[num_cols])
model_df[num_cols] = scaler.transform(model_df[num_cols])

# ----------------------------------------
# 6️⃣ Save processed dataset
# ----------------------------------------
from pathlib import Path

# to_csv does not create missing directories, so make sure the output folder
# exists before writing; "data/processed" may not be present yet.
processed_path = Path("data/processed/fraud_data_processed.csv")
processed_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame's row index out of the file
model_df.to_csv(processed_path, index=False)
print("Processed fraud dataset saved to data/processed/fraud_data_processed.csv")

# ----------------------------------------
# 7️⃣ Optional: Quick train/test split for modeling prep
# ----------------------------------------
from sklearn.model_selection import train_test_split

# Separate the 'class' label from the features: work on a copy so model_df
# keeps the column, and pop() hands back the removed label Series.
X = model_df.copy()
y = X.pop('class')

# Hold out 20% of the rows for testing. stratify=y keeps the class ratio the
# same in both partitions; random_state=42 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print("Train/Test split complete.")
print("Train size:", len(X_train), "Test size:", len(X_test))


ModuleNotFoundError: No module named 'sklearn'