# In [None]:
# Phase 3: Adding new features to improve precision and recall from Phase 4

import pandas as pd
import numpy as np
from geopy.distance import geodesic

# Load the cleaned dataset from the outputs directory.
df = pd.read_pickle("../outputs/clean_data.pkl")

# Coerce 'date' to datetime so the .dt accessor and time-based rolling windows work.
df = df.assign(date=pd.to_datetime(df['date']))

# Amount to credit ratio
# The +1 in the denominator keeps a credit limit of 0 from dividing by zero.
credit_limit_plus_one = df['credit_limit'].add(1)
df['amount_to_credit_ratio'] = df['amount'].div(credit_limit_plus_one)

# Frequent Spending (tx count in last hour)
# Sort each card's transactions chronologically (time-based rolling needs a
# monotonic 'date' within every group) and reset to a unique positional index
# so the rolling result can be aligned back to its source row.
df = df.sort_values(['card_id', 'date']).reset_index(drop=True)
df['dummy'] = 1
rolling_tx = (
    df.groupby('card_id')
      .rolling('1h', on='date')['dummy']
      .count()
)
# The result is indexed by (card_id, original row label). Dropping the card_id
# level leaves the row label, so the assignment aligns one-to-one by index.
# Merging on ['card_id', 'date'] instead would multiply rows whenever a card
# has several transactions with the same timestamp, because that key is not
# unique.
df['tx_count_last_hour'] = rolling_tx.reset_index(level=0, drop=True)
df.drop(columns='dummy', inplace=True)

# Ratio Z-score: how far each transaction's amount-to-credit ratio sits from
# the card's own mean, in units of the card's standard deviation.
ratio_by_card = df.groupby('card_id')['amount_to_credit_ratio']
ratio_stats = ratio_by_card.agg(['mean', 'std']).reset_index()
# NOTE(review): the merge leaves generic 'mean' and 'std' columns on df.
# `suffixes` only renames overlapping column names, so '_user' is applied only
# if df already has a 'mean' or 'std' column.
df = df.merge(ratio_stats, how='left', on='card_id', suffixes=('', '_user'))
ratio_centered = df['amount_to_credit_ratio'].sub(df['mean'])
# The epsilon guards against a zero standard deviation. A NaN std (a card with
# a single transaction) still yields NaN.
df['ratio_zscore'] = ratio_centered.div(df['std'].add(1e-6))

# Transaction hours
df['transaction_hour'] = df['date'].dt.hour

# Z-score of the amount relative to each user's own spending distribution
amount_by_user = df.groupby('user_id')['amount']
user_mean = amount_by_user.transform('mean')
# A zero std becomes NaN so the division yields NaN instead of +/-inf.
user_std = amount_by_user.transform('std').replace(0, np.nan)
df['avg_amount_user'] = user_mean
raw_user_z = (df['amount'] - user_mean) / user_std
# Any non-finite z-score (NaN or inf) is reported as 0.
df['amount_zscore_user'] = np.where(np.isfinite(raw_user_z), raw_user_z, 0)

# Chip Usage Rate: per-card mean of 'use_chip', and each transaction's
# absolute distance from that mean.
card_chip_rate = df.groupby('card_id')['use_chip'].transform('mean')
df['chip_usage_rate'] = card_chip_rate
df['chip_deviation'] = (df['use_chip'] - card_chip_rate).abs()

# Mean and std of the amount for each MCC, named directly via named aggregation
mcc_stats = (
    df.groupby('mcc_code')['amount']
      .agg(mcc_amount_mean='mean', mcc_amount_std='std')
      .reset_index()
)
df = df.merge(mcc_stats, how='left', on='mcc_code')
mcc_centered = df['amount'].sub(df['mcc_amount_mean'])
# The epsilon guards against a zero standard deviation within an MCC.
df['amount_zscore_mcc'] = mcc_centered.div(df['mcc_amount_std'].add(1e-6))

# Amount relative to the average amount for the same hour of day
hour_avg = df['amount'].groupby(df['transaction_hour']).mean()
df['hour_avg_amount'] = df['transaction_hour'].map(hour_avg)
# The epsilon keeps a zero hourly average from dividing by zero.
hourly_baseline = df['hour_avg_amount'].add(1e-6)
df['amount_vs_hour_avg'] = df['amount'].div(hourly_baseline)

# Is low credit score: 1 when credit_score is strictly below 480, else 0
below_threshold = df['credit_score'].lt(480)
df['is_low_credit_score'] = below_threshold.astype(int)

# Location Flag
# Order each card's transactions chronologically so shift(1) yields the
# coordinates of the immediately preceding transaction on the same card.
df = df.sort_values(['card_id', 'date'])
prev_coords = df.groupby('card_id')[['latitude', 'longitude']].shift(1)
df['prev_lat'] = prev_coords['latitude']
df['prev_lon'] = prev_coords['longitude']


def _geo_jump_km(lat, lon, prev_lat, prev_lon):
    """Return the geodesic distance in km between two points.

    Returns 0.0 when any of the four coordinates is missing. This covers the
    first transaction of a card and rows without a usable location, so no NaN
    coordinate is ever passed to `geodesic`.
    """
    if pd.isna(lat) or pd.isna(lon) or pd.isna(prev_lat) or pd.isna(prev_lon):
        return 0.0
    return geodesic((lat, lon), (prev_lat, prev_lon)).km


# Iterate the four columns in lockstep rather than df.apply(axis=1), which
# builds a full row Series per transaction and misbehaves on an empty frame.
df['geo_jump_km'] = [
    _geo_jump_km(lat, lon, p_lat, p_lon)
    for lat, lon, p_lat, p_lon in zip(
        df['latitude'], df['longitude'], df['prev_lat'], df['prev_lon']
    )
]
# Flag moves of more than 100 km between consecutive transactions.
df['geo_jump_flag'] = (df['geo_jump_km'] > 100).astype(int)

# Night transaction: 1 for hours 0 through 5 inclusive, else 0
txn_hour = df['transaction_hour']
df['is_night_transaction'] = ((txn_hour >= 0) & (txn_hour <= 5)).astype(int)


# Save the enriched dataset
df.to_pickle("../outputs/refined_data.pkl")