# Importing 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    precision_recall_curve,
    recall_score,
    f1_score,
    precision_score,
    confusion_matrix,
    classification_report,
)
import shap
import re
import optuna
from scipy.stats import entropy
from collections import deque
import datetime as dt
from sklearn.model_selection import StratifiedGroupKFold

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, RepeatVector, TimeDistributed, Masking

import plotly.graph_objects as go
import seaborn as sns

# Data Exploration

In [None]:
# Load the CSV files
transactions = pd.read_csv(
    "/Users/kabdulasset/Desktop/Hackaton/data/transactions.csv",
    sep=";",
    quotechar="'",
    encoding="cp1251",
    skiprows=1,
    dtype={"cst_dim_id": str}   
)
behavior = pd.read_csv(
    "/Users/kabdulasset/Desktop/Hackaton/data/behavior.csv",
    sep=";",            
    encoding="cp1251",   
    skiprows=1,
    dtype={"cst_dim_id": str}   
)

In [None]:
# Strip quote characters
def remove_quotes(x):
    """Strip single quotes and surrounding whitespace from a string value.

    Anything that is not a ``str`` (NaN, numbers, timestamps, ...) is
    returned unchanged.
    """
    if not isinstance(x, str):
        return x
    unquoted = x.replace("'", "")
    return unquoted.strip()

behavior["transdate"] = behavior["transdate"].apply(remove_quotes)
transactions["transdate"] = transactions["transdate"].apply(remove_quotes)
transactions["transdatetime"] = transactions["transdatetime"].apply(remove_quotes)

# Convert the string columns to datetime
behavior["transdate"] = pd.to_datetime(behavior["transdate"])
transactions["transdate"] = pd.to_datetime(transactions["transdate"])
transactions["transdatetime"] = pd.to_datetime(transactions["transdatetime"])

print("\nПропуски в behavioral")
print(behavior.isna().sum())

print("\nПропуски в transactions")
print(transactions.isna().sum())

print("\nРазмер таблиц:")
print("Поведенческие:", behavior.shape)
print("Транзакции:", transactions.shape)

## Behavior Data

In [None]:
behavior

In [None]:
behavior_rows_with_nan = behavior[behavior.isnull().any(axis=1)]
behavior_rows_with_nan

In [None]:
# Columns in which NaN should be replaced with 0
columns_to_fill = ["freq_change_7d_vs_mean", "logins_7d_over_30d_ratio"]
behavior[columns_to_fill] = behavior[columns_to_fill].fillna(0)

In [None]:
behavior['cst_dim_id'].nunique()

## Transaction Data

In [None]:
transactions

In [None]:
# Unique clients in transactions
transactions['cst_dim_id'].nunique()

In [None]:
# Fraudulent transactions
fraud_tr = transactions[transactions['target'] == 1]
fraud_tr

In [None]:
# Unique clients with fraudulent transactions
transactions[transactions['target'] == 1]['cst_dim_id'].nunique()

In [None]:
frauds_per_client = (
    transactions[transactions['target'] == 1]
    .groupby('cst_dim_id')
    .size()
    .reset_index(name='fraud_count')
)
frauds_per_client

In [None]:
frauds_per_client[frauds_per_client['fraud_count']>1]

## Merged

In [None]:
merged = pd.merge(
    transactions,
    behavior,
    how="left",
    on=["transdate", "cst_dim_id"],
    indicator=True
)
merged

In [None]:
merged["_merge"].value_counts()

In [None]:
merged.columns

In [None]:
merged = merged[merged['_merge']=='both']

merged = merged.drop(columns = '_merge')

## Keep only the first fraud transaction and drop the rest

In [None]:
merged = merged.sort_values(["cst_dim_id", "transdatetime"]).reset_index(drop=True)

In [None]:
merged

In [None]:
first_fraud_idx = (
    merged[merged["target"] == 1]
    .groupby("cst_dim_id")["transdatetime"]
    .idxmin()
)

In [None]:
cleaned_data = merged[
    (merged["target"] == 0) |
    (merged.index.isin(first_fraud_idx))
]

In [None]:
print("Before:", len(merged))
print("After:", len(cleaned_data))
print("Dropped:", len(merged) - len(cleaned_data))


# Data Cleaning

In [None]:
cleaned_data = cleaned_data[cleaned_data['amount']>0]

In [None]:
# Drop rows without a client id. The explicit .copy() detaches the result
# from the frame it was filtered from, so the column assignments performed on
# cleaned_data later in the notebook write to an independent frame instead of
# triggering pandas' SettingWithCopyWarning.
cleaned_data = cleaned_data[cleaned_data['cst_dim_id'].notna()].copy()

In [None]:
cleaned_data.isna().sum()


In [None]:
# 1) Categorical columns → "Unknown"
cleaned_data["last_phone_model_categorical"] = (
    cleaned_data["last_phone_model_categorical"].fillna("Unknown")
)

cleaned_data["last_os_categorical"] = (
    cleaned_data["last_os_categorical"].fillna("Unknown")
)

In [None]:
cleaned_data

## Calculating values for Feb 11

In [None]:
cleaned_data["login_frequency_30d"] = pd.to_numeric(
    cleaned_data["login_frequency_30d"],
    errors="coerce"     # make non-numeric values become NaN
)

# Replace NaN values with computed value
cleaned_data["login_frequency_30d"] = cleaned_data["login_frequency_30d"].fillna(
    cleaned_data["logins_last_30_days"] / 30
)

In [None]:
cleaned_data["freq_change_7d_vs_mean"] = pd.to_numeric(
    cleaned_data["freq_change_7d_vs_mean"],
    errors="coerce"     # make non-numeric values become NaN
)

# Relative change of the 7-day login frequency versus the 30-day one.
# A zero 30-day frequency is mapped to NaN first so the division can never
# produce +/-inf; for those rows the computed value is simply missing.
baseline_30d = cleaned_data["login_frequency_30d"].replace(0, np.nan)
computed_change = (cleaned_data["login_frequency_7d"] - baseline_30d) / baseline_30d

# Replace NaN values with computed value
cleaned_data["freq_change_7d_vs_mean"] = cleaned_data["freq_change_7d_vs_mean"].fillna(computed_change)

In [None]:
# Normalise decimal commas (e.g. 1,15E+11 -> 1.15E+11) on a string view of the
# column, then parse it as a number; whatever still fails to parse becomes NaN.
interval_as_text = cleaned_data['var_login_interval_30d'].astype(str)
interval_as_text = interval_as_text.str.replace(',', '.', regex=False)
cleaned_data['var_login_interval_30d'] = pd.to_numeric(interval_as_text, errors='coerce')

In [None]:
cleaned_data.isna().sum()

## OS Cleaning and Grouping

In [None]:
cleaned_data["last_os_categorical"] = (
    cleaned_data["last_os_categorical"].fillna("Unknown")
)

def clean_os_category(x):
    """Keep OS strings that start with "Android/" or "iOS/"; map the rest to "Unknown".

    Non-string inputs are also mapped to "Unknown".
    """
    recognised = isinstance(x, str) and x.startswith(("Android/", "iOS/"))
    return x if recognised else "Unknown"

cleaned_data["last_os_categorical"] = cleaned_data["last_os_categorical"].apply(clean_os_category)


def group_os(os_string):
    """Collapse a "<platform>/<version>" string to "<platform>_<major>".

    Examples: "iOS/18.6.1" -> "iOS_18", "Android/14" -> "Android_14".

    Returns "Unknown" when the value is missing, is the literal "unknown"
    (case-insensitive), has no "/" separator, or when the part of the version
    before the first "." is not an integer.
    """
    if pd.isna(os_string):
        return "Unknown"

    # Normalize input
    s = str(os_string).strip()

    # Handle Unknown
    if s.lower() == "unknown":
        return "Unknown"

    # Split into platform + version
    parts = s.split('/')
    if len(parts) < 2:
        return "Unknown"

    platform = parts[0]     # "iOS" or "Android"
    version_raw = parts[1]  # e.g. "18.6.1" or "14"

    # Extract major version
    major = version_raw.split('.')[0]

    # int() on a str can only fail with ValueError; catching just that keeps
    # KeyboardInterrupt/SystemExit and genuine programming errors visible.
    try:
        major_int = int(major)
    except ValueError:
        return "Unknown"

    return f"{platform}_{major_int}"


cleaned_data["last_os_categorical"] = cleaned_data["last_os_categorical"].apply(group_os)

## Phone Models Cleaning and Grouping

In [None]:
# Categorical columns → "Unknown"
cleaned_data["last_phone_model_categorical"] = (
    cleaned_data["last_phone_model_categorical"].fillna("Unknown")
)

def normalize_device_string(s):
    """Return a lower-case, underscore-separated form of a raw device string.

    Missing values are passed through untouched. Otherwise the value is
    stripped, a leading "implyForteApp <version>" prefix is removed, and
    commas, whitespace and hyphens all end up as single underscores
    (e.g. "iPhone14,2" -> "iphone14_2").
    """
    if pd.isna(s):
        return s

    # Drop the known junk prefix such as "implyForteApp 1.0 ".
    text = re.sub(
        r'^implyForteApp\s*\d+(\.\d+)?\s*',
        '',
        str(s).strip(),
        flags=re.IGNORECASE,
    )

    # Lower-case, then turn the comma of iPhone identifiers into "_".
    text = text.lower().replace(",", "_")

    # First map whitespace/hyphen runs to "_", then squeeze repeated "_".
    for pattern in (r'[\s\-]+', r'_+'):
        text = re.sub(pattern, '_', text)

    return text.strip('_')


# Apply to your columns
cleaned_data["last_phone_model_categorical"] = cleaned_data["last_phone_model_categorical"].apply(normalize_device_string)

mask = cleaned_data["last_phone_model_categorical"] == "x86_64"

cleaned_data.loc[mask, "last_phone_model_categorical"] = "Other"

cleaned_data.loc[mask, "last_os_categorical"] = "Other"


def get_brand(model):
    """Map a normalised device model string to a brand label.

    Rules are evaluated top to bottom and the first hit wins; a rule matches
    when the lower-cased model starts with one of its prefixes or contains one
    of its fragments. The literal placeholders "unknown", "none" and "null"
    map to "Unknown"; everything else (emulator/ABI strings included) maps to
    "Other".
    """
    m = model.lower()

    # (prefixes, fragments, brand) - order matters, e.g. a model containing
    # "pixel" is classified as Google before the Meizu prefix is checked.
    rules = (
        (("iphone",), (), "Apple"),
        (("samsung",), (), "Samsung"),
        (("xiaomi", "redmi", "poco"), (), "Xiaomi"),
        (("oppo",), (), "Oppo"),
        (("vivo",), (), "Vivo"),
        (("huawei", "honor"), (), "Huawei_Honor"),
        (("realme",), (), "Realme"),
        (("tecno",), (), "Tecno"),
        (("google",), ("pixel",), "Google"),
        (("motorola",), ("moto",), "Motorola"),
        (("meizu",), (), "Meizu"),
    )
    for prefixes, fragments, brand in rules:
        if m.startswith(prefixes) or any(fragment in m for fragment in fragments):
            return brand

    if m in ("unknown", "none", "null"):
        return "Unknown"

    return "Other"


def get_iphone_family(model):
    """Return "iPhone_<n>" from the digits that follow a leading "iphone".

    Example: "iphone14_5" -> "iPhone_14". Falls back to "iPhone_Other" when
    the lower-cased model does not start with "iphone" followed by digits.
    """
    found = re.match(r"iphone(\d+)", model.lower())
    if found is None:
        return "iPhone_Other"
    generation = found.group(1)
    return f"iPhone_{generation}"

def get_samsung_family(model):
    """Bucket a Samsung model by the first series marker it contains.

    Markers are tested in the listed order against the lower-cased model;
    models containing none of them fall into "Samsung_Other".
    """
    lowered = model.lower()
    series_markers = (
        ("sm_s", "Samsung_S"),
        ("sm_a", "Samsung_A"),
        ("sm_j", "Samsung_J"),
        ("sm_f7", "Samsung_Z_Flip"),
        ("sm_f9", "Samsung_Z_Fold"),
        ("sm_g", "Samsung_G"),
    )
    for marker, family in series_markers:
        if marker in lowered:
            return family
    return "Samsung_Other"


def get_xiaomi_family(model):
    """Bucket a Xiaomi-brand model into Redmi, Poco, Mi or Other.

    "redmi" and "poco" substrings are checked first (in that order); after
    that, an "m" followed by at least two digits (e.g. m2101k7ag) is treated
    as a Mi device.
    """
    lowered = model.lower()

    for keyword, family in (("redmi", "Xiaomi_Redmi"), ("poco", "Xiaomi_Poco")):
        if keyword in lowered:
            return family

    # Mi series model codes: m2101k7ag, m2006c3lg, etc
    mi_patterns = (r"m2\d{2,}", r"m\d{2,}")
    if any(re.search(pattern, lowered) for pattern in mi_patterns):
        return "Xiaomi_Mi"

    return "Xiaomi_Other"


def get_oppo_family(model):
    """Return "Oppo_CPH" when the model contains "cph" (any case), else "Oppo_Other"."""
    return "Oppo_CPH" if "cph" in model.lower() else "Oppo_Other"

def get_vivo_family(model):
    """Return "Vivo_V" for models with a v<3-4 digits> code or the word "vivo".

    Any other model string yields "Vivo_Other".
    """
    lowered = model.lower()
    has_v_code = re.search(r"v\d{3,4}", lowered) is not None
    if has_v_code or "vivo" in lowered:
        return "Vivo_V"
    return "Vivo_Other"

def get_huawei_family(model):
    """Return the single shared bucket used for Huawei and Honor devices.

    ``model`` is not inspected; the function always returns "Huawei_Honor".
    """
    return "Huawei_Honor"


def group_phone_model(model):
    """Reduce a normalised phone model string to a coarse family label.

    Missing values become "Unknown". Otherwise the brand is resolved with
    ``get_brand`` and, for brands that have a family helper, the label comes
    from that helper; the remaining known brands are returned as-is and
    anything unexpected collapses to "Other".
    """
    if pd.isna(model):
        return "Unknown"

    cleaned = model.lower().strip()
    brand = get_brand(cleaned)

    # Brands that are split further into device families.
    family_resolvers = {
        "Apple": get_iphone_family,
        "Samsung": get_samsung_family,
        "Xiaomi": get_xiaomi_family,
        "Oppo": get_oppo_family,
        "Vivo": get_vivo_family,
        "Huawei_Honor": get_huawei_family,
    }
    resolver = family_resolvers.get(brand)
    if resolver is not None:
        return resolver(cleaned)

    # Brands (and placeholders) that are kept as a single bucket.
    passthrough = ("Realme", "Tecno", "Motorola", "Meizu", "Google", "Unknown", "Other")
    return brand if brand in passthrough else "Other"

In [None]:
cleaned_data["last_phone_model_categorical"].value_counts()

In [None]:
cleaned_data["last_phone_model_categorical"] = cleaned_data["last_phone_model_categorical"].apply(group_phone_model)

In [None]:
cleaned_data

# Feature Engineering

In [None]:
cleaned_data.columns

## Time Features


In [None]:
cleaned_data['transdatetime'] = pd.to_datetime(cleaned_data['transdatetime'])

In [None]:
cleaned_data["hour"] = cleaned_data["transdatetime"].dt.hour
cleaned_data["day_of_week"] = cleaned_data["transdatetime"].dt.weekday
cleaned_data["is_weekend"] = cleaned_data["day_of_week"].isin([5, 6]).astype(int)

## Direction / Recipient Features

In [None]:
#    1 = this direction did NOT exist before for this customer
#    0 = this direction already existed in the customer's past
cleaned_data["is_new_direction"] = (
    cleaned_data
    .groupby("cst_dim_id")["direction"]
    .transform(lambda s: (~s.duplicated()).astype(int))
)

In [None]:
# Count how often the client used each direction BEFORE the current transaction
cleaned_data = cleaned_data.sort_values(["cst_dim_id", "transdatetime"]).copy()

cleaned_data["direction_count_before"] = (
    cleaned_data
        .groupby(["cst_dim_id", "direction"])
        .cumcount()
)

In [None]:
# "frequent direction" (>= 2 previous transactions)
cleaned_data["is_frequent_direction"] = (cleaned_data["direction_count_before"] >= 2).astype(int)

In [None]:
cleaned_data["had_previous_transactions"] = (cleaned_data["direction_count_before"] >= 1).astype(int)

In [None]:
# How many different recipients the client has used
# High = unusual, possibly fraud
cleaned_data = cleaned_data.sort_values(["cst_dim_id", "transdatetime"]).copy()

def compute_unique_dirs(group):
    """Add ``num_unique_directions``: distinct directions seen BEFORE each row.

    Rows are processed in the order they appear in ``group`` (the caller is
    expected to have sorted them chronologically), so the first row always
    gets 0.

    Args:
        group: one client's transactions; must contain a "direction" column.

    Returns:
        A copy of ``group`` with the extra ``num_unique_directions`` column.
        The input frame is left untouched, which keeps the function safe to
        use inside ``groupby.apply``.
    """
    seen = set()
    unique_counts = []

    # Iterate the column directly; building a Series per row via iterrows is
    # unnecessary when only one column is read.
    for direction in group["direction"]:
        unique_counts.append(len(seen))  # BEFORE this tx
        seen.add(direction)              # update AFTER

    result = group.copy()
    result["num_unique_directions"] = unique_counts
    return result

result = (
    cleaned_data
    .groupby("cst_dim_id")
    .apply(compute_unique_dirs)
    .reset_index(level=0, drop=True)
)

cleaned_data["num_unique_directions"] = result["num_unique_directions"]

## Client-Level Historical Features

In [None]:
cleaned_data = cleaned_data.sort_values(["cst_dim_id", "transdatetime"]).copy()

# Shifted amount (so each row sees only previous transactions) ----
shifted_amount = cleaned_data.groupby("cst_dim_id")["amount"].shift(1)

# Expanding mean/std computed ONLY on shifted values ----
expanding_stats = (
    shifted_amount
        .groupby(cleaned_data["cst_dim_id"])
        .expanding()
        .agg(["mean", "std"])
)

# Remove hierarchical index
expanding_stats = expanding_stats.reset_index(level=0, drop=True)

cleaned_data["amount_mean_before"] = expanding_stats["mean"].fillna(0)
cleaned_data["amount_std_before"]  = expanding_stats["std"].fillna(0)
cleaned_data["amount_var_before"]  = cleaned_data["amount_std_before"] ** 2

# Z-score ----
cleaned_data["amount_zscore"] = (
    (cleaned_data["amount"] - cleaned_data["amount_mean_before"]) /
    (cleaned_data["amount_std_before"] + 1e-9)
)

cleaned_data["amount_zscore"] = cleaned_data["amount_zscore"].fillna(0)

# Same amount sent before ----
cleaned_data["same_amount_sent_before"] = (
    cleaned_data.groupby(["cst_dim_id", "amount"])
    .cumcount()
    .gt(0)
    .astype(int)
)

In [None]:
cleaned_data['time_since_last_tx'] = (
    cleaned_data.groupby('cst_dim_id')['transdatetime']
    .diff()
    .dt.total_seconds()
    .fillna(0)  # 0 for first transaction
)
# Optional: Log-transform for skewed distribution
cleaned_data['log_time_since_last_tx'] = np.log1p(cleaned_data['time_since_last_tx'])

In [None]:
# Running max / min of each client's PREVIOUS amounts.
# Both the running statistic and the shift are done per client. Shifting the
# concatenated per-client result with a plain Series.shift(1) would hand the
# previous client's last value to the next client's first transaction.
# Relies on cleaned_data being ordered by client and transaction time.
client_key = cleaned_data['cst_dim_id']
running_max = cleaned_data.groupby('cst_dim_id')['amount'].cummax()
running_min = cleaned_data.groupby('cst_dim_id')['amount'].cummin()

cleaned_data['amount_max_before'] = (
    running_max
    .groupby(client_key)
    .shift(1)
    .fillna(0)  # no history yet for a client's first tx
)
cleaned_data['amount_min_before'] = (
    running_min
    .groupby(client_key)
    .shift(1)
    .fillna(cleaned_data['amount'])  # Use current for first tx
)
# The small epsilon only guards against division by zero.
cleaned_data['amount_ratio_to_max'] = cleaned_data['amount'] / (cleaned_data['amount_max_before'] + 1e-9)
cleaned_data['amount_ratio_to_min'] = cleaned_data['amount'] / (cleaned_data['amount_min_before'] + 1e-9)

In [None]:
# Ratio of recent logins to transaction amount (high logins + low amount might be testing)
cleaned_data['logins_to_amount_ratio'] = cleaned_data['logins_last_7_days'] / (cleaned_data['amount'] + 1e-9)

# Z-score of login frequency relative to amount z-score
cleaned_data['login_amount_interaction_z'] = cleaned_data['zscore_avg_login_interval_7d'] * cleaned_data['amount_zscore']

In [None]:
# Cumulative sum of amounts per customer BEFORE current tx.
# The cumulative sum and the shift are both done per client; a plain
# Series.shift(1) over the concatenated result would leak the previous
# client's total into the next client's first transaction.
# Relies on cleaned_data being ordered by client and transaction time.
cleaned_data['cum_amount_before'] = (
    cleaned_data.groupby('cst_dim_id')['amount']
    .cumsum()
    .groupby(cleaned_data['cst_dim_id'])
    .shift(1)
    .fillna(0)  # nothing spent before a client's first tx
)

# Velocity: Cumulative amount / days since first tx
first_tx_date = cleaned_data.groupby('cst_dim_id')['transdatetime'].min().reset_index(name='first_tx')
cleaned_data = cleaned_data.merge(first_tx_date, on='cst_dim_id')
cleaned_data['days_since_first'] = (cleaned_data['transdatetime'] - cleaned_data['first_tx']).dt.days + 1  # Avoid div by 0
cleaned_data['amount_velocity'] = cleaned_data['cum_amount_before'] / cleaned_data['days_since_first']
cleaned_data = cleaned_data.drop(columns=['first_tx'])  # Cleanup

In [None]:
def direction_entropy(group):
    """Add ``direction_entropy_before``: entropy of directions used so far.

    For every row the Shannon entropy (natural log, via ``scipy.stats.entropy``)
    of the direction frequencies over all EARLIER rows of ``group`` is stored;
    the current row is not included. The first row, and any row with no
    earlier non-missing direction, gets 0.

    Args:
        group: one client's transactions in chronological order; must contain
            a "direction" column.

    Returns:
        A copy of ``group`` with the extra column; the input is not modified.
    """
    counts = {}          # direction -> occurrences among earlier rows
    entropy_before = []

    for direction in group['direction']:
        # Score the history first, then add the current row to it.
        if counts:
            entropy_before.append(float(entropy(list(counts.values()))))
        else:
            entropy_before.append(0.0)

        # Missing directions do not contribute to the distribution.
        if pd.notna(direction):
            counts[direction] = counts.get(direction, 0) + 1

    result = group.copy()
    result['direction_entropy_before'] = entropy_before
    return result

cleaned_data = cleaned_data.groupby('cst_dim_id').apply(direction_entropy).reset_index(drop=True)

In [None]:
# Count, for every transaction, how many of the same client's earlier
# transactions fall inside the preceding 24 hours and the preceding 7 days.
# Must be sorted: the per-client loop below walks rows in this order and
# treats everything already visited as "earlier".
cleaned_data = cleaned_data.sort_values(["cst_dim_id", "transdatetime"]).copy()

# Preallocate columns
cleaned_data["tx_count_last_24h"] = 0
cleaned_data["tx_count_last_7d"] = 0

# Sliding-window per user
# NOTE(review): .at[...] lookups by label assume the index labels are unique.
for cust_id, group_idx in cleaned_data.groupby("cst_dim_id").groups.items():
    idx_list = list(group_idx)

    # Each deque holds index labels of earlier transactions still inside the
    # window, oldest on the left.
    window_24h = deque()
    window_7d = deque()

    for pos, idx in enumerate(idx_list):
        t = cleaned_data.at[idx, "transdatetime"]

        # Remove outdated from 24h window
        # (strictly older than 1 day; a tx exactly 1 day old is kept)
        while window_24h and t - cleaned_data.at[window_24h[0], "transdatetime"] > pd.Timedelta("1D"):
            window_24h.popleft()

        # Remove outdated from 7d window
        # (strictly older than 7 days; a tx exactly 7 days old is kept)
        while window_7d and t - cleaned_data.at[window_7d[0], "transdatetime"] > pd.Timedelta("7D"):
            window_7d.popleft()

        # Count BEFORE current transaction
        cleaned_data.at[idx, "tx_count_last_24h"] = len(window_24h)
        cleaned_data.at[idx, "tx_count_last_7d"]  = len(window_7d)

        # Add current to windows AFTER counting
        window_24h.append(idx)
        window_7d.append(idx)


# Clean / Finalize Dataset

In [None]:
clients = cleaned_data["cst_dim_id"].unique()
train_clients, test_clients = train_test_split(
    clients, test_size=0.2, random_state=40
)

train = cleaned_data[cleaned_data.cst_dim_id.isin(train_clients)]
test  = cleaned_data[cleaned_data.cst_dim_id.isin(test_clients)]

In [None]:
train[train['target']==1]

In [None]:
train['cst_dim_id'].nunique()

In [None]:
y_train = train["target"]
y_test = test["target"]

# LSTM

In [None]:
# -----------------------------
# CONFIG
# -----------------------------
RANDOM_STATE = 40
TIME_STEPS   = 10  # look-back window size
LATENT_DIM   = 16
LSTM_UNITS   = 32
EPOCHS       = 20
BATCH_SIZE   = 64

# ======================================================================
# 1. Use only outer TRAIN data to train the autoencoder
# ======================================================================
ready_data = (
    train
    .reset_index(drop=True)
    .sort_values(["cst_dim_id", "transdatetime"])
)

# ======================================================================
# 2. Identify client groups in TRAIN
# ======================================================================
all_clients   = ready_data["cst_dim_id"].unique()
fraud_clients = ready_data.loc[ready_data['target'] == 1, 'cst_dim_id'].unique()

# Clients that never had fraud in TRAIN
normal_only_clients = np.setdiff1d(all_clients, fraud_clients)

print(f"Total clients in TRAIN: {len(all_clients)}")
print(f"Clients with fraud in TRAIN: {len(fraud_clients)}")
print(f"Normal-only clients in TRAIN: {len(normal_only_clients)}")

# ======================================================================
# 3. Split TRAIN data into:
#    - lstm_train_normals_df: normal-only clients (for training AE)
#    - lstm_train_fraud_df: clients with any fraud (for scoring only)
# ======================================================================
lstm_train_normals_df = (
    ready_data
    .loc[ready_data.cst_dim_id.isin(normal_only_clients)]
    .copy()
    .reset_index(drop=True)
)

lstm_train_fraud_df = (
    ready_data
    .loc[ready_data.cst_dim_id.isin(fraud_clients)]
    .copy()
    .reset_index(drop=True)
)

print(f"LSTM training rows (normal-only clients): {len(lstm_train_normals_df)}")
print(f"LSTM side rows (clients with fraud in TRAIN): {len(lstm_train_fraud_df)}")

# ======================================================================
# 4. Feature definition + preprocessing (numeric + OHE + scaling)
# ======================================================================

# Columns that should NOT be used as features
DROP_COLS = ["cst_dim_id", "transdate", "transdatetime", "docno", "direction", "target"]
CAT_COLS  = ["last_phone_model_categorical", "last_os_categorical"]

FEATURE_EXCLUDE = set(DROP_COLS + CAT_COLS)

# Numeric columns: all else except DROP_COLS and CAT_COLS
NUM_COLS = [c for c in lstm_train_normals_df.columns if c not in FEATURE_EXCLUDE]

print("Numeric feature columns:", NUM_COLS)
print("Categorical (to OHE) columns:", CAT_COLS)

# We'll also need a copy of TEST for feature processing
test_df_for_features = test.copy()

# 4.1. Numeric: enforce numeric type & impute NaNs
for col in NUM_COLS:
    lstm_train_normals_df[col] = pd.to_numeric(lstm_train_normals_df[col], errors='coerce').fillna(0)
    lstm_train_fraud_df[col]   = pd.to_numeric(lstm_train_fraud_df[col],   errors='coerce').fillna(0)
    test_df_for_features[col]  = pd.to_numeric(test_df_for_features[col],  errors='coerce').fillna(0)

# 4.2. One-Hot Encoding on categorical features
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

X_train_norm_cat = ohe.fit_transform(lstm_train_normals_df[CAT_COLS])
X_train_fraud_cat = ohe.transform(lstm_train_fraud_df[CAT_COLS])
X_test_cat = ohe.transform(test_df_for_features[CAT_COLS])

ohe_cols = ohe.get_feature_names_out(CAT_COLS)

X_train_norm_cat_df = pd.DataFrame(X_train_norm_cat,  columns=ohe_cols, index=lstm_train_normals_df.index)
X_train_fraud_cat_df = pd.DataFrame(X_train_fraud_cat, columns=ohe_cols, index=lstm_train_fraud_df.index)
X_test_cat_df  = pd.DataFrame(X_test_cat,            columns=ohe_cols, index=test_df_for_features.index)

# 4.3. Scale numeric columns only (MinMaxScaler)
scaler = MinMaxScaler()

X_train_norm_num_scaled  = scaler.fit_transform(lstm_train_normals_df[NUM_COLS])
X_train_fraud_num_scaled = scaler.transform(lstm_train_fraud_df[NUM_COLS])
X_test_num_scaled        = scaler.transform(test_df_for_features[NUM_COLS])

X_train_norm_num_df  = pd.DataFrame(X_train_norm_num_scaled,  columns=NUM_COLS, index=lstm_train_normals_df.index)
X_train_fraud_num_df = pd.DataFrame(X_train_fraud_num_scaled, columns=NUM_COLS, index=lstm_train_fraud_df.index)
X_test_num_df        = pd.DataFrame(X_test_num_scaled,        columns=NUM_COLS, index=test_df_for_features.index)

# 4.4. Final processed feature matrices
X_train_norm_processed  = pd.concat([X_train_norm_num_df,  X_train_norm_cat_df],  axis=1)
X_train_fraud_processed = pd.concat([X_train_fraud_num_df, X_train_fraud_cat_df], axis=1)
X_test_processed        = pd.concat([X_test_num_df,        X_test_cat_df],        axis=1)

N_FEATURES = X_train_norm_processed.shape[1]
print(f"Total features (N_FEATURES) for LSTM input: {N_FEATURES}")

# ======================================================================
# 5. Sequence creation (per client, padded to TIME_STEPS)
# ======================================================================
def create_sequences(X_df, df_original, time_steps):
    """
    Build one look-back window per row, grouped by client.

    For every row of every client the window holds that row and up to
    ``time_steps - 1`` preceding rows of the same client (in the row order of
    ``df_original``). Windows shorter than ``time_steps`` are padded at the
    front with all-zero rows.

    Args:
        X_df: processed feature frame; must share its index with df_original.
        df_original: frame with a 'cst_dim_id' column used for grouping.
        time_steps: window length, must be >= 1.

    Returns:
        X_seq: np.array of shape (N_sequences, time_steps, N_features);
            an empty input yields shape (0, time_steps, N_features) so the
            result can always be fed to a model expecting 3-D input.
        indices: list of original row indices corresponding to the last step

    Raises:
        ValueError: if time_steps is smaller than 1.
    """
    if time_steps < 1:
        raise ValueError("time_steps must be >= 1")

    n_features = X_df.shape[1]
    # One shared block of leading zero rows per call; slicing a padded client
    # matrix then yields every window without per-row padding.
    front_padding = np.zeros((time_steps - 1, n_features))

    windows = []
    indices = []

    for _, group in df_original.groupby('cst_dim_id'):
        client_indices = group.index
        client_data = X_df.loc[client_indices].to_numpy()
        padded = np.vstack((front_padding, client_data))

        # Row i of the client sits at padded[i + time_steps - 1], so the
        # window ending at row i is padded[i : i + time_steps].
        for i in range(len(client_data)):
            windows.append(padded[i:i + time_steps])
        indices.extend(client_indices)

    if not windows:
        return np.empty((0, time_steps, n_features)), indices

    return np.stack(windows), indices

# ======================================================================
# 6. Build sequences for:
#    - normal-only TRAIN (for training the autoencoder)
#    - fraud-client TRAIN (for scoring)
#    - full TEST (for scoring)
# ======================================================================
X_train_norm_seq,  train_norm_indices  = create_sequences(X_train_norm_processed,  lstm_train_normals_df, TIME_STEPS)
X_train_fraud_seq, train_fraud_indices = create_sequences(X_train_fraud_processed, lstm_train_fraud_df,   TIME_STEPS)
X_test_seq,        test_indices        = create_sequences(X_test_processed,        test,                  TIME_STEPS)

y_train_norm_seq = X_train_norm_seq

print("\nSequence shapes:")
print(f"TRAIN normal sequences: {X_train_norm_seq.shape}")
print(f"TRAIN fraud sequences:  {X_train_fraud_seq.shape}")
print(f"TEST sequences:         {X_test_seq.shape}")

# ======================================================================
# 7. Define & train the LSTM Autoencoder (on normal-only sequences)
# ======================================================================
model = Sequential([
    Masking(mask_value=0.0, input_shape=(TIME_STEPS, N_FEATURES)),

    # Encoder
    LSTM(LSTM_UNITS, activation='relu', return_sequences=True),
    LSTM(LATENT_DIM, activation='relu', return_sequences=False),

    # Repeat latent vector
    RepeatVector(TIME_STEPS),

    # Decoder
    LSTM(LATENT_DIM, activation='relu', return_sequences=True),
    LSTM(LSTM_UNITS, activation='relu', return_sequences=True),

    # Output: reconstruct features at each time step
    TimeDistributed(Dense(N_FEATURES))
])

model.compile(optimizer='adam', loss='mae')

print("\nStarting AE training on normal-only clients...")

history = model.fit(
    X_train_norm_seq, y_train_norm_seq,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
    shuffle=False,   # keep temporal ordering
    verbose=1
)

print("AE training finished.")

# ======================================================================
# 8. Use trained AE to reconstruct & compute anomaly scores
# ======================================================================
X_train_norm_pred  = model.predict(X_train_norm_seq,  verbose=0)
X_train_fraud_pred = model.predict(X_train_fraud_seq, verbose=0)
X_test_pred        = model.predict(X_test_seq,        verbose=0)

def calculate_anomaly_scores(X_true_seq, X_pred_seq, indices, original_df):
    """
    Calculates MAE on the last time step and merges back to original_df.

    Args:
        X_true_seq: input sequences, shape (N, time_steps, N_features).
        X_pred_seq: reconstructed sequences with the same shape.
        indices: index label in original_df for each of the N sequences.
        original_df: frame the scores are attached to.

    Returns:
        original_df left-joined (on index) with a new 'anomaly_score_lstm'
        column; rows without a matching sequence get NaN.
    """
    # Take the final step from the arrays themselves rather than through the
    # module-level TIME_STEPS constant, so the function works for any window
    # length and has no hidden global dependency.
    X_true_last = X_true_seq[:, -1, :]
    X_pred_last = X_pred_seq[:, -1, :]

    # Mean absolute reconstruction error across features, one value per sequence.
    mae = np.mean(np.abs(X_true_last - X_pred_last), axis=1)

    scores_df = pd.DataFrame({'anomaly_score_lstm': mae}, index=indices)

    return original_df.merge(
        scores_df,
        left_index=True,
        right_index=True,
        how='left'
    )

train_norm_with_scores  = calculate_anomaly_scores(X_train_norm_seq,  X_train_norm_pred,  train_norm_indices,  lstm_train_normals_df)
train_fraud_with_scores = calculate_anomaly_scores(X_train_fraud_seq, X_train_fraud_pred, train_fraud_indices, lstm_train_fraud_df)
test_with_scores        = calculate_anomaly_scores(X_test_seq,        X_test_pred,        test_indices,        test)

# Combine normal+fraud TRAIN
full_train_with_scores = pd.concat(
    [train_norm_with_scores, train_fraud_with_scores],
    axis=0,
    ignore_index=True
)

print("\n--- Anomaly score shapes ---")
print("Normal TRAIN with scores:", train_norm_with_scores.shape)
print("Fraud TRAIN with scores:",  train_fraud_with_scores.shape)
print("FULL TRAIN with scores:",   full_train_with_scores.shape)
print("TEST with scores:",         test_with_scores.shape)

print("\nExample TEST rows with anomaly score:")
print(test_with_scores[['cst_dim_id', 'transdatetime', 'amount', 'target', 'anomaly_score_lstm']].head())

# ======================================================================
# 9. Quick sanity check: distributions
# ======================================================================
plt.figure()
full_train_with_scores['anomaly_score_lstm'].hist(bins=50)
plt.title("FULL TRAIN anomaly_score_lstm")

plt.figure()
test_with_scores['anomaly_score_lstm'].hist(bins=50)
plt.title("TEST anomaly_score_lstm")

fraud_test_dist = test_with_scores[test_with_scores['target'] == 1]
plt.figure()
fraud_test_dist['anomaly_score_lstm'].hist(bins=50)
plt.title("Fraud (TEST) anomaly_score_lstm")

print("\nMeans:")
print("FULL TRAIN mean:", full_train_with_scores['anomaly_score_lstm'].mean())
print("TEST mean:",       test_with_scores['anomaly_score_lstm'].mean())

print("\nMedians:")
print("FULL TRAIN median:", full_train_with_scores['anomaly_score_lstm'].median())
print("TEST median:",       test_with_scores['anomaly_score_lstm'].median())

# ======================================================================
# 10. Supervised model: build TRAIN/VAL from TRAIN only
# ======================================================================

# Define which columns to drop as features for supervised model
SUPER_DROP_COLS = ["cst_dim_id", "target", "transdate", "transdatetime", "docno", "direction"]

X_train_final = full_train_with_scores.drop(columns=[c for c in SUPER_DROP_COLS if c in full_train_with_scores.columns])
y_train_final = full_train_with_scores["target"]

X_test_final = test_with_scores.drop(columns=[c for c in SUPER_DROP_COLS if c in test_with_scores.columns])
y_test_final = test_with_scores["target"]

print("\nSupervised feature shapes BEFORE split:")
print("X_train_final:", X_train_final.shape)
print("y_train_final:", y_train_final.shape)
print("X_test_final:",  X_test_final.shape)
print("y_test_final:",  y_test_final.shape)


## One Hot Encoding

In [None]:
# 0) Clean numeric columns BEFORE encoding
cat_cols = ["last_phone_model_categorical", "last_os_categorical"]
num_cols = [c for c in X_train_final.columns if c not in cat_cols]

# Convert to numeric (non-numeric -> NaN)
X_train_final[num_cols] = X_train_final[num_cols].apply(
    lambda col: pd.to_numeric(col, errors="coerce")
)
X_test_final[num_cols] = X_test_final[num_cols].apply(
    lambda col: pd.to_numeric(col, errors="coerce")
)

In [None]:
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

X_train_ohe = ohe.fit_transform(X_train_final[cat_cols])
X_test_ohe  = ohe.transform(X_test_final[cat_cols])

ohe_cols = ohe.get_feature_names_out(cat_cols)

X_train_ohe = pd.DataFrame(X_train_ohe, columns=ohe_cols, index=X_train_final.index)
X_test_ohe  = pd.DataFrame(X_test_ohe,  columns=ohe_cols, index=X_test_final.index)

In [None]:
X_train_final = pd.concat([X_train_final[num_cols], X_train_ohe], axis=1)
X_test_final  = pd.concat([X_test_final[num_cols],  X_test_ohe],  axis=1)

In [None]:
X_train_final

# Model Training

In [None]:
neg = (y_train_final == 0).sum()
pos = (y_train_final == 1).sum()
scale_pos_weight = neg / max(pos, 1)

In [None]:
X = X_train_final.reset_index(drop=True)
y = y_train_final.reset_index(drop=True)
# Group labels must come from the frame X / y were built from.
# full_train_with_scores stacks normal-only clients first and fraud clients
# second, so its row order differs from `train`; taking the ids from `train`
# would pair rows with the wrong client and let a client's transactions land
# on both sides of a "grouped" split.
groups = full_train_with_scores["cst_dim_id"].reset_index(drop=True)

if len(groups) != len(X):
    raise ValueError(
        f"groups ({len(groups)}) and X ({len(X)}) must have the same number of rows"
    )

cv = StratifiedGroupKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Take the *first* fold as your train/validation split
tr_idx, val_idx = next(cv.split(X, y, groups))
X_tr  = X.iloc[tr_idx]
y_tr  = y.iloc[tr_idx]
X_val = X.iloc[val_idx]
y_val = y.iloc[val_idx]

print("\nAfter StratifiedGroupKFold split:")
print("X_tr:", X_tr.shape, "  y_tr:", y_tr.shape)
print("X_val:", X_val.shape, " y_val:", y_val.shape)


In [None]:
X_train_final

## Optuna Hyperparameter tuning

In [None]:
def objective(trial: optuna.trial.Trial) -> float:
    """Return the mean cross-validated ROC-AUC for one sampled XGBoost config.

    Reads the notebook-level ``X``, ``y``, ``groups`` and ``cv`` objects.
    For every fold produced by ``cv.split(X, y, groups)`` a fresh
    ``XGBClassifier`` is trained on the training part and scored on the
    held-out part.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The average ROC-AUC over all folds (the value Optuna maximizes).
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 800),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.2, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "min_child_weight": trial.suggest_float("min_child_weight", 1.0, 15.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True),

        "tree_method": "hist",
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "n_jobs": -1,
        "random_state": 40,
    }

    fold_aucs = []

    for train_idx, val_idx in cv.split(X, y, groups):
        # Positional selection: the fold indices are integer positions.
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Class-imbalance weight is recomputed from this fold's training labels
        # and passed per model instead of mutating the shared params dict.
        neg = (y_tr == 0).sum()
        pos = (y_tr == 1).sum()
        model = XGBClassifier(**params, scale_pos_weight=neg / max(pos, 1))

        # No eval_set: there is no early stopping, so evaluating the validation
        # fold after every boosting round would only add cost. The fold is
        # scored once below.
        model.fit(X_tr, y_tr, verbose=False)

        y_val_proba = model.predict_proba(X_val)[:, 1]
        fold_aucs.append(roc_auc_score(y_val, y_val_proba))

    return float(np.mean(fold_aucs))

In [None]:
# Trials are persisted in a local SQLite file so the study can be resumed.
db_path = "/Users/kabdulasset/Desktop/Hackaton/fraud_optuna.db"
storage_url = "sqlite:///" + db_path

print("Using storage:", storage_url)

# load_if_exists=True reuses the stored study of the same name, so re-running
# this cell appends another 100 trials to it.
study = optuna.create_study(
    study_name="fraud_xgb_study",
    storage=storage_url,
    load_if_exists=True,
    direction="maximize",
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

print("Best AUC:", study.best_value)
print("Best params:")
for name, value in study.best_params.items():
    print(f"  {name}: {value}")

## Choosing the Threshold from the Cross-Validated PR Curve

In [None]:
# Give all three objects the same positional 0..n-1 index so that fold
# indices address the same rows in each of them.
full_train_with_scores = full_train_with_scores.reset_index(drop=True)
X_train_final = X_train_final.reset_index(drop=True)
y_train_final = y_train_final.reset_index(drop=True)

X, y = X_train_final, y_train_final
groups = full_train_with_scores["cst_dim_id"].reset_index(drop=True)

# Tuned hyperparameters from the study plus the fixed settings.
best_params = {
    **study.best_params,
    "tree_method": "hist",
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "n_jobs": -1,
    "random_state": 50,
}
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold predictions: each row is scored by the model that did not see it.
oof_pred = np.zeros(len(X))

for train_idx, val_idx in cv.split(X, y, groups):
    X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Imbalance weight is recomputed from the current fold's training labels.
    neg = (y_tr == 0).sum()
    pos = (y_tr == 1).sum()
    best_params["scale_pos_weight"] = neg / max(pos, 1)

    model = XGBClassifier(**best_params)
    model.fit(X_tr, y_tr, verbose=False)

    oof_pred[val_idx] = model.predict_proba(X_val)[:, 1]

In [None]:
# 1) PR curve from the out-of-fold predictions
prec_cv, rec_cv, thr_cv = precision_recall_curve(y, oof_pred)

# precision_recall_curve returns one more precision/recall value than
# thresholds: element i is the precision/recall of predictions with
# score >= thresholds[i], and the extra LAST element (precision=1, recall=0)
# has no threshold. Drop the last element to align with thr_cv; dropping the
# first one would pair every threshold with the metrics of the next threshold.
prec_thr = prec_cv[:-1]
rec_thr = rec_cv[:-1]

# The small epsilon avoids 0/0 when precision and recall are both zero.
f1_thr = 2 * prec_thr * rec_thr / (prec_thr + rec_thr + 1e-9)

pr_table = pd.DataFrame({
    "threshold": thr_cv,
    "precision": prec_thr,
    "recall": rec_thr,
    "f1": f1_thr
})

# Sort by threshold (ascending)
pr_table = pr_table.sort_values("threshold").reset_index(drop=True)

# Rows with the highest F1 first (the 60 best candidates are printed).
# NOTE: a threshold hand-picked from an earlier, misaligned version of this
# table should be re-checked against the corrected values.
pr_table_top50 = pr_table.sort_values("f1", ascending=False).head(60)

print(pr_table_top50)

In [None]:
# Precision-recall scatter of the cross-validated predictions; each point is
# one threshold and its colour encodes the threshold value.
plt.figure(figsize=(7, 6))

scatter = plt.scatter(rec_thr, prec_thr, c=thr_cv, s=10, cmap="viridis")
cbar = plt.colorbar(scatter, label="Threshold")

plt.title("Precision–Recall Curve (colored by threshold)")
plt.xlabel("Recall")
plt.ylabel("Precision")

plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
# Probability cut-off used to turn predicted probabilities into class labels
# in the test-set evaluation.
chosen_threshold = 0.679180 # <- chosen manually

## Final XGBoost Training

In [None]:
# The final model is trained on all training rows, so the imbalance weight is
# recomputed from the full label vector rather than from a single fold.
neg_full = (y_train_final == 0).sum()
pos_full = (y_train_final == 1).sum()

final_params = {**best_params, "scale_pos_weight": neg_full / max(pos_full, 1)}

best_xgb = XGBClassifier(**final_params)
best_xgb.fit(X_train_final, y_train_final)

In [None]:
# Predict positive-class probabilities on the test set
y_proba_test = best_xgb.predict_proba(X_test_final)[:, 1]

# Threshold-free metrics. Both are scored against y_test_final, the labels
# used everywhere else in this cell for the X_test_final predictions.
roc_auc = roc_auc_score(y_test_final, y_proba_test)
pr_auc = average_precision_score(y_test_final, y_proba_test)
print(f"Test ROC-AUC: {roc_auc:.4f}")
print(f"Test PR-AUC:  {pr_auc:.4f}")

# Apply the manually chosen threshold to obtain hard class predictions
y_pred_test = (y_proba_test >= chosen_threshold).astype(int)

print("\nConfusion matrix (test):")
print(confusion_matrix(y_test_final, y_pred_test))

print("\nClassification report (test):")
print(classification_report(y_test_final, y_pred_test, digits=4))


## Feature Importance

In [None]:
# 1. Gain-based importance taken from the fitted booster
booster = best_xgb.get_booster()
importance_raw = booster.get_score(importance_type="gain")

# The score dict is used as returned (keys are used as feature labels).
importance_dict = importance_raw

feat_imp = pd.DataFrame(
    {
        "feature": list(importance_dict),
        "importance": list(importance_dict.values()),
    }
)
feat_imp = feat_imp.sort_values("importance", ascending=False).reset_index(drop=True)

# Keep the 20 largest; rows stay in descending order so the biggest bar is on top.
topN = 20
df_plot = feat_imp.head(topN)

# 2. Horizontal bar chart, largest importance first
plt.figure(figsize=(10, 9))
sns.set_style("whitegrid")

ax = sns.barplot(data=df_plot, x="importance", y="feature", palette="viridis")

# Write each value just to the right of its bar (1% past the bar end).
for row_pos, gain in enumerate(df_plot["importance"]):
    ax.text(gain * 1.01, row_pos, f"{gain:.1f}", va="center", fontsize=9)

plt.title("XGBoost Feature Importance (Gain)", fontsize=18, weight="bold")
plt.xlabel("Importance (Gain)", fontsize=14)
plt.ylabel("Feature", fontsize=14)

plt.tight_layout()
plt.show()


In [None]:
# Build a SHAP tree explainer for the fitted XGBoost model and compute SHAP
# values for the test features.
explainer = shap.TreeExplainer(best_xgb)
shap_values = explainer.shap_values(X_test_final)

# Summary plot of the SHAP values against the test feature matrix
shap.summary_plot(shap_values, X_test_final)


## PR Curve of the Test Set for Exploration

In [None]:
# y_proba_test was predicted from X_test_final, so it is scored against the
# matching labels y_test_final.
prec, rec, thr = precision_recall_curve(y_test_final, y_proba_test)

# thr has length = len(prec) - 1
# For each threshold thr[i], the corresponding point on the curve is
# (rec[i], prec[i]); the final point (recall=0, precision=1) has no threshold.


In [None]:

# Align with thresholds: precision_recall_curve pairs thr[i] with
# (rec[i], prec[i]) and appends one extra final point (precision=1, recall=0)
# that has no threshold, so the LAST element is the one to drop.
prec_thr = prec[:-1]   # length n
rec_thr  = rec[:-1]    # length n

pr_table = pd.DataFrame({
    "threshold": thr,       # length n
    "precision": prec_thr,  # length n
    "recall": rec_thr,      # length n
})


# Sort by threshold (ascending)
pr_table = pr_table.sort_values("threshold").reset_index(drop=True)

# 50 rows with the highest precision first
pr_table_top50 = pr_table.sort_values("precision", ascending=False).head(50)

print(pr_table_top50)

In [None]:

# Interactive PR curve: a slider moves a marker along the curve and the title
# shows precision / recall / F1 at the selected threshold.
thresholds = thr

# precision_recall_curve pairs thresholds[i] with (rec[i], prec[i]); the extra
# final curve point (recall=0, precision=1) has no threshold, so drop the last
# element to get one marker position per threshold.
points_x = rec[:-1]   # recall for each threshold
points_y = prec[:-1]  # precision for each threshold

# The per-threshold metrics are exactly the aligned curve values, so they are
# taken from the curve instead of re-scoring the whole test set once per
# threshold. This also keeps the title and the marker consistent by construction.
p_list = points_y.tolist()
r_list = points_x.tolist()
pr_sum = points_y + points_x
# F1 is defined as 0 where precision + recall == 0.
f1_list = np.divide(
    2 * points_y * points_x,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
).tolist()


def _pr_title(idx):
    """Build the figure title for the threshold at position idx."""
    return (
        f"Precision–Recall Curve | "
        f"threshold={thresholds[idx]:.3f} | "
        f"Precision={p_list[idx]:.3f}, Recall={r_list[idx]:.3f}, F1={f1_list[idx]:.3f}"
    )


# Base figure: PR curve + initial point
fig = go.Figure()

# PR curve
fig.add_trace(go.Scatter(
    x=rec,
    y=prec,
    mode="lines",
    name="PR curve"
))

# Initial point (use first threshold)
init_idx = 0
fig.add_trace(go.Scatter(
    x=[points_x[init_idx]],
    y=[points_y[init_idx]],
    mode="markers",
    marker=dict(size=12),
    name="Current threshold"
))

# Create frames for slider animation.
# Frames are named by their position, not by the rounded threshold: several
# thresholds can round to the same 3-decimal string, and duplicate frame names
# would make slider steps resolve to the wrong frame.
# NOTE: there is one frame per threshold and each frame repeats the full
# curve, so the figure grows quickly with the number of thresholds.
frames = []
for i, t in enumerate(thresholds):
    frames.append(go.Frame(
        data=[
            # PR curve (unchanged)
            go.Scatter(x=rec, y=prec, mode="lines", name="PR curve"),
            # Moving point
            go.Scatter(
                x=[points_x[i]],
                y=[points_y[i]],
                mode="markers",
                marker=dict(size=12),
                name="Current threshold"
            )
        ],
        name=str(i),
        layout=go.Layout(title=_pr_title(i))
    ))

fig.frames = frames

# Slider definition: each step animates to the frame with the same position.
steps = []
for i, t in enumerate(thresholds):
    step = dict(
        method="animate",
        args=[
            [str(i)],
            {
                "mode": "immediate",
                "frame": {"duration": 0, "redraw": True},
                "transition": {"duration": 0}
            }
        ],
        label=f"{t:.2f}",
    )
    steps.append(step)

sliders = [dict(
    active=0,
    currentvalue={"prefix": "Threshold: "},
    pad={"t": 50},
    steps=steps
)]

fig.update_layout(
    sliders=sliders,
    xaxis_title="Recall",
    yaxis_title="Precision",
    title=_pr_title(init_idx),
    width=800,
    height=600
)

fig.show()
