In [3]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler

# --- CONSTANTS AND PARAMETERS ---
# Input CSV locations, given relative to the working directory.
TRANS_FILE = '../data/—Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–∏ –≤ –ú–æ–±–∏–ª—å–Ω–æ–º –∏–Ω—Ç–µ—Ä–Ω–µ—Ç –ë–∞–Ω–∫–∏–Ω–≥–µ.csv'
BEHAV_FILE = '../data/–ø–æ–≤–µ–¥–µ–Ω—á–µ—Å–∫–∏–µ –ø–∞—Ç—Ç–µ—Ä–Ω—ã –∫–ª–∏–µ–Ω—Ç–æ–≤.csv'
ENCODING = 'cp1251'  # text encoding handed to read_csv for both inputs
DELIMITER = ';'  # field separator handed to read_csv for both inputs
DATE_FORMAT_SHORT = '%Y-%m-%d'  # date-only format used for the merge keys

# Output location and feature-engineering settings.
FINAL_FEATURES_PATH = '../data/processed/transactions_with_features.csv'
GROUPING_KEY = 'user_id'  # column that identifies one user
TIME_WINDOWS = ['1h', '12h', '24h']  # offsets used for the rolling features

print("--- –ó–ê–î–ê–ß–ê 1: –ó–∞–≥—Ä—É–∑–∫–∞ –∏ –ø–µ—Ä–≤–∏—á–Ω–∞—è –æ—á–∏—Å—Ç–∫–∞ (–î–µ–Ω—å 1) ---")

# 1. Load both raw tables with the same encoding and separator.
read_options = {'encoding': ENCODING, 'sep': DELIMITER}
try:
    # header=1: the transactions file takes its column names from its second row.
    df_trans = pd.read_csv(TRANS_FILE, header=1, **read_options)
    df_behav = pd.read_csv(BEHAV_FILE, **read_options)
except FileNotFoundError:
    # Report the problem in the cell output, then let the error propagate.
    print("‚ùå –ö–†–ò–¢–ò–ß–ï–°–ö–ê–Ø –û–®–ò–ë–ö–ê: –§–∞–π–ª—ã –Ω–µ –Ω–∞–π–¥–µ–Ω—ã!")
    raise

# 2. –û—á–∏—Å—Ç–∫–∞ –∏ –Ω–æ—Ä–º–∞–ª–∏–∑–∞—Ü–∏—è –∏–º–µ–Ω –∫–æ–ª–æ–Ω–æ–∫
def clean_cols(df):
    """Normalise the column labels of *df* in place and return the same frame.

    Each label is cast to ``str``; spaces become underscores, the text is
    lower-cased, double quotes are removed and surrounding whitespace is
    stripped. Every remaining run of non-word characters is then replaced by
    a single underscore, and leading/trailing underscores are dropped.
    """
    labels = df.columns.astype(str)
    labels = labels.str.replace(' ', '_')
    labels = labels.str.lower()
    labels = labels.str.replace('"', '')
    labels = labels.str.strip()
    # Anything that is not a letter, digit or underscore collapses to "_".
    labels = labels.str.replace(r'[^\w]+', '_', regex=True)
    df.columns = labels.str.strip('_')
    return df

df_trans = clean_cols(df_trans)
df_behav = clean_cols(df_behav)

# 3. Clean the amount column.
# Keep only rows whose target is present (a missing value stringifies to 'nan').
df_trans = df_trans[df_trans['target'].astype(str) != 'nan'].copy()
if pd.api.types.is_numeric_dtype(df_trans['amount']):
    # read_csv already produced numbers. Going through str and deleting '.'
    # would turn 100.5 into 1005 and 100.0 into 1000, so convert directly.
    df_trans['amount'] = df_trans['amount'].astype(float)
else:
    # Text amounts: delete '.' and ',' before converting to float.
    # NOTE(review): this treats both characters as grouping separators; if the
    # file uses one of them as the decimal mark the values are inflated -
    # confirm against the raw data.
    df_trans['amount'] = (
        df_trans['amount']
        .astype(str)
        .str.replace(r'[\.,]', '', regex=True)
        .astype(float)
    )


# 4. Parse dates and give the merge keys common names.
# Unparseable values become NaT (errors='coerce').
df_trans['timestamp'] = pd.to_datetime(df_trans['transdatetime'], errors='coerce')

df_behav.rename(columns={'–¥–∞—Ç–∞_—Å–æ–≤–µ—Ä—à–µ–Ω–Ω–æ–π_—Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–∏': 'date_behav'}, inplace=True)
df_trans.rename(columns={'transdate': 'date_trans'}, inplace=True)

df_behav['date_behav'] = pd.to_datetime(df_behav['date_behav'], format=DATE_FORMAT_SHORT, errors='coerce')
df_trans['date_trans'] = pd.to_datetime(df_trans['date_trans'], format=DATE_FORMAT_SHORT, errors='coerce')

# 5. Rename the client identifier to 'user_id' in both tables.
df_behav.rename(columns={'—É–Ω–∏–∫–∞–ª—å–Ω—ã–π_–∏–¥–µ–Ω—Ç–∏—Ñ–∏–∫–∞—Ç–æ—Ä_–∫–ª–∏–µ–Ω—Ç–∞': 'user_id'}, inplace=True)
df_trans.rename(columns={'cst_dim_id': 'user_id'}, inplace=True)


# Cast user_id to str on both sides so the merge keys share one dtype.
df_trans['user_id'] = df_trans['user_id'].astype(str)
df_behav['user_id'] = df_behav['user_id'].astype(str)

# --- TASK 2: merge and final cleanup (day 1-2) ---

# Left join: every transaction row is kept, and behaviour columns are attached
# where user and calendar date match.
df_merged = df_trans.merge(
    df_behav,
    how='left',
    left_on=['user_id', 'date_trans'],
    right_on=['user_id', 'date_behav'],
    suffixes=('_trans', '_behav'),
)

# Drop rows whose timestamp could not be parsed and remember how many went.
initial_rows = len(df_merged)
has_timestamp = df_merged['timestamp'].notna()
df = df_merged[has_timestamp].copy()
rows_dropped = initial_rows - len(df)

if len(df) == 0:
    print(f"‚ùå –ö–†–ò–¢–ò–ß–ï–°–ö–ê–Ø –û–®–ò–ë–ö–ê: –ü–æ—Ç–µ—Ä—è–Ω—ã –í–°–ï {initial_rows} —Å—Ç—Ä–æ–∫ –ø—Ä–∏ –æ—á–∏—Å—Ç–∫–µ –≤—Ä–µ–º–µ–Ω–Ω—ã—Ö –º–µ—Ç–æ–∫. –ü—Ä–æ–≤–µ—Ä—å—Ç–µ –∏—Å—Ö–æ–¥–Ω—ã–π —Ñ–∞–π–ª –Ω–∞ –Ω–∞–ª–∏—á–∏–µ –¥–∞–Ω–Ω—ã—Ö.")
    raise ValueError("DataFrame –ø—É—Å—Ç –ø–æ—Å–ª–µ –æ—á–∏—Å—Ç–∫–∏ –≤—Ä–µ–º–µ–Ω–Ω—ã—Ö –º–µ—Ç–æ–∫.")

# Chronological order, then remove the helper/raw columns (absent ones are ignored).
df = df.sort_values(by='timestamp')
df = df.drop(
    columns=['date_trans', 'transdatetime', 'date_behav', 'docno', 'direction'],
    errors='ignore',
)

# Every remaining missing value, in any column, becomes 0.
df = df.fillna(0)

print(f"‚úÖ –¢–∞–±–ª–∏—Ü—ã –æ–±—ä–µ–¥–∏–Ω–µ–Ω—ã –∏ –æ—á–∏—â–µ–Ω—ã. –†–∞–∑–º–µ—Ä: {df.shape}. –£–¥–∞–ª–µ–Ω–æ —Å—Ç—Ä–æ–∫ (–∏–∑-–∑–∞ NaT): {rows_dropped}")


# --- TASK 23: time-based features ---
print("\n--- –ó–ê–î–ê–ß–ê 23: –°–æ–∑–¥–∞–Ω–∏–µ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –ø–æ –≤—Ä–µ–º–µ–Ω–∏ ---")

# Calendar components taken from the transaction timestamp.
timestamp_parts = df['timestamp'].dt
df['hour'] = timestamp_parts.hour  # 0-23
df['day_of_week'] = timestamp_parts.dayofweek  # 0 = Monday ... 6 = Sunday
df['day_of_month'] = timestamp_parts.day  # 1-31
df['month'] = timestamp_parts.month  # 1-12
df['is_weekend'] = df['day_of_week'].ge(5).astype(int)  # Saturday/Sunday -> 1
# Night means 22:00-05:59, i.e. everything outside 06:00-21:59.
df['is_night'] = (~((df['hour'] >= 6) & (df['hour'] < 22))).astype(int)

# Cyclic sin/cos encoding so the last and first value of each cycle end up close.
for unit, period in (('hour', 24), ('day_of_week', 7), ('month', 12)):
    angle = 2 * np.pi * df[unit] / period
    df[f'{unit}_sin'] = np.sin(angle)
    df[f'{unit}_cos'] = np.cos(angle)

# NOTE: the message reports 13 features; the code above adds 12 columns
# (6 calendar columns + 6 cyclic encodings).
print(f"‚úÖ –°–æ–∑–¥–∞–Ω–æ 13 –≤—Ä–µ–º–µ–Ω–Ω—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤: hour, day_of_week, day_of_month, month, is_weekend, is_night, –∏ —Ü–∏–∫–ª–∏—á–µ—Å–∫–∏–µ –∫–æ–¥–∏—Ä–æ–≤–∞–Ω–∏—è")


# --- TASK 24: per-user features (avg_amount, count) ---
print("\n--- –ó–ê–î–ê–ß–ê 24: –°–æ–∑–¥–∞–Ω–∏–µ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –ø–æ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—é ---")

# Running statistics over each user's history. An expanding window includes
# the current row, so every value covers the user's transactions up to and
# including the current one.

# Work on a copy ordered by user and time so "history" means earlier rows.
df_sorted = df.sort_values(by=['user_id', 'timestamp']).copy()

# 1-based position of the transaction within the user's history.
df_sorted['user_tx_count_total'] = df_sorted.groupby(GROUPING_KEY).cumcount() + 1

# Expanding mean/std/max/min of the amount. The grouped result carries the
# group key as the outer index level, which is dropped to realign with rows.
for stat_name, feature_name in (
    ('mean', 'user_avg_amount_total'),
    ('std', 'user_std_amount_total'),
    ('max', 'user_max_amount_total'),
    ('min', 'user_min_amount_total'),
):
    expanding_amount = df_sorted.groupby(GROUPING_KEY)['amount'].expanding()
    df_sorted[feature_name] = getattr(expanding_amount, stat_name)().reset_index(level=0, drop=True)

# Deviation of the current amount from the user's running mean.
df_sorted['amount_diff_from_user_avg'] = df_sorted['amount'] - df_sorted['user_avg_amount_total']
# +1e-8 guards against division by zero.
df_sorted['amount_ratio_to_user_avg'] = df_sorted['amount'] / (df_sorted['user_avg_amount_total'] + 1e-8)

# Fill the gaps left on a user's first transaction (std of one value is NaN).
# Assign the result back: calling fillna(inplace=True) on a selected column
# is chained assignment, which warns in pandas 2.2 and does not update the
# frame under Copy-on-Write.
for feature_name, fill_value in (
    ('user_std_amount_total', 0),
    ('amount_diff_from_user_avg', 0),
    ('amount_ratio_to_user_avg', 1),
):
    df_sorted[feature_name] = df_sorted[feature_name].fillna(fill_value)

# Publish the result as the main frame (now ordered by user, then time).
df = df_sorted.copy()

# NOTE: the message reports 8 features; the code above adds 7 columns.
print(f"‚úÖ –°–æ–∑–¥–∞–Ω–æ 8 –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –ø–æ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—é: tx_count, avg/std/max/min amount, –æ—Ç–∫–ª–æ–Ω–µ–Ω–∏—è –æ—Ç —Å—Ä–µ–¥–Ω–µ–≥–æ")


# --- TASK 25: rolling-window features ---
print("\n--- –ó–ê–î–ê–ß–ê 25: –°–æ–∑–¥–∞–Ω–∏–µ rolling-window –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ ---")

# Time-based rolling windows per user. closed='left' leaves the current
# transaction out of its own window, so each value describes the preceding
# activity only. The statistics are taken on 'amount' (also for the count)
# because the grouping column is not passed to the applied function
# (include_groups=False).
for window in TIME_WINDOWS:
    for stat_name, feature_name in (
        ('count', f'tx_count_{window}'),
        ('mean', f'tx_mean_amount_{window}'),
        ('std', f'tx_std_amount_{window}'),
        ('sum', f'tx_sum_amount_{window}'),
        ('max', f'tx_max_amount_{window}'),
    ):
        per_user = df.groupby(GROUPING_KEY).apply(
            lambda grp, stat_name=stat_name, window=window: getattr(
                grp.rolling(window=window, on='timestamp', closed='left')['amount'],
                stat_name,
            )(),
            include_groups=False,
        )
        # Drop the group-key index level so the values align with df's rows.
        df[feature_name] = per_user.reset_index(level=0, drop=True)

print(f"‚úÖ –°–æ–∑–¥–∞–Ω–æ {len(TIME_WINDOWS) * 5} rolling-window –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –¥–ª—è –æ–∫–æ–Ω {TIME_WINDOWS}")


# --- TASK 26: statistical features ---
print("\n--- –ó–ê–î–ê–ß–ê 26: –°–æ–∑–¥–∞–Ω–∏–µ —Å—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ ---")

# Z-scores express the distance from a mean in units of standard deviation.

# Amount z-score against the distribution of the whole frame.
global_amount_mean = df['amount'].mean()
global_amount_std = df['amount'].std() + 1e-8  # +1e-8 avoids division by zero
df['amount_zscore'] = df['amount'].sub(global_amount_mean).div(global_amount_std)

# Amount z-score against the user's own running statistics (computed earlier).
df['amount_zscore_user'] = df['amount_diff_from_user_avg'].div(
    df['user_std_amount_total'].add(1e-8)
)

# Percentile rank of the amount within the whole frame.
df['amount_percentile'] = df['amount'].rank(pct=True)

# Current amount relative to the rolling max / mean of each window.
for window in TIME_WINDOWS:
    mean_col = f'tx_mean_amount_{window}'
    max_col = f'tx_max_amount_{window}'
    if not {mean_col, max_col}.issubset(df.columns):
        continue
    df[f'amount_ratio_to_max_{window}'] = df['amount'] / (df[max_col] + 1e-8)
    df[f'amount_ratio_to_mean_{window}'] = df['amount'] / (df[mean_col] + 1e-8)

# Coefficient of variation (std / mean) of each rolling window.
for window in TIME_WINDOWS:
    mean_col = f'tx_mean_amount_{window}'
    std_col = f'tx_std_amount_{window}'
    if not {mean_col, std_col}.issubset(df.columns):
        continue
    df[f'cv_amount_{window}'] = df[std_col] / (df[mean_col] + 1e-8)

print(f"‚úÖ –°–æ–∑–¥–∞–Ω–æ —Å—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤: z-scores, –ø—Ä–æ—Ü–µ–Ω—Ç–∏–ª–∏, –∫–æ—ç—Ñ—Ñ–∏—Ü–∏–µ–Ω—Ç—ã –≤–∞—Ä–∏–∞—Ü–∏–∏")


# --- TASK 27: device-based & geo-based features ---
print("\n--- –ó–ê–î–ê–ß–ê 27: –°–æ–∑–¥–∞–Ω–∏–µ device-based & geo-based –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ ---")


def _find_columns(columns, substrings, tokens=()):
    """Return the names in *columns* that look related to a keyword.

    A lower-cased name matches when it contains any of *substrings*, or when
    one of its '_'-separated parts equals one of *tokens*. Short keywords are
    passed as tokens so they cannot match inside longer words.
    """
    matches = []
    for name in columns:
        lowered = name.lower()
        parts = lowered.split('_')
        if any(s in lowered for s in substrings) or any(t in parts for t in tokens):
            matches.append(name)
    return matches


def _value_frequency(frame, column):
    """Return, per row, the share of rows in *frame* with the same *column* value."""
    return frame[column].map(frame[column].value_counts()) / len(frame)


def _changed_since_previous(frame, column):
    """Return 1 where *column* differs from the same user's previous row, else 0.

    A user's first row has no predecessor and is reported as 0. Comparing it
    with the NaN produced by shift() would otherwise always read as a change.
    """
    by_user = frame.groupby(GROUPING_KEY)
    previous = by_user[column].shift(1)
    is_first = by_user.cumcount() == 0
    return ((previous != frame[column]) & ~is_first).astype(int)


# Locate candidate columns by keyword. 'os' and 'ip' are matched as whole
# name parts: as plain substrings they also hit columns such as 'hour_cos'
# or names containing 'recipient'.
device_cols = _find_columns(df.columns, ('phone', 'model', 'device', '—Ç–µ–ª–µ—Ñ–æ–Ω', '—É—Å—Ç—Ä–æ–π—Å—Ç–≤–æ'))
os_cols = _find_columns(df.columns, ('–≤–µ—Ä—Å–∏—è',), tokens=('os',))
ip_cols = _find_columns(df.columns, ('–∞–¥—Ä–µ—Å',), tokens=('ip',))
geo_cols = _find_columns(df.columns, ('geo', 'location', '—Å—Ç—Ä–∞–Ω–∞', '–≥–æ—Ä–æ–¥'))

print(f"–ù–∞–π–¥–µ–Ω–æ –∫–æ–ª–æ–Ω–æ–∫: —É—Å—Ç—Ä–æ–π—Å—Ç–≤={len(device_cols)}, OS={len(os_cols)}, IP={len(ip_cols)}, –≥–µ–æ–ª–æ–∫–∞—Ü–∏—è={len(geo_cols)}")

# Only the first matching column of each kind is used.
# NOTE(review): the frequency and nunique features below are computed over
# the whole frame, i.e. they also see later transactions.

# Device-based features
if device_cols:
    device_col = device_cols[0]
    # Frequency encoding of the device value.
    df['device_freq'] = _value_frequency(df, device_col)
    # Number of distinct device values per user.
    df['user_unique_devices'] = df.groupby(GROUPING_KEY)[device_col].transform('nunique')
    # 1 when the device differs from the user's previous transaction.
    df['device_changed'] = _changed_since_previous(df, device_col)

    print(f"‚úÖ –°–æ–∑–¥–∞–Ω–æ 3 device-based –ø—Ä–∏–∑–Ω–∞–∫–∞ –∏–∑ –∫–æ–ª–æ–Ω–∫–∏ {device_col}")

# OS-based features
if os_cols:
    os_col = os_cols[0]
    df['os_freq'] = _value_frequency(df, os_col)
    df['os_changed'] = _changed_since_previous(df, os_col)

    print(f"‚úÖ –°–æ–∑–¥–∞–Ω–æ 2 OS-based –ø—Ä–∏–∑–Ω–∞–∫–∞ –∏–∑ –∫–æ–ª–æ–Ω–∫–∏ {os_col}")

# IP-based features (when an IP column exists)
if ip_cols:
    ip_col = ip_cols[0]
    df['ip_freq'] = _value_frequency(df, ip_col)
    df['user_unique_ips'] = df.groupby(GROUPING_KEY)[ip_col].transform('nunique')
    df['ip_changed'] = _changed_since_previous(df, ip_col)

    print(f"‚úÖ –°–æ–∑–¥–∞–Ω–æ 3 IP-based –ø—Ä–∏–∑–Ω–∞–∫–∞ –∏–∑ –∫–æ–ª–æ–Ω–∫–∏ {ip_col}")
else:
    print("‚ö†Ô∏è IP –∞–¥—Ä–µ—Å–∞ –Ω–µ –Ω–∞–π–¥–µ–Ω—ã –≤ –¥–∞–Ω–Ω—ã—Ö")

# Geo-based features (when a location column exists)
if geo_cols:
    geo_col = geo_cols[0]
    df['geo_freq'] = _value_frequency(df, geo_col)
    df['geo_changed'] = _changed_since_previous(df, geo_col)

    print(f"‚úÖ –°–æ–∑–¥–∞–Ω–æ 2 geo-based –ø—Ä–∏–∑–Ω–∞–∫–∞ –∏–∑ –∫–æ–ª–æ–Ω–∫–∏ {geo_col}")
else:
    print("‚ö†Ô∏è –ì–µ–æ–ª–æ–∫–∞—Ü–∏–æ–Ω–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ –Ω–µ –Ω–∞–π–¥–µ–Ω—ã –≤ –¥–∞–Ω–Ω—ã—Ö")


# --- TASK 28: behavioural features (speed of operations) ---
print("\n--- –ó–ê–î–ê–ß–ê 28: –°–æ–∑–¥–∞–Ω–∏–µ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –ø–æ–≤–µ–¥–µ–Ω–∏—è (—Å–∫–æ—Ä–æ—Å—Ç—å –æ–ø–µ—Ä–∞—Ü–∏–π) ---")

timestamps_by_user = df.groupby(GROUPING_KEY)['timestamp']

# Seconds since the user's previous transaction. The raw series keeps NaN on
# each user's first transaction; derived flags use it so that "no previous
# transaction" is not mistaken for a zero-second gap.
seconds_since_previous = timestamps_by_user.diff().dt.total_seconds()
df['time_since_last_tx'] = seconds_since_previous.fillna(0)

# Seconds until the user's next transaction (0 on the last one).
# NOTE(review): this looks at the following row, i.e. information that does
# not exist yet when the current transaction happens.
df['time_until_next_tx'] = (timestamps_by_user.diff(-1).dt.total_seconds() * -1).fillna(0)

# Window length in seconds, derived from the offset string itself so any
# entry of TIME_WINDOWS gets its real length.
window_seconds = {window: pd.Timedelta(window).total_seconds() for window in TIME_WINDOWS}

# Transaction rate: rolling count divided by the window length in hours.
for window in TIME_WINDOWS:
    count_col = f'tx_count_{window}'
    if count_col in df.columns:
        df[f'tx_rate_per_hour_{window}'] = df[count_col] / (window_seconds[window] / 3600)

# Average spacing in seconds: window length / (rolling count + 1).
for window in TIME_WINDOWS:
    count_col = f'tx_count_{window}'
    if count_col in df.columns:
        df[f'avg_interval_{window}'] = window_seconds[window] / (df[count_col] + 1)

# Speed anomalies.
# Less than a minute since the previous transaction. NaN < 60 is False, so a
# user's first transaction is not flagged.
df['is_rapid_tx'] = (seconds_since_previous < 60).astype(int)

# Gap more than three times the user's mean gap. The mean skips the NaN of
# the first transaction rather than averaging in an artificial 0.
user_avg_interval = seconds_since_previous.groupby(df[GROUPING_KEY]).transform('mean')
df['interval_anomaly'] = (seconds_since_previous > user_avg_interval * 3).astype(int)

# Part of the day in which the transaction happened.
df['tx_in_morning'] = ((df['hour'] >= 6) & (df['hour'] < 12)).astype(int)
df['tx_in_afternoon'] = ((df['hour'] >= 12) & (df['hour'] < 18)).astype(int)
df['tx_in_evening'] = ((df['hour'] >= 18) & (df['hour'] < 22)).astype(int)

print(f"‚úÖ –°–æ–∑–¥–∞–Ω–æ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –ø–æ–≤–µ–¥–µ–Ω–∏—è: –≤—Ä–µ–º–µ–Ω–Ω—ã–µ –∏–Ω—Ç–µ—Ä–≤–∞–ª—ã, —Å–∫–æ—Ä–æ—Å—Ç—å —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π, –∞–Ω–æ–º–∞–ª–∏–∏ —Å–∫–æ—Ä–æ—Å—Ç–∏")


# --- Extra: frequency encoding for categorical features ---
print("\n--- –î–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–æ: Frequency Encoding –¥–ª—è –∫–∞—Ç–µ–≥–æ—Ä–∏–∞–ª—å–Ω—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ ---")

# Frequency encoding of the recipient column, when one exists. The first
# column whose name matches is used; the feature produced here is skipped so
# it is never picked up as its own source.
recipient_col = next(
    (
        col for col in df.columns
        if col != 'recipient_freq_encoding'
        and ('–ø–æ–ª—É—á–∞—Ç–µ–ª—è' in col or 'recipient' in col.lower())
    ),
    None,
)

if recipient_col is None:
    print("‚ö†Ô∏è –ü—Ä–µ–¥—É–ø—Ä–µ–∂–¥–µ–Ω–∏–µ: –ö–æ–ª–æ–Ω–∫–∞ '–ü–æ–ª—É—á–∞—Ç–µ–ª—å' –Ω–µ –Ω–∞–π–¥–µ–Ω–∞. –°–æ–∑–¥–∞–Ω–∏–µ –ø—Ä–∏–∑–Ω–∞–∫–∞ recipient_freq_encoding –ø—Ä–æ–ø—É—â–µ–Ω–æ.")
    # Keep the column present so later steps see a fixed schema.
    df['recipient_freq_encoding'] = 0
else:
    # Fill on a standalone Series; a chained fillna(inplace=True) on
    # df[col] is not guaranteed to write back into the frame.
    recipients = df[recipient_col].fillna('UNKNOWN_RECIPIENT')
    df['recipient_freq_encoding'] = recipients.map(recipients.value_counts()) / len(df)
    # The raw identifier is replaced by its encoding.
    df.drop(columns=[recipient_col], inplace=True, errors='ignore')
    print(f"‚úÖ –°–æ–∑–¥–∞–Ω frequency encoding –¥–ª—è {recipient_col}")


# --- Scaling of numeric features ---
print("\n--- –ú–∞—Å—à—Ç–∞–±–∏—Ä–æ–≤–∞–Ω–∏–µ —á–∏—Å–ª–æ–≤—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ ---")

# Reference list of feature-name stems. It is not consulted by the selection
# below and is kept only so the name stays defined.
cols_to_match = [
    'amount', 'time_since_last_tx', 'time_until_next_tx', 'recipient_freq_encoding',
    'user_avg_amount_total', 'user_std_amount_total', 'user_max_amount_total', 'user_min_amount_total',
    'amount_zscore', 'amount_zscore_user', 'amount_percentile',
    'device_freq', 'os_freq', 'ip_freq', 'geo_freq',
    'tx_rate_per_hour', 'avg_interval'
]

# Name fragments that mark columns to leave unscaled: cyclic encodings,
# binary flags, the target, the user key and the timestamp.
excluded_fragments = ('_sin', '_cos', 'is_', 'changed', 'target', 'user_id', 'timestamp')
# Raw calendar columns are left unscaled as well.
raw_time_columns = ('hour', 'day_of_week', 'day_of_month', 'month')
scalable_dtypes = [np.int64, np.int32, np.float64, np.float32]

# 1) Every numeric column that is not excluded by name.
features_to_scale = [
    col for col in df.columns
    if not any(fragment in col.lower() for fragment in excluded_fragments)
    and df[col].dtype in scalable_dtypes
    and col not in raw_time_columns
]

# 2) Rolling-window features, using the names they are created under:
#    tx_count_<w>, tx_<stat>_amount_<w>, tx_rate_per_hour_<w>, avg_interval_<w>.
for window in TIME_WINDOWS:
    window_columns = [f'tx_count_{window}']
    window_columns += [f'tx_{stat}_amount_{window}' for stat in ('mean', 'std', 'sum', 'max')]
    window_columns += [f'tx_rate_per_hour_{window}', f'avg_interval_{window}']
    features_to_scale.extend(col for col in window_columns if col in df.columns)

# 3) Statistical features, recognised by name.
features_to_scale.extend(
    col for col in df.columns
    if any(marker in col for marker in ('zscore', 'percentile', 'ratio', 'cv_amount'))
)

# Remove duplicates while keeping first-seen order, so the feature order is
# the same on every run (a set would not guarantee that).
features_to_scale = list(dict.fromkeys(features_to_scale))

# The target must never be scaled.
if 'target' in features_to_scale:
    features_to_scale.remove('target')

print(f"–ù–∞–π–¥–µ–Ω–æ {len(features_to_scale)} –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –¥–ª—è –º–∞—Å—à—Ç–∞–±–∏—Ä–æ–≤–∞–Ω–∏—è")

if features_to_scale:
    # NOTE(review): the scaler is fitted on every row of df; if a train/test
    # split happens later, the test rows have influenced the min/max.
    scaler = MinMaxScaler()
    df_features = df[features_to_scale].copy()
    df[features_to_scale] = scaler.fit_transform(df_features)
    print(f"‚úÖ –ú–∞—Å—à—Ç–∞–±–∏—Ä–æ–≤–∞–Ω–æ {len(features_to_scale)} –ø—Ä–∏–∑–Ω–∞–∫–æ–≤.")
else:
    print("‚ö†Ô∏è –ü—Ä–∏–∑–Ω–∞–∫–∏ –¥–ª—è –º–∞—Å—à—Ç–∞–±–∏—Ä–æ–≤–∞–Ω–∏—è –Ω–µ –Ω–∞–π–¥–µ–Ω—ã")


# --- FINAL SAVE ---
print("\n--- –§–∏–Ω–∞–ª—å–Ω–æ–µ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –æ–±–æ–≥–∞—â–µ–Ω–Ω–æ–≥–æ –¥–∞—Ç–∞—Å–µ—Ç–∞ ---")

# Any value still missing at this point is written as 0.
df = df.fillna(0)

# The timestamp itself is dropped; its calendar features were extracted earlier.
cols_to_drop = ['timestamp']
df = df.drop(columns=cols_to_drop, errors='ignore')

# Create the output directory when the path has one and it does not exist yet.
output_dir = os.path.dirname(FINAL_FEATURES_PATH)
needs_directory = bool(output_dir) and not os.path.exists(output_dir)
if needs_directory:
    os.makedirs(output_dir, exist_ok=True)

df.to_csv(FINAL_FEATURES_PATH, index=False)

print(f"\n‚úÖ –§–ò–ù–ê–õ–¨–ù–´–ô –û–ë–û–ì–ê–©–ï–ù–ù–´–ô –î–ê–¢–ê–°–ï–¢ –ì–û–¢–û–í: {FINAL_FEATURES_PATH}")
print(f"–§–∏–Ω–∞–ª—å–Ω—ã–π —Ä–∞–∑–º–µ—Ä —Ç–∞–±–ª–∏—Ü—ã —Å –ø—Ä–∏–∑–Ω–∞–∫–∞–º–∏: {df.shape}")
print(f"–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ (–±–µ–∑ target): {len([c for c in df.columns if c != 'target'])}")

# Summary of the generated feature groups (the "~" counts are approximate labels).
print(f"\nüìä –°–í–û–î–ö–ê –ü–û –°–û–ó–î–ê–ù–ù–´–ú –ü–†–ò–ó–ù–ê–ö–ê–ú:")
print(f"  - –í—Ä–µ–º–µ–Ω–Ω—ã–µ –ø—Ä–∏–∑–Ω–∞–∫–∏: ~13 (—á–∞—Å, –¥–µ–Ω—å, —Ü–∏–∫–ª–∏—á–µ—Å–∫–∏–µ –∫–æ–¥–∏—Ä–æ–≤–∞–Ω–∏—è)")
print(f"  - –ü—Ä–∏–∑–Ω–∞–∫–∏ –ø–æ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—é: ~8 (—Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∏ –ø–æ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—é)")
print(f"  - Rolling-window –ø—Ä–∏–∑–Ω–∞–∫–∏: ~{len(TIME_WINDOWS) * 5} (–¥–ª—è –æ–∫–æ–Ω {TIME_WINDOWS})")
print(f"  - –°—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏–µ –ø—Ä–∏–∑–Ω–∞–∫–∏: ~{len([c for c in df.columns if 'zscore' in c or 'percentile' in c or 'ratio' in c or 'cv' in c])}")
print(f"  - Device/OS/IP/Geo –ø—Ä–∏–∑–Ω–∞–∫–∏: –∑–∞–≤–∏—Å–∏—Ç –æ—Ç –Ω–∞–ª–∏—á–∏—è –¥–∞–Ω–Ω—ã—Ö")
print(f"  - –ü—Ä–∏–∑–Ω–∞–∫–∏ –ø–æ–≤–µ–¥–µ–Ω–∏—è: ~10+ (—Å–∫–æ—Ä–æ—Å—Ç—å, –∏–Ω—Ç–µ—Ä–≤–∞–ª—ã, –∞–Ω–æ–º–∞–ª–∏–∏)")
print(f"\n‚úÖ –í—Å–µ –∑–∞–¥–∞—á–∏ Feature Engineering (23-28) –≤—ã–ø–æ–ª–Ω–µ–Ω—ã!")

--- ЗАДАЧА 1: Загрузка и первичная очистка (День 1) ---
✅ Таблицы объединены и очищены. Размер: (13155, 21). Удалено строк (из-за NaT): 0

--- ЗАДАЧА 23: Создание признаков по времени ---
✅ Создано 13 временных признаков: hour, day_of_week, day_of_month, month, is_weekend, is_night, и циклические кодирования

--- ЗАДАЧА 24: Создание признаков по пользователю ---
✅ Создано 8 признаков по пользователю: tx_count, avg/std/max/min amount, отклонения от среднего

--- ЗАДАЧА 25: Создание rolling-window признаков ---
✅ Создано 15 rolling-window признаков для окон ['1h', '12h', '24h']

--- ЗАДАЧА 26: Создание статистических признаков ---
‚úÖ –°–æ–∑–¥–∞–Ω–æ —Å—Ç