
# Task 5 – Account Security Monitoring (Anomaly Detection)

In [None]:

# ================================================
# 2. Imports and Path Definitions

import os
from pathlib import Path
import datetime
import json

import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM

# Root folder that holds every competition input file.
BASE_DIR = Path("/content/data")

# Output folder for this task's artifacts (submission.csv, RUN_INFO.txt).
# parents=True: if BASE_DIR itself is missing, mkdir would otherwise raise
# FileNotFoundError before the existence report below gets a chance to print.
AD_DIR = BASE_DIR / "anomalyDetection"
AD_DIR.mkdir(parents=True, exist_ok=True)

# Input files expected directly under BASE_DIR.
PATH_SAMPLE_SUB = BASE_DIR / "sample_submission.csv"
PATH_TEST = BASE_DIR / "test.csv"
PATH_BASELINE = BASE_DIR / "baseLine.csv"

# (train, val) locations probed in order; the first pair where both files
# exist is the one used by the policy-enforcement step.
TRAIN_VAL_CANDIDATES = [
    (BASE_DIR / "src" / "train.csv", BASE_DIR / "src" / "val.csv"),
    (BASE_DIR / "train.csv", BASE_DIR / "val.csv"),
]

TARGET_COL = "is_anomaly"  # from datacard
ID_COLS_CANDIDATES = ["id", "player_id"]  # we will auto-detect later

# Quick sanity report so a missing input is visible before any loading starts.
print("AD_DIR:", AD_DIR)
print("Sample submission:", PATH_SAMPLE_SUB.exists())
print("Test:", PATH_TEST.exists())
print("Baseline:", PATH_BASELINE.exists())

AD_DIR: /content/data/anomalyDetection
Sample submission: True
Test: True
Baseline: True


In [ ]:
# IMPORTANT: Safe JSON dumps for numpy types
import json

def json_safe_dumps(obj, **kwargs):
    """Serialize ``obj`` to a JSON string, tolerating NumPy values.

    Args:
        obj: Any object graph; values the standard encoder cannot handle are
            converted by the ``default`` hook below.
        **kwargs: Forwarded unchanged to ``json.dumps`` (e.g. ``indent``).

    Returns:
        str: The JSON text.

    Conversion rules for otherwise unserializable values:
        * NumPy bool / integer / floating scalars -> ``bool`` / ``int`` / ``float``
        * ``np.ndarray`` -> nested lists via ``tolist()``
        * objects exposing ``__dict__`` -> that attribute dict
        * anything else -> ``str(o)``
    """
    def default(o):
        # NumPy scalars and arrays become their native Python equivalents.
        if isinstance(o, np.bool_):
            return bool(o)
        if isinstance(o, np.integer):
            return int(o)
        if isinstance(o, np.floating):
            return float(o)
        if isinstance(o, np.ndarray):
            return o.tolist()
        # Best-effort fallback: attribute dict when available, else the
        # string form (covers slot-based objects without ``__dict__``).
        try:
            return o.__dict__
        except Exception:
            return str(o)

    # Delegate to the standard encoder. Calling json_safe_dumps here (as the
    # previous version did) recursed without end and raised RecursionError.
    return json.dumps(obj, default=default, **kwargs)

In [None]:

# ================================================
# 3. Policy Enforcement: Require Train/Val for Task 5

# Pick the first candidate pair for which BOTH files exist on disk;
# (None, None) when no pair qualifies.
train_path, val_path = next(
    (
        (t_path, v_path)
        for t_path, v_path in TRAIN_VAL_CANDIDATES
        if t_path.exists() and v_path.exists()
    ),
    (None, None),
)

if train_path is None or val_path is None:
    # List the locations that were actually probed, so the message cannot
    # drift away from TRAIN_VAL_CANDIDATES.
    expected_pairs = "\n".join(
        f" - {t_path} and {v_path}" for t_path, v_path in TRAIN_VAL_CANDIDATES
    )
    raise SystemExit(
        "❌ Policy Enforcement: No Train/Val pair found.\n"
        "Expected one of:\n"
        f"{expected_pairs}\n"
        "Please make sure one of these pairs exists before running the notebook."
    )
else:
    print(f"✅ Using train file: {train_path}")
    print(f"✅ Using val file  : {val_path}")

✅ Using train file: /content/data/src/train.csv
✅ Using val file  : /content/data/src/val.csv


In [None]:

# ================================================
# 4. Load Train/Val, Test, Baseline, Sample Submission

# Train/Val splits selected by the policy-enforcement step (read in that order).
train_df, val_df = (pd.read_csv(csv_path) for csv_path in (train_path, val_path))

print("Train shape:", train_df.shape)
print("Val   shape:", val_df.shape)

# Test rows to be scored for the task.
test_df = pd.read_csv(PATH_TEST)
print("Test  shape:", test_df.shape)

# Baseline file; its shape and column names are reported for inspection.
baseline_df = pd.read_csv(PATH_BASELINE)
print("Baseline shape:", baseline_df.shape)
print("Baseline columns:", list(baseline_df.columns))

# Sample submission: the authoritative template for id order + columns.
sample_sub_df = pd.read_csv(PATH_SAMPLE_SUB)
print("Sample submission shape:", sample_sub_df.shape)
print("Sample submission columns:", list(sample_sub_df.columns))

Train shape: (200, 11)
Val   shape: (50, 11)
Test  shape: (25889, 124)
Baseline shape: (25889, 6)
Baseline columns: ['id', 'task1', 'task2', 'task3', 'task4', 'task5']
Sample submission shape: (25889, 6)
Sample submission columns: ['id', 'task1', 'task2', 'task3', 'task4', 'task5']


In [None]:
# ================================================
# 5. Feature Selection & Preprocessing
#    - NOTE: Train/Val (f1–f10) schema != Test schema

# Stack Train and Val into one frame (used here only for ID-column detection).
full_train = pd.concat([train_df, val_df], axis=0, ignore_index=True)

# Detect ID column if present (not used as a feature): first candidate that
# appears in either the Train/Val frame or the Test frame.
id_col = next(
    (
        c
        for c in ID_COLS_CANDIDATES
        if c in full_train.columns or c in test_df.columns
    ),
    None,
)

print("Detected ID column:", id_col)

# Identifier and label columns must never reach the detectors, even if they
# happen to be stored as numbers; previously nothing enforced this.
non_feature_cols = {TARGET_COL}
if id_col is not None:
    non_feature_cols.add(id_col)

feature_cols = [
    col for col in test_df.columns
    if col not in non_feature_cols
    and np.issubdtype(test_df[col].dtype, np.number)
]

print("Using TEST feature space for unsupervised anomaly detection")
print("Number of feature columns:", len(feature_cols))
print("First 10 features:", feature_cols[:10])

if not feature_cols:
    raise KeyError("❌ No numeric feature columns found in test_df for Task 5.")

# Median imputation followed by robust scaling.
preprocessor = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", RobustScaler()),
    ]
)

# NOTE: the preprocessor is fit on the Test features themselves.
X_test = test_df[feature_cols]
X_test_trans = preprocessor.fit_transform(X_test)

print("Transformed Test shape:", X_test_trans.shape)

Detected ID column: id
Using TEST feature space for unsupervised anomaly detection
Number of feature columns: 122
First 10 features: ['login_count_1', 'login_count_2', 'login_count_3', 'login_count_4', 'login_lat_1', 'login_lon_1', 'login_lat_2', 'login_lon_2', 'login_lat_3', 'login_lon_3']
Transformed Test shape: (25889, 122)


In [None]:
# ================================================
# 6. Train Two Unsupervised Models from Scratch

# Shared hyper-parameters for the two detectors.
RANDOM_STATE = 42
CONTAMINATION = 0.10  # expected anomaly proportion (tunable)
NU = 0.10             # OneClassSVM outlier fraction (tunable)

# --- Detector 1: IsolationForest -------------------------------------
iso_params = {
    "n_estimators": 300,
    "max_samples": "auto",
    "contamination": CONTAMINATION,
    "random_state": RANDOM_STATE,
    "n_jobs": -1,
}
iso_model = IsolationForest(**iso_params)

print("Fitting IsolationForest on TEST feature space (unsupervised)...")
iso_model.fit(X_test_trans)

# --- Detector 2: OneClassSVM -----------------------------------------
ocsvm_params = {"kernel": "rbf", "gamma": "scale", "nu": NU}
ocsvm_model = OneClassSVM(**ocsvm_params)

print("Fitting OneClassSVM on TEST feature space (unsupervised)...")
ocsvm_model.fit(X_test_trans)

print("✅ Both models trained from scratch on TEST feature space (no labels, no pretraining, no AutoML).")

Fitting IsolationForest on TEST feature space (unsupervised)...
Fitting OneClassSVM on TEST feature space (unsupervised)...
✅ Both models trained from scratch on TEST feature space (no labels, no pretraining, no AutoML).


In [None]:
# ================================================
# 7. Inference on Test & Score Normalization

# Each detector's decision_function output is negated before use, so the
# stored score has the opposite sign of the raw decision value.
iso_raw = -iso_model.decision_function(X_test_trans)
ocsvm_raw = -ocsvm_model.decision_function(X_test_trans)

# One column per detector, row-aligned with test_df.
test_scores = pd.DataFrame({"iso": iso_raw, "ocsvm": ocsvm_raw}, index=test_df.index)

print("Raw test score summary:")
print(test_scores.describe())

# Rescale each detector's scores to [0, 1] so the two columns are comparable
# before averaging.
score_scaler = MinMaxScaler()
test_scores_scaled = score_scaler.fit_transform(test_scores)

# Soft ensemble: per-row mean of the two normalized scores.
ensemble_scores = np.mean(test_scores_scaled, axis=1)

print("Ensembled score summary (before thresholding):")
print(pd.Series(ensemble_scores).describe())

Raw test score summary:
                iso         ocsvm
count  25889.000000  25889.000000
mean      -0.021032    -45.698603
std        0.016932     49.815409
min       -0.076323   -197.229850
25%       -0.032538    -74.964931
50%       -0.022001    -50.650916
75%       -0.011120    -25.887511
max        0.080395    211.064094
Ensembled score summary (before thresholding):
count    25889.000000
mean         0.361971
std          0.108598
min          0.000000
25%          0.291539
50%          0.353879
75%          0.417181
max          0.925889
dtype: float64


In [None]:

# ================================================
# 8. Soft Ensemble -> Binary Anomaly Label (task5)

# Heuristic cut-off: the (1 - CONTAMINATION) quantile of the ensemble score,
# so roughly the top CONTAMINATION fraction of rows is flagged.
anomaly_quantile = 1 - CONTAMINATION
threshold = np.quantile(ensemble_scores, anomaly_quantile)

print(f"Using contamination={CONTAMINATION:.3f}")
print(f"Anomaly threshold on ensemble score: {threshold:.6f}")

# 1 = anomaly (score at or above the cut-off), 0 = normal.
task5_pred = np.greater_equal(ensemble_scores, threshold).astype(int)

label_names = {0: "normal(0)", 1: "anomaly(1)"}
label_counts = pd.Series(task5_pred).value_counts()

print("Predicted anomaly label distribution (task5):")
print(label_counts.rename(index=label_names))

Using contamination=0.100
Anomaly threshold on ensemble score: 0.484664
Predicted anomaly label distribution (task5):
normal(0)     23300
anomaly(1)     2589
Name: count, dtype: int64


In [None]:

# ================================================
# 9. Combine Task 1–4 Predictions from Baseline with Task 5 Results

required_cols = ["id", "task1", "task2", "task3", "task4", "task5"]

# Align baseline with sample submission by id; the template dictates row order.
merged = sample_sub_df[["id"]].merge(
    baseline_df[required_cols],
    on="id",
    how="left",
    validate="one_to_one",
)

missing_rows = merged["task1"].isna().sum()
if missing_rows > 0:
    print(f"⚠️ Warning: {missing_rows} rows missing tasks 1–4/5 from baseline. "
          "They will remain NaN in those columns.")

if len(merged) != len(task5_pred):
    raise ValueError(
        f"Length mismatch between submission template ({len(merged)}) "
        f"and task5 predictions ({len(task5_pred)})."
    )

# task5_pred follows test_df row order, whereas `merged` follows the sample
# submission order. Join on id whenever both sides carry the same unique ids,
# so a differently ordered test file cannot attach labels to the wrong rows.
test_ids = test_df["id"] if "id" in test_df.columns else None
ids_match = (
    test_ids is not None
    and test_ids.is_unique
    and merged["id"].is_unique
    and set(test_ids) == set(merged["id"])
)

if ids_match:
    task5_by_id = pd.Series(task5_pred, index=test_ids.to_numpy())
    merged["task5"] = task5_by_id.reindex(merged["id"].to_numpy()).to_numpy()
else:
    # Ids unavailable or not comparable: keep the positional assignment, but
    # say so, since it silently relies on identical row order in both files.
    print("⚠️ Warning: could not align task5 predictions by id; "
          "assuming test.csv rows are in sample submission order.")
    merged["task5"] = task5_pred

submission_df = merged[required_cols].copy()

print(submission_df.head())
print("Final submission shape:", submission_df.shape)

         id  task1  task2          task3  task4  task5
0  ANS00001      1      2     496.056891      0      0
1  ANS00002      0      1     904.887028      2      0
2  ANS00003      1      3  148248.266429      3      0
3  ANS00004      0      2      79.755910      0      0
4  ANS00005      1      0     260.016682      3      0
Final submission shape: (25889, 6)


In [None]:

# ================================================
# 10. Export submission.csv

# Write the assembled submission frame into the task output folder,
# without the pandas row index.
submission_path = AD_DIR.joinpath("submission.csv")
submission_df.to_csv(path_or_buf=submission_path, index=False)

print(f"✅ Saved submission to: {submission_path}")

✅ Saved submission to: /content/data/anomalyDetection/submission.csv


In [None]:

# ================================================
# 11. Save run info (RUN_INFO.txt)

# Provenance record for this run. Every value is cast to a plain Python type
# so the file stays readable JSON.
run_info = {
    "timestamp": datetime.datetime.now().isoformat(),
    "train_path": str(train_path),
    "val_path": str(val_path),
    "test_path": str(PATH_TEST),
    "sample_submission_path": str(PATH_SAMPLE_SUB),
    "baseline_path": str(PATH_BASELINE),
    "feature_columns": feature_cols,
    "models": {
        "IsolationForest": {
            "n_estimators": int(iso_model.n_estimators),
            "contamination": float(CONTAMINATION),
            "random_state": int(RANDOM_STATE),
        },
        "OneClassSVM": {
            "kernel": "rbf",
            "gamma": "scale",
            "nu": float(NU),
        },
    },
    "preprocessing": {
        "imputer": "SimpleImputer(strategy='median')",
        "scaler": "RobustScaler()",
    },
    "ensemble": {
        "type": "soft-voting (mean of normalized anomaly scores)",
        # The scaler is fit on the Test score table in this notebook; the
        # earlier text claimed Train+Val, which never happens here.
        "score_scaler": "MinMaxScaler(fit on Test anomaly scores)",
        "contamination": float(CONTAMINATION),
        "threshold": float(threshold),
    },
    "notes": [
        "Task 5 anomaly detection trained from scratch.",
        "No pre-trained models used.",
        "No AutoML frameworks used.",
        # Corrected: the previous note stated that nothing was fit on Test,
        # which contradicts the preprocessing, training and scoring cells.
        "Preprocessor, models and score scaler were fit on unlabeled Test "
        "features/scores (Train/Val schema differs from Test schema; no labels used).",
    ],
}

run_info_path = AD_DIR / "RUN_INFO.txt"
with open(run_info_path, "w") as f:
    f.write(json_safe_dumps(run_info, indent=2))

print(f"✅ Saved RUN_INFO.txt to: {run_info_path}")

✅ Saved RUN_INFO.txt to: /content/data/anomalyDetection/RUN_INFO.txt



---

## (Optional) Autoencoder – Deep Anomaly Detector (Commented Out)

> **Note:** The competition rules allow **Task 5** models to be trained **from scratch only**.  
> If you want to use an **Autoencoder (PyTorch or Keras)**, you **must train it only on Train/Val features** and **never fit on Test data**.

Below is a **template**, fully commented out, that you can adapt:
- Build a small fully-connected Autoencoder in PyTorch or Keras.
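- Train it on `X_full_trans` (the preprocessed Train+Val feature matrix, which you must build first — the cells above only create `X_test_trans`) to reconstruct normal behavior.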
- Use reconstruction error as an anomaly score on Train and Test.
- Then ensemble it with IsolationForest + OneClassSVM if desired.

Uncomment and modify only if you are sure it still follows all policies.
