In [1]:
# =============================
# 1. IMPORTS
# =============================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score

from catboost import CatBoostClassifier

In [2]:
# =============================
# 2. LOAD DATA
# =============================
# Read the preprocessed train/test tables from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/likitha-preprocessing/train_processed.csv")
test = pd.read_csv("/kaggle/input/likitha-preprocessing/test_processed.csv")

# Column names: the label to predict and the row identifier.
TARGET, IDCOL = "RiskFlag", "ProfileID"

# Training label (cast to int) and feature matrix (label and id removed).
y = train[TARGET].astype(int)
X = train.drop(columns=[TARGET, IDCOL])

# Test ids are kept aside for the submission file; the id column is not
# used as a feature.
test_ID = test[IDCOL]
X_test = test.drop(columns=[IDCOL])

In [3]:
# =============================
# 3. TRAIN/VAL SPLIT
# =============================
# Hold out 20% of the rows for validation. stratify=y asks the splitter to
# stratify on the label, and the fixed random_state makes the split
# repeatable across runs.
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

In [4]:
# =============================
# 4. SCALING
# =============================
# Fit the scaler on the training split only, then apply that same fitted
# transform to the training, validation and test features.
scaler = StandardScaler().fit(X_train)
X_train_s, X_val_s, X_test_s = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

In [5]:
# =============================
# 5. FAST LINEAR SVM
# =============================
# Hinge-loss SGD classifier (the "fast linear SVM" of this notebook).
# class_weight gives label 1 three times the weight of label 0.
svm_settings = {
    "loss": "hinge",
    "alpha": 0.0001,
    "class_weight": {0: 1, 1: 3},
    "max_iter": 2000,
    "random_state": 42,
}
svm = SGDClassifier(**svm_settings)
svm.fit(X_train_s, y_train)

# Decision scores for each split; these become the extra meta feature that
# is appended to the scaled features for CatBoost.
# NOTE(review): the training scores are computed on the same rows the SVM
# was fitted on (in-sample), unlike the validation and test scores.
train_svm_scores, val_svm_scores, test_svm_scores = (
    svm.decision_function(part) for part in (X_train_s, X_val_s, X_test_s)
)

In [6]:
# =============================
# 6. STACK FEATURES FOR CATBOOST
# =============================
# Append the SVM decision score as one more column after the scaled
# features, for each of the three splits.
X_train_stack, X_val_stack, X_test_stack = (
    np.column_stack([features, scores])
    for features, scores in (
        (X_train_s, train_svm_scores),
        (X_val_s, val_svm_scores),
        (X_test_s, test_svm_scores),
    )
)

In [7]:
# =============================
# 7. CATBOOST MODEL
# =============================
# CatBoost hyper-parameters, collected in one place so they are easy to scan.
# NOTE(review): eval_metric is set to "F1", but fit() below receives no
# eval_set.
catboost_params = dict(
    iterations=600,
    depth=8,
    learning_rate=0.05,
    l2_leaf_reg=4,
    loss_function="Logloss",
    eval_metric="F1",
    class_weights=[1, 3.8],
    random_state=42,
    verbose=False,
)
cat = CatBoostClassifier(**catboost_params)

# Train on the stacked training split (scaled features + SVM score).
cat.fit(X_train_stack, y_train)

<catboost.core.CatBoostClassifier at 0x7d10eb8b4e50>

In [8]:
# =============================
# 8. VALIDATION
# =============================
# Second column of predict_proba on the validation split (used as the
# score that gets thresholded below).
val_proba = cat.predict_proba(X_val_stack)[:, 1]

# Scan candidate cut-offs produced by np.arange(0.05, 0.95, 0.01) and keep
# the one with the highest validation F1. The comparison is strict, so on
# ties the earliest (lowest) threshold wins; if no threshold scores above 0
# the defaults (0.5, 0) are kept.
best_t, best_f1 = 0.5, 0
for threshold in np.arange(0.05, 0.95, 0.01):
    score = f1_score(y_val, (val_proba >= threshold).astype(int))
    if score > best_f1:
        best_t, best_f1 = threshold, score

print("BEST THRESHOLD:", best_t)
print("BEST F1:", best_f1)

BEST THRESHOLD: 0.44000000000000006
BEST F1: 0.36524511757672384


In [9]:
# =============================
# 9. TRAIN ON FULL DATA
# =============================
# Build the full-data training matrix with the SAME fitted scaler and SVM
# that produced X_test_stack and the validation features.
#
# The scaler must only be applied here, not refitted: refitting it on X
# would change the feature scaling underneath the already-trained SVM and
# would make the CatBoost training features inconsistent with X_test_stack,
# which was scaled with the train-split statistics. Keeping one scaling
# also keeps the tuned threshold best_t meaningful for the final model.
X_full_s = scaler.transform(X)
full_svm_scores = svm.decision_function(X_full_s)
X_full_stack = np.column_stack([X_full_s, full_svm_scores])

# Refit the CatBoost model (same object, same hyper-parameters) on every
# labelled row before predicting the test set.
cat.fit(X_full_stack, y)

<catboost.core.CatBoostClassifier at 0x7d10eb8b4e50>

In [10]:
# =============================
# 10. TEST PREDICTION
# =============================
# Second column of predict_proba for every test row, then a hard 0/1 label
# using the threshold selected on the validation split.
test_proba = cat.predict_proba(X_test_stack)[:, 1]
test_pred = np.greater_equal(test_proba, best_t).astype(int)

In [11]:
# =============================
# 11. SAVE SUBMISSION
# =============================
# Two-column submission: the test ids and the predicted 0/1 label.
submission_path = "/kaggle/working/svm_catboost_stacked.csv"
submission = pd.DataFrame({"ProfileID": test_ID, "RiskFlag": test_pred})

# Write without the DataFrame index so the file has only the two columns.
submission.to_csv(submission_path, index=False)
print("Saved:", submission_path)

Saved: /kaggle/working/svm_catboost_stacked.csv
