In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel

import lightgbm as lgb
from xgboost import XGBClassifier


In [2]:
train = pd.read_csv("/kaggle/input/processed-financial-risk/train_processed.csv")
test  = pd.read_csv("/kaggle/input/processed-financial-risk/test_processed.csv")

print(train.shape, test.shape)
train.head()


(204277, 27) (51070, 26)


Unnamed: 0,ProfileID,orig_index,ApplicantYears,AnnualEarnings,RequestedSum,TrustMetric,WorkDuration,ActiveAccounts,OfferRate,RepayPeriod,...,RelationshipStatus_Married,RelationshipStatus_Single,OwnsProperty_Yes,FamilyObligation_Yes,FundUseCase_Business,FundUseCase_Education,FundUseCase_Home,FundUseCase_Other,JointApplicant_Yes,RiskFlag
0,DRIRC89L0T,0,-1.699838,1.413785,1.151487,1.711544,-0.967182,-0.44953,-0.454811,1.41572,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0
1,TS0FIUNHNU,1,0.23412,-0.649831,-1.715866,1.094714,-0.851727,-0.44953,0.939092,-0.000645,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0
2,I0YR284A1V,2,-1.166333,0.04677,-0.458437,-0.762072,-1.515594,-0.44953,1.621727,-1.41701,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0
3,WB1T7NQV8A,3,0.634249,-0.839783,1.440049,-0.258537,1.370784,0.445809,0.143437,1.41572,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,J6GU9M4G1Z,4,0.367496,0.845753,-1.488613,1.673779,-1.71764,1.341148,1.656386,-1.41701,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0


In [3]:
X = train.drop(["ProfileID", "RiskFlag"], axis=1)
y = train["RiskFlag"]

X_test = test.drop("ProfileID", axis=1)


In [4]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

oof_svm = np.zeros(len(train))
oof_lgb = np.zeros(len(train))
oof_xgb = np.zeros(len(train))

test_svm = np.zeros(len(test))
test_lgb = np.zeros(len(test))
test_xgb = np.zeros(len(test))


In [5]:
svm_model = Pipeline([
    ("scaler", RobustScaler()),
    ("feature_select", SelectFromModel(
        LinearSVC(C=0.1, penalty="l1", dual=False, random_state=42)
    )),
    ("svm", LinearSVC(C=10, random_state=42))
])


In [6]:
lgb_model = lgb.LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)


In [7]:
xgb_model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)


In [8]:
for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):
    print(f"FOLD {fold+1}")

    X_tr, X_val = X.iloc[train_idx], X.iloc[valid_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[valid_idx]

    # ----- SVM -----
    svm_model.fit(X_tr, y_tr)
    oof_svm[valid_idx] = svm_model.decision_function(X_val)
    test_svm += svm_model.decision_function(X_test) / kf.n_splits

    # ----- LightGBM -----
    lgb_model.fit(X_tr, y_tr)
    oof_lgb[valid_idx] = lgb_model.predict_proba(X_val)[:,1]
    test_lgb += lgb_model.predict_proba(X_test)[:,1] / kf.n_splits

    # ----- XGBoost -----
    xgb_model.fit(X_tr, y_tr)
    oof_xgb[valid_idx] = xgb_model.predict_proba(X_val)[:,1]
    test_xgb += xgb_model.predict_proba(X_test)[:,1] / kf.n_splits


FOLD 1




[LightGBM] [Info] Number of positive: 18970, number of negative: 144451
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005715 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1571
[LightGBM] [Info] Number of data points in the train set: 163421, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116081 -> initscore=-2.030082
[LightGBM] [Info] Start training from score -2.030082
FOLD 2




[LightGBM] [Info] Number of positive: 19028, number of negative: 144393
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004876 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1571
[LightGBM] [Info] Number of data points in the train set: 163421, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116435 -> initscore=-2.026627
[LightGBM] [Info] Start training from score -2.026627
FOLD 3




[LightGBM] [Info] Number of positive: 19005, number of negative: 144417
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005065 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1571
[LightGBM] [Info] Number of data points in the train set: 163422, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116294 -> initscore=-2.028003
[LightGBM] [Info] Start training from score -2.028003
FOLD 4




[LightGBM] [Info] Number of positive: 18998, number of negative: 144424
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005023 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1571
[LightGBM] [Info] Number of data points in the train set: 163422, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116251 -> initscore=-2.028420
[LightGBM] [Info] Start training from score -2.028420
FOLD 5




[LightGBM] [Info] Number of positive: 19011, number of negative: 144411
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004958 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1571
[LightGBM] [Info] Number of data points in the train set: 163422, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116331 -> initscore=-2.027646
[LightGBM] [Info] Start training from score -2.027646


In [9]:
print("OOF SVM AUC:", roc_auc_score(y, oof_svm))
print("OOF LGB AUC:", roc_auc_score(y, oof_lgb))
print("OOF XGB AUC:", roc_auc_score(y, oof_xgb))


OOF SVM AUC: 0.7462171070903251
OOF LGB AUC: 0.7495146099538672
OOF XGB AUC: 0.7503889606396836


In [10]:
stack_train = pd.DataFrame({
    "svm": oof_svm,
    "lgb": oof_lgb,
    "xgb": oof_xgb
})

stack_test = pd.DataFrame({
    "svm": test_svm,
    "lgb": test_lgb,
    "xgb": test_xgb
})


In [11]:
meta_model = LogisticRegression(max_iter=500)

meta_model.fit(stack_train, y)

meta_oof = meta_model.predict_proba(stack_train)[:,1]

print("META AUC:", roc_auc_score(y, meta_oof))


META AUC: 0.7514243943859067


In [12]:
final_preds = meta_model.predict_proba(stack_test)[:,1]
final_binary = (final_preds > 0.5).astype(int)


In [13]:
submission = pd.DataFrame({
    "ProfileID": test["ProfileID"],
    "RiskFlag": final_binary
})

submission.to_csv("svm_lgb_xgb_stacked.csv", index=False)

submission.head()


Unnamed: 0,ProfileID,RiskFlag
0,CKV34LU7V7,0
1,62KTYNH93J,0
2,JGFUSOIUH7,0
3,4538THBHOX,0
4,DXLNA06JHR,0
