In [1]:
# Cell 1: Imports

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score


In [2]:
# Cell 2: Load the processed datasets

train = pd.read_csv("/kaggle/input/processed-financial-risk/train_processed.csv")
test  = pd.read_csv("/kaggle/input/processed-financial-risk/test_processed.csv")

print("Train shape:", train.shape)
print("Test shape:", test.shape)

train.head()


Train shape: (204277, 27)
Test shape: (51070, 26)


Unnamed: 0,ProfileID,orig_index,ApplicantYears,AnnualEarnings,RequestedSum,TrustMetric,WorkDuration,ActiveAccounts,OfferRate,RepayPeriod,...,RelationshipStatus_Married,RelationshipStatus_Single,OwnsProperty_Yes,FamilyObligation_Yes,FundUseCase_Business,FundUseCase_Education,FundUseCase_Home,FundUseCase_Other,JointApplicant_Yes,RiskFlag
0,DRIRC89L0T,0,-1.699838,1.413785,1.151487,1.711544,-0.967182,-0.44953,-0.454811,1.41572,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0
1,TS0FIUNHNU,1,0.23412,-0.649831,-1.715866,1.094714,-0.851727,-0.44953,0.939092,-0.000645,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0
2,I0YR284A1V,2,-1.166333,0.04677,-0.458437,-0.762072,-1.515594,-0.44953,1.621727,-1.41701,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0
3,WB1T7NQV8A,3,0.634249,-0.839783,1.440049,-0.258537,1.370784,0.445809,0.143437,1.41572,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,J6GU9M4G1Z,4,0.367496,0.845753,-1.488613,1.673779,-1.71764,1.341148,1.656386,-1.41701,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0


In [3]:
# Cell 3: Prepare features (remove ProfileID)

X = train.drop(["RiskFlag", "ProfileID"], axis=1)
y = train["RiskFlag"]

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

X_train.shape, X_valid.shape


((163421, 25), (40856, 25))

In [4]:
# Cell 4: Build pipeline with RobustScaler + L1 feature selection + LinearSVC

pipeline = Pipeline([
    ("scaler", RobustScaler()),

    # L1-based feature selection (sparse model)
    ("feature_select", SelectFromModel(
        LinearSVC(C=0.1, penalty="l1", dual=False, random_state=42)
    )),

    # Final SVM classifier
    ("svm", LinearSVC(random_state=42))
])


In [5]:
# Cell 5: Extended hyperparameter tuning

param_grid = {
    "svm__C": [0.01, 0.1, 1, 10],
    "svm__loss": ["squared_hinge"],
    "svm__dual": [True, False]
}

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=5,
    verbose=1,
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best CV ROC AUC:", grid.best_score_)


Fitting 5 folds for each of 8 candidates, totalling 40 fits




Best parameters: {'svm__C': 10, 'svm__dual': True, 'svm__loss': 'squared_hinge'}
Best CV ROC AUC: 0.746306153347404




In [6]:
# Cell 6: Validation ROC-AUC

best_model = grid.best_estimator_

valid_scores = best_model.decision_function(X_valid)
auc = roc_auc_score(y_valid, valid_scores)

print("Validation ROC-AUC:", auc)


Validation ROC-AUC: 0.7462601405411162


In [7]:
# Cell 7: Train final SVM on all data

final_model = grid.best_estimator_
final_model.fit(X, y)




In [8]:
# Cell 8: Generate predictions (0/1)

test_features = test.drop("ProfileID", axis=1)

test_scores = final_model.decision_function(test_features)
test_preds = (test_scores > 0).astype(int)   # convert to 0/1


In [9]:
# Cell 9: Create final submission CSV

submission = pd.DataFrame({
    "ProfileID": test["ProfileID"],
    "RiskFlag": test_preds
})

submission_path = "svm_upgraded_submission.csv"
submission.to_csv(submission_path, index=False)

submission.head(), submission_path


(    ProfileID  RiskFlag
 0  CKV34LU7V7         0
 1  62KTYNH93J         0
 2  JGFUSOIUH7         0
 3  4538THBHOX         0
 4  DXLNA06JHR         0,
 'svm_upgraded_submission.csv')

In [10]:
# # Cell 10: save model for reuse
# joblib.dump(final_model, "svm_model.pkl")
# print("Model saved to svm_model.pkl")
