In [None]:
#thanks to @ambrosm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
from cycler import cycler
from IPython.display import display
import datetime
import scipy.stats as stats
import warnings
from colorama import Fore, Back, Style
import gc
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import CalibrationDisplay

# Notebook-wide matplotlib styling, applied in one update:
#   * light grey axes background,
#   * colour cycle = a custom blue followed by the current cycle from its 4th colour on,
#   * blue text.
plt.rcParams.update({
    'axes.facecolor': '#e8e3e3',
    'axes.prop_cycle': cycler(
        color=['#0c7bdc'] + plt.rcParams['axes.prop_cycle'].by_key()['color'][3:]
    ),
    'text.color': 'b',
})

# Switch for test-set inference; leave it False to only cross-validate.
INFERENCE = False

In [3]:
# @yunchonggan's fast metric implementation
# From https://www.kaggle.com/competitions/amex-default-prediction/discussion/328020
def amex_metric(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Compute the AMEX default-prediction competition metric.

    The metric is the mean of two terms:
      * ``d`` - the share of all positives captured in the top 4% of rows
        (ranked by prediction, with negatives weighted 20x), and
      * ``g`` - the weighted Gini coefficient normalised by its maximum.

    Parameters
    ----------
    y_true : array-like of 0/1 labels (NumPy array, list or pandas Series).
    y_pred : array-like of scores, same length as ``y_true``; higher means
        more likely positive. Only the ranking of the scores matters.

    Returns
    -------
    float
        ``0.5 * (g + d)``. The result is NaN/inf if ``y_true`` contains no
        positives or no negatives, since both terms divide by those counts.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not hold the same number of elements.
    """
    # Coerce to flat NumPy arrays so the fancy indexing below is positional.
    # A pandas Series would be indexed by *label*, which fails or misaligns
    # whenever its index is not 0..n-1 (e.g. a cross-validation fold slice).
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape[0] != y_pred.shape[0]:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {y_true.shape[0]} and {y_pred.shape[0]}"
        )

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # sorting by descending prediction values
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # filter the top 4% by cumulative row weights (negatives weigh 20, positives 1)
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)

def lgb_amex_metric(y_true, y_pred):
    """Wrap `amex_metric` in lightgbm's calling convention.

    Returns the triple (metric name, metric value, higher-is-better flag).
    """
    metric_value = amex_metric(y_true, y_pred)
    higher_is_better = True
    return 'amex', metric_value, higher_is_better

# Column names used to select data from the preprocessed CSVs: "target" comes first,
# followed by feature names. The loading cell below only uses a leading slice of this list.
# NOTE(review): presumably ordered by feature importance from an earlier model - confirm.
important_feat = ["target",'D_46_last', 'D_43_avg', 'D_43_last', 'S_3_last', 'D_46_avg', 'D_46_min', 'B_3_last', 'D_48_last', 'P_3_avg', 'B_5_last', 'P_3_last', 'S_3_avg', 'D_43_max', 'B_4_max', 'S_7_min', 'S_9_avg', 'B_4_last', 'B_17_last', 'D_46_max', 'P_3_max', 'S_3_max', 'c_PR_21', 'B_17_min', 'D_61_last', 'S_7_last', 'D_62_min', 'P_2_min', 'B_2_last', 'c_DP_239', 'R_27_min', 'S_9_min', 'B_1_last', 'D_47_avg', 'c_DP_348', 'S_12_avg', 'B_17_max', 'S_23_max', 'D_77_max', 'D_105_max', 'S_9_last', 'R_27_max', 'D_77_avg', 'S_27_max', 'D_62_last', 'S_7_max', 'R_27_avg', 'S_3_min', 'P_3_min', 'D_48_min', 'P_2_avg', 'S_12_min', 'D_121_avg', 'R_27_last', 'S_27_last', 'B_3_max', 'c_PB_29', 'D_61_max', 'S_26_last', 'S_25_max', 'P_2_last', 'D_47_max', 'D_69_avg', 'c_DP_355', 'S_5_last', 'B_15_last', 'S_19_last', 'B_24_last', 'D_48_max', 'B_17_avg', 'S_11_avg', 'B_11_last', 'S_25_last', 'D_52_last', 'B_10_last', 'D_39_max', 'S_23_avg', 'B_5_min', 'a_DP_239', 'B_9_avg', 'D_69_last', 'S_26_max', 'D_61_avg', 'R_1_last', 'P_2_max', 'S_7_avg', 'D_62_avg', 'D_47_last', 'B_40_max', 'R_1_avg', 'D_121_max', 'D_119_min', 'S_12_last', 'B_14_last', 'D_121_last', 'S_25_min', 'S_15_avg', 'B_9_max', 'B_37_last', 'R_3_avg', 'D_102_max', 'B_2_min', 'S_23_last', 'B_28_min', 'D_133_max', 'R_6_last', 'D_118_min', 'B_9_last', 'S_16_last', 'S_16_max', 'D_133_avg', 'B_5_max', 'B_21_avg', 'D_60_last', 'B_26_last', 'S_12_max', 'D_48_avg', 'D_52_max', 'S_23_min', 'S_24_last', 'D_42_avg', 'D_121_min', 'B_7_last', 'D_58_max', 'D_71_last', 'B_36_last', 'S_5_max', 'B_21_max', 'S_22_last', 'B_18_last', 'D_133_last', 'D_144_max', 'S_5_avg', 'S_5_min', 'R_1_max', 'D_144_avg', 'D_42_min', 'B_15_min', 'B_24_max', 'D_59_avg', 'R_1_min', 'B_12_avg', 'D_55_min', 'D_105_last', 'D_144_min', 'S_26_avg', 'S_25_avg', 'D_60_max', 'D_115_min', 'S_16_avg', 'D_42_max', 'D_56_min', 'B_2_avg', 'B_8_last', 'S_22_avg', 'B_8_avg', 'B_5_avg', 'D_45_avg', 'D_45_last', 'B_15_max', 'D_119_avg', 'B_21_last']

In [4]:
# Model on the first N_TOP_FEATURES feature names of `important_feat`
# (index 0 of that list is "target", so the features start at index 1).
N_TOP_FEATURES = 20
DATA_DIR = "../data/xgb_preprocessed"

feature_cols = important_feat[1:N_TOP_FEATURES + 1]
train_cols = ["target"] + feature_cols
test_cols = feature_cols + ["customer_ID"]

# read_csv ignores the order of `usecols` and returns columns in file order, so
# reindex explicitly: train and test then expose the features in the same order.
train = pd.read_csv(f"{DATA_DIR}/train_with_target.csv", usecols=train_cols)[train_cols]
test = pd.read_csv(f"{DATA_DIR}/test_data.csv", usecols=test_cols)[test_cols]

In [5]:
# Keep the submission IDs and the training labels aside before those
# columns are removed from the feature frames.
test_IDS = test.loc[:, "customer_ID"]
target = train.loc[:, "target"]

In [6]:
# Leave only feature columns in both frames.
test = test.drop(columns="customer_ID")
train = train.drop(columns="target")

In [7]:
# Sanity check: (rows, feature columns) of the training frame.
# NOTE(review): the recorded output below reports 30 columns, while the loading cell
# selects 21 columns (target + 20 features) - the saved output looks stale; re-run to confirm.
train.shape

(458913, 30)

In [8]:
# NOTE(review): `mice` is not referenced anywhere else in the visible notebook - missing
# values are filled with column means in a later cell. Despite its name, the object is a
# SingleImputer, not a MiceImputer. Confirm whether this cell is still needed.
from autoimpute.imputations import MiceImputer, SingleImputer

mice = SingleImputer()



In [9]:
# Run a full garbage-collection pass; the echoed value is gc.collect()'s return value.
gc.collect()

0

In [10]:
# Mean imputation: every gap, in train and in test alike, is filled with the
# mean of the corresponding *training* column.
for feature in train.columns:
    fill_value = train[feature].mean()
    train[feature] = train[feature].fillna(fill_value)
    test[feature] = test[feature].fillna(fill_value)

In [None]:
from sklearn.svm import NuSVC

score_list = []   # competition metric of each validation fold
y_pred_list = []  # per fold: 1-D array of positive-class probabilities for the test set

# Present the test features in the column order the models are fitted on;
# pd.read_csv(usecols=...) keeps file order, which may differ between the two CSVs.
test_features = test[train.columns]

kf = StratifiedKFold(n_splits=5)
for fold, (idx_tr, idx_va) in enumerate(kf.split(train, target)):
    # Drop references to the previous fold's objects before allocating new ones
    X_tr, X_va, y_tr, y_va, model = None, None, None, None, None
    start_time = datetime.datetime.now()
    X_tr = train.iloc[idx_tr]
    X_va = train.iloc[idx_va]
    # kf.split yields positions, so the labels are selected positionally as well
    y_tr = target.iloc[idx_tr]
    y_va = target.iloc[idx_va]

    # probability=True must be set before fit, otherwise predict_proba is unavailable;
    # random_state pins the shuffling used for the internal probability calibration.
    model = NuSVC(probability=True, random_state=0)
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=UserWarning)
        model.fit(X_tr, y_tr)
    X_tr, y_tr = None, None

    # The metric is rank-based, so score the positive-class probability rather than
    # hard 0/1 labels. `classes_` is sorted, hence column 1 is P(target == 1) for 0/1 labels.
    y_va_pred = model.predict_proba(X_va)[:, 1]
    # Pass a plain array: the fold's Series keeps its original (non 0..n-1) index.
    score = amex_metric(y_va.to_numpy(), y_va_pred)
    print(f"{Fore.GREEN}{Style.BRIGHT}Fold {fold} | {str(datetime.datetime.now() - start_time)[-12:-7]} |"
          f"                Score = {score:.5f}{Style.RESET_ALL}")
    score_list.append(score)

    y_pred_list.append(model.predict_proba(test_features)[:, 1])

# Note: this is the mean of the per-fold scores.
print(f"{Fore.GREEN}{Style.BRIGHT}OOF Score:                       {np.mean(score_list):.5f}{Style.RESET_ALL}")

In [None]:
# Display the customer IDs that will accompany the test predictions.
test_IDS

In [None]:
# Show the shape of each of the five per-fold test prediction arrays.
for fold_idx in range(5):
    fold_pred = y_pred_list[fold_idx]
    print(fold_pred.shape)

In [None]:
# Assemble one column of test predictions per fold ("fold1", "fold2", ...),
# preceded by the customer IDs.
fold_columns = {}
for fold_no, fold_pred in enumerate(y_pred_list, start=1):
    fold_pred = np.asarray(fold_pred)
    if fold_pred.ndim == 2:
        # predict_proba output has one column per class; a DataFrame column needs
        # 1-D data, so keep the last column (the positive class for 0/1 labels).
        fold_pred = fold_pred[:, -1]
    fold_columns[f"fold{fold_no}"] = fold_pred

pred_df = pd.DataFrame(fold_columns)
# Attach the IDs positionally so the result does not depend on the index of test_IDS.
pred_df.insert(0, "customer_ID", np.asarray(test_IDS))

In [None]:
# Write the per-fold test predictions to disk, without the index column.
pred_df.to_csv(
    "svm_submission.csv",
    index=False,
)