Imports and reading data

In [5]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

In [10]:
# Read the train and test feature tables from Parquet files in the working
# directory; both names are used by the cells that follow.
train_data, test_data = (
    pd.read_parquet(path)
    for path in ('./train_data.parquet', './test_data.parquet')
)
# Preview the first rows of the training table.
train_data.head()

: 

: 

Preparing custom metric

In [2]:
def metric(y_true : pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Return the mean of a normalized weighted Gini and a top-4% capture rate.

    Rows with ``target == 0`` carry weight 20 and all other rows weight 1 in
    both components.

    Args:
        y_true: Frame with a ``target`` column.
        y_pred: Frame with a ``prediction`` column. It is joined to ``y_true``
            column-wise, so the two frames are aligned on their index.

    Returns:
        ``0.5 * (normalized_gini + top_four_percent_capture)``.
    """

    def _ranked_with_weights(labels: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join labels and scores, rank by descending score, attach row weights.
        ranked = pd.concat([labels, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].map(lambda t: 20 if t == 0 else 1)
        return ranked

    def _capture_rate(labels: pd.DataFrame, scores: pd.DataFrame) -> float:
        # Share of all positives found among the highest-scored rows whose
        # cumulative weight stays within 4% of the total weight.
        ranked = _ranked_with_weights(labels, scores)
        weight_budget = int(0.04 * ranked['weight'].sum())
        inside_budget = ranked['weight'].cumsum() <= weight_budget
        captured = (ranked.loc[inside_budget, 'target'] == 1).sum()
        return captured / (ranked['target'] == 1).sum()

    def _weighted_gini(labels: pd.DataFrame, scores: pd.DataFrame) -> float:
        # Weighted sum of the gap between the cumulative share of positives
        # found and the cumulative share of weight seen, in ranked order.
        ranked = _ranked_with_weights(labels, scores)
        weights = ranked['weight']
        weighted_pos = ranked['target'] * weights
        random_share = (weights / weights.sum()).cumsum()
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random_share) * weights).sum()

    # Normalize by the Gini obtained when the labels themselves are the scores.
    ideal_scores = y_true.rename(columns={'target': 'prediction'})
    gini_ratio = _weighted_gini(y_true, y_pred) / _weighted_gini(y_true, ideal_scores)
    capture = _capture_rate(y_true, y_pred)
    return .5 * (gini_ratio + capture)

Preprocessing 

In [3]:
# Column names of the test table, as a plain Python list.
features = test_data.columns.tolist()

# Categorical columns: each base name with a "_last" suffix appended.
cat_features = [
    f"{base_name}_last"
    for base_name in (
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    )
]
cat_features

['B_30_last',
 'B_38_last',
 'D_114_last',
 'D_116_last',
 'D_117_last',
 'D_120_last',
 'D_126_last',
 'D_63_last',
 'D_64_last',
 'D_66_last',
 'D_68_last']

In [None]:
# Integer-encode every categorical column in both tables.
# The encoder is fitted on the train and test values together: fitting on
# train alone makes `transform` raise ValueError for any category that only
# occurs in the test table.
enc = LabelEncoder()
for feat in cat_features:
    enc.fit(pd.concat([train_data[feat], test_data[feat]], ignore_index=True))
    train_data[feat] = enc.transform(train_data[feat])
    test_data[feat] = enc.transform(test_data[feat])
# Show the resulting dtypes so the encoded columns can be checked.
train_data.dtypes

In [None]:
# Keep the label as a one-column frame (the custom metric expects a frame with
# a 'target' column) and use every remaining column as a feature.
train_y = train_data[['target']]
train_x = train_data.drop(columns='target')

Model Training

In [None]:
# Number of stratified CV folds; also the divisor used to average the
# per-fold test predictions.
no_folds = 5
skf = StratifiedKFold(n_splits=no_folds, shuffle=True, random_state=27)

# Out-of-fold probabilities for the training rows and fold-averaged
# probabilities for the test rows.
y_preds = np.zeros(train_x.shape[0])
y_test = np.zeros(test_data.shape[0])

# Score the test table on exactly the training columns, in training order, so
# that any extra column on `test_data` (e.g. a 'prediction' column added by a
# later cell before a re-run) is not fed to the model as a feature.
test_x = test_data[train_x.columns]

# `feat_idx` holds the positions of the fold's training rows and `label_idx`
# the positions of its held-out validation rows.
for folds, (feat_idx, label_idx) in enumerate(skf.split(train_x, train_y), start=1):
    print(f"Fold {folds}/{no_folds} ***********************************************************")
    tr_x = train_x.iloc[feat_idx].reset_index(drop=True)
    tst_x = train_x.iloc[label_idx].reset_index(drop=True)
    tr_y = train_y.iloc[feat_idx].reset_index(drop=True)
    tst_y = train_y.iloc[label_idx].reset_index(drop=True)

    clf = CatBoostClassifier(iterations=5000, random_state=27)
    # eval_set must be an (X, y) pair; a two-element list would be read as two
    # separate evaluation sets.
    clf.fit(tr_x, tr_y, eval_set=(tst_x, tst_y), cat_features=cat_features, verbose=100)

    # Each row is held out in exactly one fold, so this fills its OOF slot once.
    preds = clf.predict_proba(tst_x)[:, 1]
    y_preds[label_idx] = y_preds[label_idx] + preds

    # Accumulate the mean test probability across folds.
    preds_test = clf.predict_proba(test_x)[:, 1]
    y_test = y_test + preds_test / no_folds

# Wrap the OOF probabilities in a frame with a 'prediction' column that shares
# train_y's index, as the custom metric joins the two frames column-wise.
y_pred = train_y.copy(deep=True)
y_pred = y_pred.rename(columns={'target': 'prediction'})
y_pred['prediction'] = y_preds

score = metric(train_y, y_pred)
print(f"Score = {score}")

Submitting prediction

In [None]:
# Attach the fold-averaged probabilities to the test table, then export only
# that column; index=True writes the table's index as the first CSV column.
test_data["prediction"] = y_test
test_data.loc[:, "prediction"].to_csv("submission_1.csv", index=True)