# Imports

In [2]:
%%time

import os
import subprocess
import sys

import numpy as np
import pandas as pd

# Restrict the process to the first two GPUs. This is set before AbdML is
# imported so that any GPU library it loads sees the restriction.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

# Clone the helper library only when it is not already present. The previous
# unconditional `!git clone` printed
#   "fatal: destination path 'AbdML' already exists and is not an empty directory."
# on every re-run, and any real clone failure was silently ignored.
# NOTE(review): the clone is not pinned to a commit/tag, so results may change
# when the upstream repository changes.
ABDML_REPO = "https://github.com/muhammadabdullah0303/AbdML"
if not os.path.isdir("AbdML"):
    subprocess.run(["git", "clone", ABDML_REPO], check=True)

# NOTE(review): the clone above lands in ./AbdML, not in 'repository'; this
# path entry looks unused - confirm before removing.
sys.path.append('/kaggle/working/repository')

from AbdML.main import AbdBase

# Single seed passed to AbdBase for fold splitting / model reproducibility.
SEED = 42

fatal: destination path 'AbdML' already exists and is not an empty directory.
CPU times: user 2.52 s, sys: 470 ms, total: 2.99 s
Wall time: 2.7 s


# Load Data 

In [12]:
%%time

# Kaggle input locations, kept in one place so they are easy to repoint.
COMP_DIR = '/kaggle/input/playground-series-s5e8'
ORIG_PATH = '/kaggle/input/bank-marketing-dataset-full/bank-full.csv'

train = pd.read_csv(f'{COMP_DIR}/train.csv')
sample = pd.read_csv(f'{COMP_DIR}/sample_submission.csv')
test = pd.read_csv(f'{COMP_DIR}/test.csv')
# The external dataset is ';'-separated and stores its label as 'yes'/'no'.
original = pd.read_csv(ORIG_PATH, sep=';')

# 'id' is a row identifier, not a feature.
train = train.drop(columns='id')
test = test.drop(columns='id')

# Encode the external label as 0/1. `.map` turns any unexpected label into NaN,
# which would silently distort the target-mean features built later, so fail fast.
original['y'] = original['y'].map({'no': 0, 'yes': 1})
assert original['y'].notna().all(), "unexpected label value in original['y']"

# Raw feature columns used to build the encodings from `original`.
COLS = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome',]

def NEW_FE(df, duration_period=400):
    """Add engineered features to ``df`` in place and return the same frame.

    Columns added:
      * ``balance_log``      - ``log1p`` of ``balance`` with negatives clipped to 0.
      * ``job_edu``          - ``job`` and ``education`` joined as ``"<job>_<education>"``.
      * ``contacted_before`` - 1 where ``pdays != -1``, else 0
        (presumably -1 is the "never contacted" sentinel - verify against the data).
      * ``duration_sin`` / ``duration_cos`` - sine and cosine of
        ``2*pi*duration / duration_period``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``balance``, ``job``, ``education``, ``pdays`` and
        ``duration``. The frame is mutated in place.
    duration_period : int or float, default 400
        Cycle length used for the sine/cosine encoding of ``duration``.
        The default reproduces the original hard-coded value.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns attached.

    Raises
    ------
    ValueError
        If ``duration_period`` is 0 (the encoding would divide by zero).
    """
    if duration_period == 0:
        raise ValueError("duration_period must be non-zero")

    # Clip before log1p so negative balances do not produce NaN / -inf.
    df['balance_log'] = np.log1p(df['balance'].clip(lower=0))
    df['job_edu'] = df['job'].astype(str) + "_" + df['education'].astype(str)
    df['contacted_before'] = (df['pdays'] != -1).astype(int)

    # Same operation order as the original expression, so the default period
    # yields bit-identical values.
    angle = 2*np.pi * df['duration'] / duration_period
    df['duration_sin'] = np.sin(angle)
    df['duration_cos'] = np.cos(angle)

    return df

# Attach the hand-crafted features to both splits (NEW_FE mutates and returns the frame).
train = NEW_FE(train)
test = NEW_FE(test)

cat_cols = ['job','marital', "education", 'contact', 'poutcome','month','default','housing','loan','job_edu']

# Fallback for values that never occur in `original`: the overall positive
# rate of the competition training set.
mean = train['y'].mean() 

# Target-mean encoding: for every raw column, look up the mean label observed
# for that value in the external `original` dataset.
for c in COLS:
    new_col = f"{c}_mean_target_orig"
    # Computed once per column and reused for both splits (was recomputed for test).
    mapping_mean = original.groupby(c)['y'].mean()
    train[new_col] = train[c].map(mapping_mean).fillna(mean)
    test[new_col] = test[c].map(mapping_mean).fillna(mean)

# Frequency encoding: how often each value occurs in `original`; unseen values get 0.
# Kept as a second loop so the resulting column order is unchanged.
for c in COLS:
    mapping_count = original[c].value_counts()
    train[f"{c}_count"] = train[c].map(mapping_count).fillna(0)
    test[f"{c}_count"] = test[c].map(mapping_count).fillna(0)

def update(df, cols=None):
    """Cast the given columns of ``df`` to pandas ``category`` dtype, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are converted. It is mutated in place.
    cols : iterable of str, optional
        Columns to convert. When omitted, the notebook-level ``cat_cols``
        list is used, which matches the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    if cols is None:
        # Fall back to the notebook-level list so existing `update(df)` calls keep working.
        cols = cat_cols

    for col in cols:
        df[col] = df[col].astype('category')
    return df

# Convert the categorical features of both splits to `category` dtype,
# then preview the finished training frame.
train, test = update(train), update(test)

train.head()

CPU times: user 0 ns, sys: 4 µs, total: 4 µs
Wall time: 8.11 µs
CPU times: user 3.48 s, sys: 552 ms, total: 4.04 s
Wall time: 3.98 s


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,housing_count,loan_count,contact_count,day_count,month_count,duration_count,campaign_count,pdays_count,previous_count,poutcome_count
0,42,technician,married,secondary,no,7,no,no,cellular,25,...,20081,37967,29285,840,6247,150.0,5521.0,36954.0,36954.0,36959
1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,...,20081,37967,13020,2308,5341,139.0,17544.0,36954.0,36954.0,36959
2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,...,25130,37967,13020,1848,13766,166.0,12505.0,36954.0,36954.0,36959
3,27,student,single,secondary,no,34,yes,no,unknown,28,...,25130,37967,13020,1830,13766,76.0,12505.0,36954.0,36954.0,36959
4,26,technician,married,secondary,no,889,yes,no,cellular,3,...,25130,37967,29285,1079,2649,6.0,17544.0,36954.0,36954.0,36959


# Train a Baseline Model with AbdBase

In [13]:
%%time

from sklearn.metrics import roc_auc_score

def ROC_AUC(y_true, y_pred_proba):
    """Custom metric handed to AbdBase: ROC AUC of predicted scores.

    Kept as a named function (not an alias) so the function's own name,
    ``ROC_AUC``, is preserved.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred_proba : array-like
        Predicted scores / probabilities.

    Returns
    -------
    float
        The value returned by ``sklearn.metrics.roc_auc_score``.
    """
    auc = roc_auc_score(y_true=y_true, y_score=y_pred_proba)
    return auc


# `cat_cols` is defined once in the feature-engineering cell; the duplicate
# copy that used to live here could silently drift from the list used there.
encode_c = {'cat_c': cat_cols}

# 5-fold stratified CV classification setup, scored with the custom ROC_AUC
# metric. The captured log below shows AbdBase applying an ordinal encoder
# when `ordinal_encoder` is supplied.
base = AbdBase(
    train_data=train,
    test_data=test,
    target_column='y',
    gpu=True,
    prob=True,
    test_prob=True,
    problem_type="classification",
    metric="custom",
    seed=SEED,
    ohe_fe=False,
    ordinal_encoder=encode_c,
    n_splits=5,
    early_stop=True,
    num_classes=2,
    cat_features=False,
    custom_metric=ROC_AUC,
    fold_type='SKF',
)

[31m*** AbdBase ['V_1.3'] ***

[31m *** Available Settings *** 

[31mAvailable Models: [36mLGBM, [36mCAT, [36mXGB, [36mVoting, [36mTABNET, [36mRidge, [36mLR
[31mAvailable Metrics: [36mroc_auc, [36maccuracy, [36mf1, [36mprecision, [36mrecall, [36mrmse, [36mwmae, [36mrmsle, [36mmae, [36mr2, [36mmse, [36mmape, [36mcustom
[31mAvailable Problem Types: [36mclassification, [36mregression
[31mAvailable Fold Types: [36mSKF, [36mKF, [36mGKF, [36mGSKF, [36mRKF
[31m
 *** Configuration *** 

[31mProblem Type Selected: [36mCLASSIFICATION
[31mMetric Selected: [36mCUSTOM
[31mFold Type Selected: [36mSKF
[31mCalculate Train Probabilities: [36mTrue
[31mCalculate Test Probabilities: [36mTrue
[31mEarly Stopping: [36mTrue
[31mGPU: [36mTrue
[31mEval_Metric Selected is: [36mNone
[33m
---> Applying Ordinal Encoder

CPU times: user 1.83 s, sys: 110 ms, total: 1.94 s
Wall time: 1.94 s


## LightGBM

In [14]:
%%time

# LightGBM hyper-parameters, one per line so individual values are easy to diff.
# NOTE(review): with max_depth=6 a tree can hold at most 2**6 = 64 leaves, so
# num_leaves=228 is probably never reached - confirm against the LightGBM docs.
ParamsLgb = {
    'n_estimators': 40000,
    'learning_rate': 0.0358306214515723,
    'num_leaves': 228,
    'max_depth': 6,
    'min_child_samples': 83,
    'subsample': 0.8700304020753131,
    'colsample_bytree': 0.6169349166144594,
    'reg_alpha': 3.700714656885025,
    'reg_lambda': 4.709578317972932,
    "objective": "binary",
    "metric": "binary_logloss",
}

# Cross-validated training through AbdBase.
# e_stop is presumably the early-stopping patience in rounds - TODO confirm in AbdML.
results_Lgb_1 = base.Train_ML(ParamsLgb, 'LGBM', e_stop=150)

Training Folds: 100%|██████████| 5/5 [30:20<00:00, 364.14s/it]

Overall Train ROC_AUC: 0.9850
Overall OOF ROC_AUC: 0.9743 
CPU times: user 1h 42s, sys: 16.1 s, total: 1h 58s
Wall time: 30min 20s





# Submission

In [15]:
%%time

def save_outputs(base_file_name, oof, pred):
    """Write out-of-fold and test predictions to two CSV files.

    Files written (without the index column):
      * ``<base_file_name>_OOF.csv``   - built from ``oof``
      * ``<base_file_name>_PREDS.csv`` - built from ``pred``

    Parameters
    ----------
    base_file_name : str
        Path prefix shared by both output files.
    oof, pred : array-like
        Anything ``pd.DataFrame`` accepts.
    """
    # Build both frames before writing anything, so a bad input leaves no partial output.
    targets = (
        (f"{base_file_name}_OOF.csv", pd.DataFrame(oof)),
        (f"{base_file_name}_PREDS.csv", pd.DataFrame(pred)),
    )
    for path, frame in targets:
        frame.to_csv(path, index=False)

# results_Lgb_1[0] is saved as the OOF file and results_Lgb_1[1] as the test
# predictions; the latter also becomes the submission column.
mp = results_Lgb_1[1]
save_outputs('LGBM_0.9743', results_Lgb_1[0], mp)

# Fill the sample-submission template with the test predictions and write it out.
sample['y'] = mp
sample.to_csv('submission.csv', index=False)
sample.head()

CPU times: user 2.18 s, sys: 26 ms, total: 2.2 s
Wall time: 2.21 s


Unnamed: 0,id,y
0,750000,0.003739
1,750001,0.100906
2,750002,0.00011
3,750003,6.5e-05
4,750004,0.010847
