In [1]:
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier, Pool
from sklearn.base import clone
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import pickle
import gc

# Silence every warning category so the long training log stays readable.
warnings.filterwarnings('ignore')

# Name of the label column in the training data.
TARGET = 'Response'

# Cross-validation setup: number of stratified folds and the seed shared by
# the fold splitter and the model.
N_FOLDS, SEED = 5, 1121

# Early-stopping patience and the logging period handed to the model's fit().
ES_ROUNDS, ES_LOG_STEPS = 150, 500

# Input locations: the competition files and the original dataset that is
# appended to the training data.
_COMPETITION_DIR = '/kaggle/input/playground-series-s4e7'
_ORIGINAL_DIR = '/kaggle/input/health-insurance-cross-sell-prediction'

TRAIN_PATH = _COMPETITION_DIR + '/train.csv'
TEST_PATH = _COMPETITION_DIR + '/test.csv'
ORIGINAL_PATH = _ORIGINAL_DIR + '/train.csv'

class DataLoader:
    """Read the train/test CSV files and prepare them for modelling.

    The training file is extended with the original dataset, exact duplicate
    rows are dropped, and train and test are then processed together so both
    receive identical encodings, engineered features and dtypes.

    Args:
        train_path: Path of the training CSV (must contain an ``id`` column).
        test_path: Path of the test CSV (must contain an ``id`` column).
        original_path: Path of the original dataset CSV appended to the
            training data. When ``None`` (the default) the module-level
            ``ORIGINAL_PATH`` constant is used.
    """

    def __init__(self, train_path, test_path, original_path=None):
        self.train_path = train_path
        self.test_path = test_path
        # Resolved lazily in load() so the module constant is only needed
        # when no explicit path was supplied.
        self.original_path = original_path

    @staticmethod
    def encode_categorical_features(dataframe):
        """Replace the string categories with integer codes, in place.

        ``Gender``, ``Vehicle_Age`` and ``Vehicle_Damage`` are mapped through
        fixed dictionaries. A label that is missing from its dictionary
        becomes NaN (the behaviour of ``Series.map``).

        Returns:
            The same ``dataframe`` object, modified in place.
        """
        print('--- Encoding categorical features')

        gender_mapping = {'Male': 0, 'Female': 1}
        vehicle_age_mapping = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
        vehicle_damage_mapping = {'No': 0, 'Yes': 1}

        dataframe['Gender'] = dataframe['Gender'].map(gender_mapping)
        dataframe['Vehicle_Age'] = dataframe['Vehicle_Age'].map(vehicle_age_mapping)
        dataframe['Vehicle_Damage'] = dataframe['Vehicle_Damage'].map(vehicle_damage_mapping)

        return dataframe

    @staticmethod
    def convert_data_types(dataframe):
        """Cast ``Region_Code``, ``Annual_Premium`` and ``Policy_Sales_Channel`` to int, in place.

        Returns:
            The same ``dataframe`` object, modified in place.
        """
        print('--- Converting data types')

        for column in ('Region_Code', 'Annual_Premium', 'Policy_Sales_Channel'):
            dataframe[column] = dataframe[column].astype(int)

        return dataframe

    @staticmethod
    def add_features(dataframe):
        """Add interaction features between ``Previously_Insured`` and four other columns.

        Each new column ``Previously_Insured_<other>`` holds the factorized
        code of the two values concatenated as strings.

        Returns:
            The same ``dataframe`` object, modified in place.
        """
        print("--- Adding new features")
        # Reference: https://www.kaggle.com/code/rohanrao/automl-grand-prix-1st-place-solution
        # Beware of data leakage: the codes are computed over whatever rows
        # are passed in (load() passes train and test together).
        insured = dataframe['Previously_Insured'].astype(str)
        for partner in ('Annual_Premium', 'Vehicle_Age', 'Vehicle_Damage', 'Vintage'):
            combined = insured + dataframe[partner].astype(str)
            dataframe[f'Previously_Insured_{partner}'] = pd.factorize(combined)[0]

        return dataframe

    @staticmethod
    def reduce_mem_usage(dataframe):
        """Downcast numeric columns to the smallest dtype that keeps every value.

        * Signed and unsigned integer columns move to the narrowest integer
          type of the same signedness whose (inclusive) range covers the
          column's min and max.
        * Float columns move to float16 or float32 only when the conversion
          round-trips exactly; otherwise they keep their dtype, because a
          range check alone would silently corrupt values (for example
          40454.0 is stored as 40448.0 in float16).
        * Every other dtype (bool, datetime, extension dtypes, ...) is left
          untouched.

        Args:
            dataframe: Frame to shrink; modified in place.

        Returns:
            The same ``dataframe`` object.

        Raises:
            ValueError: If a column has dtype ``category`` or ``object``.
        """
        # Reference: https://www.kaggle.com/competitions/playground-series-s4e7/discussion/516103#2899151

        print('--- Reducing memory usage')
        initial_mem_usage = dataframe.memory_usage().sum() / 1024**2

        # Candidate targets per NumPy dtype kind, narrowest first.
        integer_targets = {
            'i': (np.int8, np.int16, np.int32, np.int64),
            'u': (np.uint8, np.uint16, np.uint32, np.uint64),
        }
        float_targets = (np.float16, np.float32)

        for col in dataframe.columns:
            series = dataframe[col]
            col_type = series.dtype

            if col_type.name in ['category', 'object']:
                raise ValueError(f"Column '{col}' is of type '{col_type.name}'")

            # Only plain NumPy dtypes are handled; extension dtypes may hold
            # missing values that cannot be cast to a NumPy integer.
            if not isinstance(col_type, np.dtype):
                continue

            if col_type.kind in integer_targets:
                c_min, c_max = series.min(), series.max()
                for target in integer_targets[col_type.kind]:
                    limits = np.iinfo(target)
                    if limits.min <= c_min and c_max <= limits.max:
                        dataframe[col] = series.astype(target)
                        break
            elif col_type.kind == 'f':
                c_min, c_max = series.min(), series.max()
                for target in float_targets:
                    limits = np.finfo(target)
                    # The range check comes first so the cast cannot overflow
                    # to inf; an all-NaN or infinite column fails it and
                    # keeps its dtype.
                    if not (limits.min <= c_min and c_max <= limits.max):
                        continue
                    shrunk = series.astype(target)
                    # Series.equals treats NaNs in the same position as equal.
                    if shrunk.astype(col_type).equals(series):
                        dataframe[col] = shrunk
                        break

        final_mem_usage = dataframe.memory_usage().sum() / 1024**2
        print('------ Memory usage before: {:.2f} MB'.format(initial_mem_usage))
        print('------ Memory usage after: {:.2f} MB'.format(final_mem_usage))
        print('------ Decreased memory usage by {:.1f}%'.format(100 * (initial_mem_usage - final_mem_usage) / initial_mem_usage))

        return dataframe

    def load(self):
        """Build the processed train and test frames.

        Returns:
            A ``(train, test)`` tuple. ``train`` contains the features and the
            ``TARGET`` column cast to int8; ``test`` contains only features.
        """
        print('Loading data')

        original_path = ORIGINAL_PATH if self.original_path is None else self.original_path

        train = pd.read_csv(self.train_path, index_col='id')
        test = pd.read_csv(self.test_path, index_col='id')
        original = pd.read_csv(original_path, index_col='id')

        # The ids are discarded before de-duplicating, so rows count as
        # duplicates when all features and the label agree.
        train = pd.concat([train, original]).reset_index(drop=True)
        train = train.drop_duplicates(keep="last").reset_index(drop=True)
        del original

        # Temporary marker used to split the combined frame again below.
        train['is_train'] = 1
        test['is_train'] = 0
        dataframe = pd.concat([train, test])
        del train, test
        gc.collect()

        dataframe = self.encode_categorical_features(dataframe)
        dataframe = self.convert_data_types(dataframe)
        dataframe = self.add_features(dataframe)
        dataframe = self.reduce_mem_usage(dataframe)

        train = dataframe[dataframe['is_train'] == 1].drop(columns=['is_train'])
        test = dataframe[dataframe['is_train'] == 0].drop(columns=['is_train', TARGET])

        del dataframe
        gc.collect()

        # The label was float in the combined frame because test rows have no
        # label; restore a compact integer dtype for the training rows.
        train[TARGET] = train[TARGET].astype(np.int8)

        return train, test



# Run the full loading pipeline once, then separate the label from the features.
loader = DataLoader(TRAIN_PATH, TEST_PATH)
train, test = loader.load()

y = train[TARGET]
X = train.drop(columns=[TARGET])



class Trainer:
    """Cross-validate a classifier and write its predictions to disk.

    Args:
        model: Unfitted estimator; a fresh clone is trained for every fold.
        n_folds: Number of stratified folds.
    """

    def __init__(self, model, n_folds=N_FOLDS):
        self.model = model
        self.n_folds = n_folds

    @staticmethod
    def _take_rows(values, positions):
        """Select rows by position from a pandas object or a NumPy array."""
        indexer = getattr(values, 'iloc', None)
        return values[positions] if indexer is None else indexer[positions]

    def fit_predict(self, X, y, X_test):
        """Train one model per fold and collect out-of-fold and test predictions.

        Every column of ``X`` / ``X_test`` is passed to CatBoost as a
        categorical feature. Test probabilities and feature importances are
        averaged over the folds. The out-of-fold probabilities, the averaged
        test probabilities and a submission file are written to the working
        directory.

        Args:
            X: Training features (a DataFrame; rows are selected with ``iloc``).
            y: Training labels, a pandas Series or NumPy array aligned with ``X``.
            X_test: Test features with the same columns as ``X``.

        Returns:
            A tuple ``(oof_pred_probs, scores, feature_importances)``: the
            out-of-fold class probabilities, the list of per-fold AUC scores
            and the fold-averaged feature importances.
        """
        print(f'Training {self.model.__class__.__name__}\n')

        scores = []
        feature_importances = np.zeros(X.shape[1])

        n_classes = len(np.unique(y))
        oof_pred_probs = np.zeros((X.shape[0], n_classes))
        test_pred_probs = np.zeros((X_test.shape[0], n_classes))

        skf = StratifiedKFold(n_splits=self.n_folds, random_state=SEED, shuffle=True)
        for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X, y)):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            # The fold indices are positions, so the labels must be selected
            # positionally; label-based y[idx] is only correct while y has a
            # default RangeIndex.
            y_train = self._take_rows(y, train_idx)
            y_val = self._take_rows(y, val_idx)

            train_pool = Pool(X_train, y_train, cat_features=X_train.columns.values)
            val_pool = Pool(X_val, y_val, cat_features=X_val.columns.values)
            # Rebuilt and released every fold, together with the other pools.
            test_pool = Pool(X_test, cat_features=X_test.columns.values)

            model = clone(self.model)
            model.fit(
                X=train_pool,
                eval_set=val_pool,
                verbose=ES_LOG_STEPS,
                early_stopping_rounds=ES_ROUNDS,
                use_best_model=True
            )

            feature_importances += model.feature_importances_ / self.n_folds

            y_pred_probs = model.predict_proba(val_pool)
            oof_pred_probs[val_idx] = y_pred_probs

            temp_test_pred_probs = model.predict_proba(test_pool)
            test_pred_probs += temp_test_pred_probs / self.n_folds

            score = roc_auc_score(y_val, y_pred_probs[:, 1])
            scores.append(score)

            del model, X_train, y_train, X_val, y_val, y_pred_probs, temp_test_pred_probs, train_pool, val_pool, test_pool
            gc.collect()

            print(f'\n--- Fold {fold_idx + 1} - AUC: {score:.5f}\n\n')

        mean_score = np.mean(scores)
        self._save_pred_probs(oof_pred_probs, mean_score, 'oof')
        self._save_pred_probs(test_pred_probs, mean_score, 'test')
        self._save_submission(test_pred_probs, mean_score)

        gc.collect()

        print(f'------ Average AUC:      {mean_score:.5f} ± {np.std(scores):.5f}\n\n')

        return oof_pred_probs, scores, feature_importances

    def _save_pred_probs(self, pred_probs, cv_score, name):
        """Pickle ``pred_probs`` to ``<model>_<name>_pred_probs_<cv_score>.pkl``."""
        model_name = self.model.__class__.__name__.lower().replace('classifier', '')
        with open(f'{model_name}_{name}_pred_probs_{cv_score:.5f}.pkl', 'wb') as f:
            pickle.dump(pred_probs, f)

    def _save_submission(self, test_pred_probs, score):
        """Write ``submission.csv`` from the positive-class test probabilities.

        For test rows whose feature values exactly match a row of the
        original dataset, the prediction is replaced by the mean of the
        flipped original labels (1 where the original ``Response`` is 0,
        otherwise 0).

        Args:
            test_pred_probs: Class probabilities for the test rows; column 1
                is written. Assumes the rows are in the same order as the
                sample submission file -- TODO confirm.
            score: Cross-validation score; accepted for interface
                compatibility and not used.
        """
        sub = pd.read_csv(Path(TEST_PATH).parent / 'sample_submission.csv')
        sub[TARGET] = test_pred_probs[:, 1]

        # Reference: https://www.kaggle.com/code/paddykb/a-glitch-in-the-insurance-matrix
        test_data = pd.read_csv(TEST_PATH)
        orig_data = pd.read_csv(ORIGINAL_PATH)
        features = sorted(set(test_data.columns) - {'id'})

        # One override value per test id; a test row can match several
        # original rows, hence the mean.
        override_sub = (
            test_data.merge(orig_data.drop(columns=['id']), on=features)
            .assign(override=lambda x: np.where(x['Response'] == 0, 1, 0))
            .filter(['id', 'override'])
            .groupby(['id'], as_index=False)
            .agg(override=('override', 'mean'))
        )

        (
            sub.merge(override_sub, on='id', how='outer')
            .assign(Response=lambda x: np.where(x['override'].isna(), x['Response'], x['override']))
            .filter(['id', 'Response'])
            .to_csv('submission.csv', index=False)
        )


# Hyper-parameters for the CatBoost model.
# Reference: https://www.kaggle.com/code/ivanmitriakhin/5-fold-catboost-cv-0-89485-lb-0-89629
catboost_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'class_names': [0, 1],
    'learning_rate': 0.065,
    'iterations': 3000,
    'depth': 9,
    'random_strength': 0,
    'l2_leaf_reg': 400,
    'fold_permutation_block': 64,
    'task_type': 'GPU',
    'random_seed': SEED,
    'verbose': False,
}
model = CatBoostClassifier(**catboost_params)

# Cross-validate the model; this also writes the prediction files and the submission.
trainer = Trainer(model)
cv_outputs = trainer.fit_predict(X, y, test)
oof_pred_probs, scores, feature_importances = cv_outputs


Loading data
--- Encoding categorical features
--- Converting data types
--- Adding new features
--- Reducing memory usage
------ Memory usage before: 2536.34 MB
------ Memory usage after: 634.09 MB
------ Decreased memory usage by 75.0%
Training CatBoostClassifier



Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8742431	best: 0.8742431 (0)	total: 9.44s	remaining: 7h 51m 49s
500:	test: 0.8937659	best: 0.8937659 (500)	total: 9m 5s	remaining: 45m 23s
1000:	test: 0.8942430	best: 0.8942430 (999)	total: 17m 52s	remaining: 35m 41s
1500:	test: 0.8944579	best: 0.8944579 (1500)	total: 26m 38s	remaining: 26m 36s
2000:	test: 0.8945618	best: 0.8945621 (1997)	total: 35m 30s	remaining: 17m 43s
2500:	test: 0.8946256	best: 0.8946257 (2497)	total: 44m 22s	remaining: 8m 51s
2999:	test: 0.8946691	best: 0.8946697 (2994)	total: 53m 5s	remaining: 0us
bestTest = 0.8946697116
bestIteration = 2994
Shrink model to first 2995 iterations.

--- Fold 1 - AUC: 0.89467




Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8747098	best: 0.8747098 (0)	total: 986ms	remaining: 49m 16s
500:	test: 0.8941804	best: 0.8941804 (500)	total: 8m 53s	remaining: 44m 20s
1000:	test: 0.8946933	best: 0.8946934 (998)	total: 17m 41s	remaining: 35m 20s
1500:	test: 0.8948730	best: 0.8948734 (1495)	total: 26m 30s	remaining: 26m 28s
2000:	test: 0.8949847	best: 0.8949856 (1996)	total: 35m 12s	remaining: 17m 34s
2500:	test: 0.8950497	best: 0.8950499 (2498)	total: 43m 59s	remaining: 8m 46s
2999:	test: 0.8950940	best: 0.8950940 (2996)	total: 52m 47s	remaining: 0us
bestTest = 0.8950940371
bestIteration = 2996
Shrink model to first 2997 iterations.

--- Fold 2 - AUC: 0.89509




Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8749239	best: 0.8749239 (0)	total: 901ms	remaining: 45m
500:	test: 0.8938525	best: 0.8938525 (500)	total: 8m 54s	remaining: 44m 25s
1000:	test: 0.8943695	best: 0.8943695 (1000)	total: 17m 45s	remaining: 35m 28s
1500:	test: 0.8945681	best: 0.8945681 (1499)	total: 26m 31s	remaining: 26m 29s
2000:	test: 0.8946934	best: 0.8946936 (1999)	total: 35m 15s	remaining: 17m 36s
2500:	test: 0.8947444	best: 0.8947444 (2500)	total: 44m 12s	remaining: 8m 49s
2999:	test: 0.8947973	best: 0.8947979 (2994)	total: 53m 2s	remaining: 0us
bestTest = 0.8947978914
bestIteration = 2994
Shrink model to first 2995 iterations.

--- Fold 3 - AUC: 0.89480




Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8753347	best: 0.8753347 (0)	total: 904ms	remaining: 45m 10s
500:	test: 0.8941820	best: 0.8941820 (500)	total: 8m 57s	remaining: 44m 41s
1000:	test: 0.8947033	best: 0.8947033 (1000)	total: 17m 48s	remaining: 35m 34s
1500:	test: 0.8949088	best: 0.8949092 (1496)	total: 26m 46s	remaining: 26m 44s
2000:	test: 0.8950149	best: 0.8950149 (2000)	total: 35m 44s	remaining: 17m 50s
2500:	test: 0.8950921	best: 0.8950926 (2498)	total: 44m 40s	remaining: 8m 54s
2999:	test: 0.8951360	best: 0.8951362 (2998)	total: 53m 31s	remaining: 0us
bestTest = 0.8951361775
bestIteration = 2998
Shrink model to first 2999 iterations.

--- Fold 4 - AUC: 0.89514




Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8746933	best: 0.8746933 (0)	total: 903ms	remaining: 45m 7s
500:	test: 0.8940042	best: 0.8940042 (500)	total: 8m 56s	remaining: 44m 34s
1000:	test: 0.8945203	best: 0.8945203 (1000)	total: 17m 46s	remaining: 35m 29s
1500:	test: 0.8947374	best: 0.8947374 (1498)	total: 26m 37s	remaining: 26m 34s
2000:	test: 0.8948361	best: 0.8948361 (2000)	total: 35m 24s	remaining: 17m 40s
2500:	test: 0.8949143	best: 0.8949146 (2493)	total: 44m 15s	remaining: 8m 49s
2999:	test: 0.8949652	best: 0.8949654 (2998)	total: 52m 59s	remaining: 0us
bestTest = 0.8949653506
bestIteration = 2998
Shrink model to first 2999 iterations.

--- Fold 5 - AUC: 0.89497


------ Average AUC:      0.89493 ± 0.00018


