In [None]:
# Install CatBoost into the notebook environment via a shell escape (no version is pinned).
# NOTE(review): confirm that the catboost package actually publishes a `gpu` extra.
!pip install catboost[gpu]

## CatBoost (blender)

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e7/sample_submission.csv
/kaggle/input/playground-series-s4e7/train.csv
/kaggle/input/playground-series-s4e7/test.csv


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc

warnings.filterwarnings("ignore")

from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay, classification_report


import catboost as cb

In [4]:
# Load the competition train/test tables; both use the `id` column as index
# and are parsed with the pyarrow CSV engine.
read_opts = dict(index_col="id", engine="pyarrow")
train = pd.read_csv("/kaggle/input/playground-series-s4e7/train.csv", **read_opts)
test = pd.read_csv("/kaggle/input/playground-series-s4e7/test.csv", **read_opts)

In [5]:
# Cast two columns to compact integer dtypes, applying the same dtype to
# train and test so both frames stay consistent.
narrow_dtypes = {"Region_Code": np.int8, "Policy_Sales_Channel": np.int16}
for column, dtype in narrow_dtypes.items():
    train[column] = train[column].astype(dtype)
    test[column] = test[column].astype(dtype)

In [6]:
# Name of the label column; used to split features from labels and to read
# the prediction column of the submissions that are blended later.
target = "Response"

In [7]:
# Separate the label column from the feature columns.
y = train[target]
X = train.drop(columns=[target])

In [8]:
# Two shuffled, stratified folds with a fixed seed so the split is reproducible.
skfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

In [9]:
# Keyword arguments passed to cb.CatBoostClassifier for every fold.
cat_params = {
    # Objective, evaluation metric and class labels.
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "class_names": [0, 1],

    # Boosting schedule (the original note lists 0.1 and 0.01 as learning rates).
    "learning_rate": 0.1,
    "iterations": 3000,

    # Tree shape and regularisation.
    "depth": 9,
    "random_strength": 0,
    "l2_leaf_reg": 0.5,
    "max_leaves": 512,
    "fold_permutation_block": 64,

    # Runtime settings: GPU training, fixed seed, quiet by default
    # (fit() passes its own `verbose`), no files written.
    "task_type": "GPU",
    "random_seed": 42,
    "verbose": False,
    "allow_writing_files": False,
}

In [None]:
# Stratified K-fold training of CatBoost with every feature treated as categorical.
# For each fold: fit on the training part, early-stop on the held-out part,
# then predict the competition test set. The per-fold test predictions are
# averaged into `test_pred_cat`.

# NOTE: despite the name, `oof_preds` holds one *test-set* prediction vector per
# fold (not out-of-fold predictions on the training data).
oof_preds = []
# Best validation AUC reached in each fold.
oof_aucs = []

# All feature columns are cast to str and declared categorical; computed once
# because it does not change between folds.
cat_features = X.columns.values

test_pool = cb.Pool(test.astype(str), cat_features=cat_features)

for fold, (train_idx, valid_idx) in enumerate(skfold.split(X, y)):
    # split() yields *positional* indices, so X and y must both be sliced with
    # .iloc. Plain y[idx] would do a label lookup against the `id` index and is
    # only correct when the ids happen to be exactly 0..n-1.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

    train_pool = cb.Pool(X_train.astype(str), y_train, cat_features=cat_features)
    valid_pool = cb.Pool(X_valid.astype(str), y_valid, cat_features=cat_features)

    cat_clf = cb.CatBoostClassifier(**cat_params)
    # verbose=500 here overrides the 'verbose': False entry in cat_params.
    cat_clf = cat_clf.fit(X=train_pool,
                          eval_set=valid_pool,
                          verbose=500,
                          early_stopping_rounds=200)

    # Probability of the positive class for every test row.
    test_pred = cat_clf.predict_proba(test_pool)[:, 1]

    oof_preds.append(test_pred)
    auc = cat_clf.best_score_['validation']['AUC']
    oof_aucs.append(auc)
    print(f"\n---- Fold {fold}: ROC-AUC Score: {auc:.6f}\n")

    # Release the per-fold frames, pools and model before the next fold.
    del X_train, y_train, X_valid, y_valid
    del train_pool, valid_pool
    del cat_clf
    gc.collect()

auc_mean = np.mean(oof_aucs)
auc_std = np.std(oof_aucs)
print(f"\n---> ROC-AUC Score: {auc_mean:.6f} \xB1 {auc_std:.6f}\n")

# Final CatBoost prediction: element-wise mean of the per-fold test predictions.
test_pred_cat = np.mean(oof_preds, axis=0)

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8747459	best: 0.8747459 (0)	total: 2.19s	remaining: 1h 49m 30s
500:	test: 0.8941752	best: 0.8941752 (500)	total: 12m 51s	remaining: 1h 4m 10s
1000:	test: 0.8944159	best: 0.8944190 (990)	total: 24m 32s	remaining: 49m
1500:	test: 0.8944703	best: 0.8944731 (1481)	total: 36m 8s	remaining: 36m 5s
bestTest = 0.8944749832
bestIteration = 1527
Shrink model to first 1528 iterations.

---- Fold 0: ROC-AUC Score: 0.894475



Default metric period is 5 because AUC is/are not implemented for GPU


In [None]:
# Pair each test id with its averaged CatBoost probability.
sub = pd.DataFrame(
    data={'id': test.index, 'Response': test_pred_cat},
)

In [None]:
# Persist the CatBoost-only submission to the working directory (no index column).
sub.to_csv(path_or_buf="submission.csv", index=False)

## Blender

In [10]:
# Load the prediction column of each submission to be blended as a 1-D NumPy array.
# Series.to_numpy() is used instead of Series.ravel(), which is deprecated
# since pandas 2.2.
# NOTE(review): the rows of every file are assumed to be in the same order as
# `test`; nothing here checks the ids - confirm before trusting the blend.
sub1 = pd.read_csv('/kaggle/input/submission-set/submission_mine.csv',
                   engine = "pyarrow")[target].to_numpy()
sub2 = pd.read_csv('/kaggle/input/submission-set/submission_1.csv',
                   engine = "pyarrow")[target].to_numpy()
sub3 = pd.read_csv('/kaggle/input/submission-set/submission_2.csv',
                   engine = "pyarrow")[target].to_numpy()
sub4 = pd.read_parquet('/kaggle/input/submission-set/submission.parquet')[target].to_numpy()

In [11]:
# Weighted average of the four submissions, row by row; np.average normalises
# by the sum of the weights.
blend_weights = [1, 15, 10, 20]
blended_response = np.average(
    np.vstack([sub1, sub2, sub3, sub4]),
    axis=0,
    weights=blend_weights,
)

sub = pd.DataFrame({'id': test.index, 'Response': blended_response})

In [13]:
# Preview the first five rows of the blended submission.
sub.head(n=5)

Unnamed: 0,id,Response
0,11504798,0.004537
1,11504799,0.660818
2,11504800,0.240221
3,11504801,0.000323
4,11504802,0.174725


In [14]:
# Sanity check: read the blended submission back from disk.
# The file is written here first so that this cell works under
# Restart & Run All; the original read relied on a write cell that sits
# further down the notebook, so a top-to-bottom run would read a stale
# (or missing) file.
submission_path = '/kaggle/working/submission.csv'
sub.to_csv(submission_path, index=False)
submission = pd.read_csv(submission_path)

In [15]:
# Preview the first five rows of the file that was read back from disk.
submission.head(n=5)

Unnamed: 0,id,Response
0,11504798,0.004537
1,11504799,0.660818
2,11504800,0.240221
3,11504801,0.000323
4,11504802,0.174725


In [12]:
# Write the blended submission; this uses the same file name as the earlier
# CatBoost-only export and therefore replaces it.
sub.to_csv(path_or_buf="submission.csv", index=False)