In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from catboost import cv
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss 
import optuna

In [2]:
train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
greeks = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/greeks.csv")

In [3]:
features_train = train.drop(['Class', 'Id'], axis=1)
features_test = test.drop(['Id'], axis=1)

target_train = train['Class']

cat_features = ['EJ']

In [4]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [5]:
def objective(trial):
    params = {
        'iterations': trial.suggest_categorical('iterations', [100, 200, 300]),
        'depth': trial.suggest_categorical('depth', [3, 6, 9]),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1),
        # Add other hyperparameters you want to tune
    }

    model = CatBoostClassifier(random_state=42, cat_features=cat_features, loss_function='Logloss', **params)

    log_loss_scores = []
    for train_index, val_index in kf.split(features_train):
        train_data = features_train.iloc[train_index]
        val_data = features_train.iloc[val_index]
        train_target = target_train.iloc[train_index]
        val_target = target_train.iloc[val_index]

        train_pool = Pool(train_data, train_target, cat_features=cat_features)
        val_pool = Pool(val_data, val_target, cat_features=cat_features)

        model.fit(train_pool, eval_set=val_pool, verbose=False)

        val_pred = model.predict_proba(val_pool)[:, 1]
        loss = log_loss(val_target, val_pred)  # Use log_loss from sklearn.metrics
        log_loss_scores.append(loss)

    return np.mean(log_loss_scores)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
print("Best hyperparameters:", best_params)

[32m[I 2023-07-16 06:27:10,015][0m A new study created in memory with name: no-name-423747f7-cce1-4a1e-a5b3-779b26e73014[0m
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1),
[32m[I 2023-07-16 06:28:07,294][0m Trial 0 finished with value: 0.19426338918427427 and parameters: {'iterations': 300, 'depth': 9, 'learning_rate': 0.09190184179237369}. Best is trial 0 with value: 0.19426338918427427.[0m
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1),
[32m[I 2023-07-16 06:28:17,892][0m Trial 1 finished with value: 0.1794730536939321 and parameters: {'iterations': 300, 'depth': 6, 'learning_rate': 0.2103743318580228}. Best is trial 1 with value: 0.1794730536939321.[0m
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1),
[32m[I 2023-07-16 06:28:21,459][0m Trial 2 finished with value: 0.23707863851867103 and parameters: {'iterations': 100, 'depth': 6, 'learning_rate': 0.6809768152353521}. Best is trial 1 with value: 0.179473053

Best hyperparameters: {'iterations': 200, 'depth': 3, 'learning_rate': 0.12067157771715675}


In [6]:
# Создаем экземпляр модели CatBoostClassifier с лучшими гиперпараметрами
best_model = CatBoostClassifier(iterations=300, depth=3, learning_rate=0.11315511493034587, random_state=42,  cat_features=cat_features, loss_function='Logloss')

# Обучаем модель на тренировочной выборке
best_model.fit(features_train, target_train)

# Прогнозируем вероятности классов для тестовой выборки
predictions = best_model.predict_proba(features_test)

0:	learn: 0.6069174	total: 2.35ms	remaining: 701ms
1:	learn: 0.5381072	total: 4.95ms	remaining: 738ms
2:	learn: 0.4656503	total: 7.14ms	remaining: 707ms
3:	learn: 0.4276437	total: 9.07ms	remaining: 672ms
4:	learn: 0.3961551	total: 11ms	remaining: 649ms
5:	learn: 0.3721155	total: 13ms	remaining: 638ms
6:	learn: 0.3515515	total: 14.9ms	remaining: 623ms
7:	learn: 0.3295292	total: 16.8ms	remaining: 614ms
8:	learn: 0.3135774	total: 18.7ms	remaining: 604ms
9:	learn: 0.3000551	total: 20.8ms	remaining: 603ms
10:	learn: 0.2878035	total: 22.6ms	remaining: 594ms
11:	learn: 0.2698519	total: 24.6ms	remaining: 591ms
12:	learn: 0.2605086	total: 26.6ms	remaining: 588ms
13:	learn: 0.2514471	total: 28.5ms	remaining: 582ms
14:	learn: 0.2434234	total: 30.5ms	remaining: 580ms
15:	learn: 0.2368179	total: 32.7ms	remaining: 580ms
16:	learn: 0.2322463	total: 34.8ms	remaining: 580ms
17:	learn: 0.2274545	total: 37ms	remaining: 579ms
18:	learn: 0.2225047	total: 39.1ms	remaining: 579ms
19:	learn: 0.2178112	total: 

In [7]:
submission_df = pd.DataFrame({
    'Id': test['Id'],
    'class_0': predictions[:, 0],
    'class_1': predictions[:, 1]
})

In [8]:
submission_df

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.752697,0.247303
1,010ebe33f668,0.752697,0.247303
2,02fa521e1838,0.752697,0.247303
3,040e15f562a2,0.752697,0.247303
4,046e85c7cc7f,0.752697,0.247303


In [9]:
submission_df.to_csv('submission.csv', index=False)