In [181]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns   
from catboost import CatBoostClassifier, Pool
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import optuna
from optuna.pruners import MedianPruner
from catboost import Pool
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold


In [182]:
#%pip install -U category_encoders
#%pip install optuna

For this competition, there are two target variables:

- h1n1_vaccine - Whether respondent received H1N1 flu vaccine.

- seasonal_vaccine - Whether respondent received seasonal flu vaccine.

Both are binary variables: 0 = No; 1 = Yes. Some respondents didn't get either vaccine, others got only one, and some got both. This is formulated as a multilabel (and not multiclass) problem

The features in this dataset

You are provided a dataset with 36 columns. The first column respondent_id is a unique and random identifier. The remaining 35 features are described below.

For all binary variables: 0 = No; 1 = Yes.

- h1n1_concern - Level of concern about the H1N1 flu.
0 = Not at all concerned; 
1 = Not very concerned; 
2 = Somewhat concerned; 
3 = Very concerned.

- h1n1_knowledge - Level of knowledge about H1N1 flu.
0 = No knowledge; 1 = A little knowledge; 2 = A lot of knowledge.

- behavioral_antiviral_meds - Has taken antiviral medications. (binary)

- behavioral_avoidance - Has avoided close contact with others with flu-like symptoms. (binary)

- behavioral_face_mask - Has bought a face mask. (binary)

- behavioral_wash_hands - Has frequently washed hands or used hand sanitizer. (binary)

- behavioral_large_gatherings - Has reduced time at large gatherings. (binary)

- behavioral_outside_home - Has reduced contact with people outside of own household. (binary)

- behavioral_touch_face - Has avoided touching eyes, nose, or mouth. (binary)

- doctor_recc_h1n1 - H1N1 flu vaccine was recommended by doctor. (binary)

- doctor_recc_seasonal - Seasonal flu vaccine was recommended by doctor. (binary)

- chronic_med_condition - Has any of the following chronic medical conditions: asthma or an other lung condition, diabetes, a heart condition, a kidney condition, sickle cell anemia or other anemia, a neurological or neuromuscular condition, a liver condition, or a weakened immune system caused by a chronic illness or by medicines taken for a chronic illness. (binary)

- child_under_6_months - Has regular close contact with a child under the age of six months. (binary)
health_worker - Is a healthcare worker. (binary)
health_insurance - Has health insurance. (binary)
opinion_h1n1_vacc_effective - Respondent's opinion about H1N1 vaccine effectiveness.
1 = Not at all effective; 2 = Not very effective; 3 = Don't know; 4 = Somewhat effective; 5 = Very effective.

- opinion_h1n1_risk - Respondent's opinion about risk of getting sick with H1N1 flu without vaccine.
1 = Very Low; 2 = Somewhat low; 3 = Don't know; 4 = Somewhat high; 5 = Very high.

- opinion_h1n1_sick_from_vacc - Respondent's worry of getting sick from taking H1N1 vaccine.
1 = Not at all worried; 2 = Not very worried; 3 = Don't know; 4 = Somewhat worried; 5 = Very worried.

- opinion_seas_vacc_effective - Respondent's opinion about seasonal flu vaccine effectiveness.
1 = Not at all effective; 2 = Not very effective; 3 = Don't know; 4 = Somewhat effective; 5 = Very effective.

- opinion_seas_risk - Respondent's opinion about risk of getting sick with seasonal flu without vaccine.
1 = Very Low; 2 = Somewhat low; 3 = Don't know; 4 = Somewhat high; 5 = Very high.

- opinion_seas_sick_from_vacc - Respondent's worry of getting sick from taking seasonal flu vaccine.
1 = Not at all worried; 2 = Not very worried; 3 = Don't know; 4 = Somewhat worried; 5 = Very worried.

- age_group - Age group of respondent.

- education - Self-reported education level.

- race - Race of respondent.

- sex - Sex of respondent.

- income_poverty - Household annual income of respondent with respect to 2008 Census poverty thresholds.

- marital_status - Marital status of respondent.

- rent_or_own - Housing situation of respondent.

- employment_status - Employment status of respondent.

- hhs_geo_region - Respondent's residence using a 10-region geographic classification defined by the U.S. Dept. of Health and Human Services. Values are represented as short random character strings.
census_msa - Respondent's residence within metropolitan statistical areas (MSA) as defined by the U.S. Census.

- household_adults - Number of other adults in household, top-coded to 3.

- household_children - Number of children in household, top-coded to 3.

- employment_industry - Type of industry respondent is employed in. Values are represented as short random character strings.

- employment_occupation - Type of occupation of respondent. Values are represented as short random character strings.



In [183]:
submission_format = pd.read_csv('data/submission_format.csv')  
submission_format.head(3)

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.5,0.7
1,26708,0.5,0.7
2,26709,0.5,0.7


submission_format в целом для нас как я понял это бесполезный файл, с ним работать не прийдется. Это файл пример для сдачи как я понял

In [184]:
training_set_features = pd.read_csv('data/training_set_features.csv')  
training_set_labels = pd.read_csv('data/training_set_labels.csv')  
print(len(training_set_features), len(training_set_labels))
len_tr_set = len(training_set_features)

26707 26707


In [185]:
training_set_features.head(3)

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo


In [186]:
training_set_labels.head(3)

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0


In [187]:
training_set_labels.isna().sum()

respondent_id       0
h1n1_vaccine        0
seasonal_vaccine    0
dtype: int64

In [188]:
training_set_features.isna().sum()

respondent_id                      0
h1n1_concern                      92
h1n1_knowledge                   116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_h1n1                2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_h1n1_vacc_effective      391
opinion_h1n1_risk                388
opinion_h1n1_sick_from_vacc      395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

явно видно что у нас большое количество фичей с Nan значениеми, так что надо от них избавиться

In [189]:
for i in training_set_features.columns:
    t = training_set_features[i].isna().sum() / len_tr_set
    if t < 0.03: 
        print(f"{i} ---- 1 ---- {training_set_features[i].isna().sum() / len_tr_set:.3f}%")
    elif t < 0.1:
        print(f"{i} ---- 2 ---- {training_set_features[i].isna().sum() / len_tr_set:.3f}%")
    else:
        print(f"{i} ---- 3 ---- {training_set_features[i].isna().sum() / len_tr_set:.3f}% !!!!!!!!!!!!")

respondent_id ---- 1 ---- 0.000%
h1n1_concern ---- 1 ---- 0.003%
h1n1_knowledge ---- 1 ---- 0.004%
behavioral_antiviral_meds ---- 1 ---- 0.003%
behavioral_avoidance ---- 1 ---- 0.008%
behavioral_face_mask ---- 1 ---- 0.001%
behavioral_wash_hands ---- 1 ---- 0.002%
behavioral_large_gatherings ---- 1 ---- 0.003%
behavioral_outside_home ---- 1 ---- 0.003%
behavioral_touch_face ---- 1 ---- 0.005%
doctor_recc_h1n1 ---- 2 ---- 0.081%
doctor_recc_seasonal ---- 2 ---- 0.081%
chronic_med_condition ---- 2 ---- 0.036%
child_under_6_months ---- 2 ---- 0.031%
health_worker ---- 2 ---- 0.030%
health_insurance ---- 3 ---- 0.460% !!!!!!!!!!!!
opinion_h1n1_vacc_effective ---- 1 ---- 0.015%
opinion_h1n1_risk ---- 1 ---- 0.015%
opinion_h1n1_sick_from_vacc ---- 1 ---- 0.015%
opinion_seas_vacc_effective ---- 1 ---- 0.017%
opinion_seas_risk ---- 1 ---- 0.019%
opinion_seas_sick_from_vacc ---- 1 ---- 0.020%
age_group ---- 1 ---- 0.000%
education ---- 2 ---- 0.053%
race ---- 1 ---- 0.000%
sex ---- 1 ---- 0.000

employment_occupation и employment_industry, health_insurance  тк у них +- 50% Nan я удаляю

In [190]:
df = pd.merge(training_set_features, training_set_labels, on='respondent_id', how='inner')
df.head(5)


Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_vaccine,seasonal_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0,1
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,,0,1
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0,0


In [191]:
#удалил столбцы с самыми большими пропусками
#df = df.drop(columns=['respondent_id', 'employment_occupation', 'employment_industry', 'health_insurance'])
df.head(3)

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_vaccine,seasonal_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0,1
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0,0


In [192]:
df['employment_industry'].value_counts()

employment_industry
fcxhlnwr    2468
wxleyezf    1804
ldnlellj    1231
pxcmvdjn    1037
atmlpfrs     926
arjwrbjb     871
xicduogh     851
mfikgejo     614
vjjrobsf     527
rucpziij     523
xqicxuve     511
saaquncn     338
cfqqtusy     325
nduyfdeo     286
mcubkhph     275
wlfvacwt     215
dotnnunm     201
haxffmxo     148
msuufmds     124
phxvnwax      89
qnlwzans      13
Name: count, dtype: int64

In [193]:
df.isna().sum()

respondent_id                      0
h1n1_concern                      92
h1n1_knowledge                   116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_h1n1                2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_h1n1_vacc_effective      391
opinion_h1n1_risk                388
opinion_h1n1_sick_from_vacc      395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

In [194]:
target_col = ['h1n1_vaccine', 'seasonal_vaccine']  # или 'seasonal_vaccine'

# новый признак "обе рекомендации врача"
df['doctor_recc_both'] = ((df['doctor_recc_h1n1']==1) & (df['doctor_recc_seasonal']==1)).astype('float')# это bool так что занесу в bool 
# новый признак "знание × беспокойство"
df['knowledge_x_concern'] = df['h1n1_knowledge'] * df['h1n1_concern'] # это получается у нас float 

#разделил на категории
numerical_cols = ['knowledge_x_concern', 'household_adults', 'household_children']
ordinal_cols = [
    'h1n1_concern','h1n1_knowledge',
    'opinion_h1n1_risk','opinion_h1n1_vacc_effective',
    'opinion_seas_risk','opinion_seas_vacc_effective',
    'opinion_h1n1_sick_from_vacc','opinion_seas_sick_from_vacc'
]
bool_cols = [
    'behavioral_antiviral_meds','behavioral_avoidance','behavioral_face_mask',
    'behavioral_wash_hands','behavioral_large_gatherings','behavioral_outside_home',
    'behavioral_touch_face','doctor_recc_h1n1','doctor_recc_seasonal',
    'chronic_med_condition','child_under_6_months','health_worker','health_insurance','doctor_recc_both'
]
cat_nominal = [
    'age_group','education','race','sex','income_poverty',
    'marital_status','rent_or_own','employment_status', 
    'hhs_geo_region','census_msa','employment_occupation', 'employment_industry'
]

In [195]:
df_catboost= df.copy()
df_catboost.isna().sum()

respondent_id                      0
h1n1_concern                      92
h1n1_knowledge                   116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_h1n1                2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_h1n1_vacc_effective      391
opinion_h1n1_risk                388
opinion_h1n1_sick_from_vacc      395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

In [196]:
import pandas as pd

CAT_MISSING_TOKEN = "__NA__"

def prepare_for_catboost(df_catboost: pd.DataFrame,
                         ordinal_cols,   # порядковые: оставляем числовыми
                         cat_cls,        # номинальные категориальные
                         bool_clc=None   # булевые как категориальные
                         ):
    df = df_catboost.copy()
    bool_clc = list(bool_clc) if bool_clc is not None else []

    # 1) Список категориальных = номинальные + булевые (без порядковых)
    cat_features = sorted(set([c for c in cat_cls if c in df.columns] +
                              [c for c in bool_clc if c in df.columns]))
    # Удалим порядковые из категориальных, если вдруг пересеклись
    cat_features = [c for c in cat_features if c not in set(ordinal_cols)]

    # 2) Обработка только КАТЕГОРИАЛЬНЫХ: к строке, нормализация пробелов, NaN/пустые -> "__NA__"
    for c in cat_features:
        s = df[c].astype("string")
        s = s.str.replace(r"\s+", " ", regex=True).str.strip()
        s = s.replace({"": CAT_MISSING_TOKEN, "NA": CAT_MISSING_TOKEN,
                       "NaN": CAT_MISSING_TOKEN, "nan": CAT_MISSING_TOKEN})
        df[c] = s.fillna(CAT_MISSING_TOKEN)

    # 3) Порядковые и числовые НЕ трогаем: остаются float/int с реальными NaN

    return df, cat_features



Как я понял можно хранитьь числа с нан и ничего плохого не будет 

In [197]:
df_catboost, cat_features = prepare_for_catboost(df_catboost,ordinal_cols, cat_nominal,bool_cols)
df_catboost.isna().sum()

respondent_id                    0
h1n1_concern                    92
h1n1_knowledge                 116
behavioral_antiviral_meds        0
behavioral_avoidance             0
behavioral_face_mask             0
behavioral_wash_hands            0
behavioral_large_gatherings      0
behavioral_outside_home          0
behavioral_touch_face            0
doctor_recc_h1n1                 0
doctor_recc_seasonal             0
chronic_med_condition            0
child_under_6_months             0
health_worker                    0
health_insurance                 0
opinion_h1n1_vacc_effective    391
opinion_h1n1_risk              388
opinion_h1n1_sick_from_vacc    395
opinion_seas_vacc_effective    462
opinion_seas_risk              514
opinion_seas_sick_from_vacc    537
age_group                        0
education                        0
race                             0
sex                              0
income_poverty                   0
marital_status                   0
rent_or_own         

In [198]:
y_h1n1_vaccine= df_catboost['h1n1_vaccine'].astype(int)
y_seasonal_vaccine = df_catboost['seasonal_vaccine'].astype(int)
df_catboost = df_catboost.drop(columns=['respondent_id', 'h1n1_vaccine','seasonal_vaccine'])
X = df_catboost

In [199]:
X

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,doctor_recc_both,knowledge_x_concern
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,__NA__,__NA__,0.0,0.0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0.0,6.0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,__NA__,...,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0.0,1.0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,__NA__,__NA__,0.0,1.0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,__NA__,__NA__,0.0,0.0
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea,1.0,2.0
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,Own,__NA__,lzgpxyit,"MSA, Not Principle City",0.0,0.0,__NA__,__NA__,0.0,4.0
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,__NA__,0.0,...,Rent,Employed,lrircsnp,Non-MSA,1.0,0.0,fcxhlnwr,haliazsg,0.0,1.0


In [200]:
X.isna().sum()

h1n1_concern                    92
h1n1_knowledge                 116
behavioral_antiviral_meds        0
behavioral_avoidance             0
behavioral_face_mask             0
behavioral_wash_hands            0
behavioral_large_gatherings      0
behavioral_outside_home          0
behavioral_touch_face            0
doctor_recc_h1n1                 0
doctor_recc_seasonal             0
chronic_med_condition            0
child_under_6_months             0
health_worker                    0
health_insurance                 0
opinion_h1n1_vacc_effective    391
opinion_h1n1_risk              388
opinion_h1n1_sick_from_vacc    395
opinion_seas_vacc_effective    462
opinion_seas_risk              514
opinion_seas_sick_from_vacc    537
age_group                        0
education                        0
race                             0
sex                              0
income_poverty                   0
marital_status                   0
rent_or_own                      0
employment_status   

In [201]:
#надо сделать два набора данных для катбуста и хдбуста

In [202]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve

# Индексы категориальных колонок для CatBoost
cat_features = [X.columns.get_loc(c) for c in cat_nominal if c in X.columns]

# Стратифицированная K-fold валидация
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=314)


In [205]:

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=314)
pruner = MedianPruner(n_warmup_steps=3)
cat_features = cat_nominal +bool_cols
def objective(trial):
    params = {
        'loss_function':'Logloss','eval_metric':'AUC',
        'task_type':'GPU','devices':'0','bootstrap_type':'Bayesian',
        'border_count':96,'allow_writing_files':False,
        'verbose':False,'early_stopping_rounds':150,'gpu_ram_part':0.9,
        'iterations': trial.suggest_int('iterations', 250, 600),
        'depth': 4,
        'learning_rate': 0.03,
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 3, 10),
        'random_strength': trial.suggest_float('random_strength', 0.5, 1.5),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 0.6),
        'random_state':314
    }
    scores=[]
    for tr, va in cv.split(X, y_h1n1_vaccine):
        train_pool = Pool(X.iloc[tr], y_h1n1_vaccine.iloc[tr], cat_features=cat_features )
        valid_pool = Pool(X.iloc[va], y_h1n1_vaccine.iloc[va], cat_features=cat_features)
        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=valid_pool)
        proba = model.predict_proba(valid_pool)[:,1]
        scores.append(roc_auc_score(y_h1n1_vaccine.iloc[va], proba))
    return float(np.mean(scores))

study = optuna.create_study(direction='maximize', pruner=pruner)
study.optimize(objective, timeout=1000)  # 30 минут максимум
print("Best:", study.best_trial.value, study.best_trial.params)


[I 2025-10-22 20:24:06,673] A new study created in memory with name: no-name-5d4f712c-23ed-41a0-8f0a-c0c2d138cfd4
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2025-10-22 20:26:39,478] Trial 0 finished with value: 0.8676789421036122 and parameters: {'iterations': 589, 'l2_leaf_reg': 7.537335916235188, 'random_strength': 0.6073189649813957, 'bagging_temperature': 0.2726446216285092}. Best is trial 0 with value: 0.8676789421036122.
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2025-10-22 20:27:35,152] Trial 1 finished with value: 0.866971044123952 and parameters: {'iterations': 340, 'l2_leaf_reg': 7.384251856260767, 'random_strength': 1.0283348835459702, 'bagg

Best: 0.8681294191943211 {'iterations': 466, 'l2_leaf_reg': 5.02239323156431, 'random_strength': 1.1436198894377447, 'bagging_temperature': 0.09985997546887115}
