In [150]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns   
from catboost import CatBoostClassifier, Pool
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import optuna
from optuna.pruners import MedianPruner
from catboost import Pool
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold


In [119]:
#%pip install -U category_encoders
#%pip install optuna

For this competition, there are two target variables:

- h1n1_vaccine - Whether respondent received H1N1 flu vaccine.

- seasonal_vaccine - Whether respondent received seasonal flu vaccine.

Both are binary variables: 0 = No; 1 = Yes. Some respondents didn't get either vaccine, others got only one, and some got both. This is formulated as a multilabel (and not multiclass) problem

The features in this dataset

You are provided a dataset with 36 columns. The first column respondent_id is a unique and random identifier. The remaining 35 features are described below.

For all binary variables: 0 = No; 1 = Yes.

- h1n1_concern - Level of concern about the H1N1 flu.
0 = Not at all concerned; 
1 = Not very concerned; 
2 = Somewhat concerned; 
3 = Very concerned.

- h1n1_knowledge - Level of knowledge about H1N1 flu.
0 = No knowledge; 1 = A little knowledge; 2 = A lot of knowledge.

- behavioral_antiviral_meds - Has taken antiviral medications. (binary)

- behavioral_avoidance - Has avoided close contact with others with flu-like symptoms. (binary)

- behavioral_face_mask - Has bought a face mask. (binary)

- behavioral_wash_hands - Has frequently washed hands or used hand sanitizer. (binary)

- behavioral_large_gatherings - Has reduced time at large gatherings. (binary)

- behavioral_outside_home - Has reduced contact with people outside of own household. (binary)

- behavioral_touch_face - Has avoided touching eyes, nose, or mouth. (binary)

- doctor_recc_h1n1 - H1N1 flu vaccine was recommended by doctor. (binary)

- doctor_recc_seasonal - Seasonal flu vaccine was recommended by doctor. (binary)

- chronic_med_condition - Has any of the following chronic medical conditions: asthma or an other lung condition, diabetes, a heart condition, a kidney condition, sickle cell anemia or other anemia, a neurological or neuromuscular condition, a liver condition, or a weakened immune system caused by a chronic illness or by medicines taken for a chronic illness. (binary)

- child_under_6_months - Has regular close contact with a child under the age of six months. (binary)
health_worker - Is a healthcare worker. (binary)
health_insurance - Has health insurance. (binary)
opinion_h1n1_vacc_effective - Respondent's opinion about H1N1 vaccine effectiveness.
1 = Not at all effective; 2 = Not very effective; 3 = Don't know; 4 = Somewhat effective; 5 = Very effective.

- opinion_h1n1_risk - Respondent's opinion about risk of getting sick with H1N1 flu without vaccine.
1 = Very Low; 2 = Somewhat low; 3 = Don't know; 4 = Somewhat high; 5 = Very high.

- opinion_h1n1_sick_from_vacc - Respondent's worry of getting sick from taking H1N1 vaccine.
1 = Not at all worried; 2 = Not very worried; 3 = Don't know; 4 = Somewhat worried; 5 = Very worried.

- opinion_seas_vacc_effective - Respondent's opinion about seasonal flu vaccine effectiveness.
1 = Not at all effective; 2 = Not very effective; 3 = Don't know; 4 = Somewhat effective; 5 = Very effective.

- opinion_seas_risk - Respondent's opinion about risk of getting sick with seasonal flu without vaccine.
1 = Very Low; 2 = Somewhat low; 3 = Don't know; 4 = Somewhat high; 5 = Very high.

- opinion_seas_sick_from_vacc - Respondent's worry of getting sick from taking seasonal flu vaccine.
1 = Not at all worried; 2 = Not very worried; 3 = Don't know; 4 = Somewhat worried; 5 = Very worried.

- age_group - Age group of respondent.

- education - Self-reported education level.

- race - Race of respondent.

- sex - Sex of respondent.

- income_poverty - Household annual income of respondent with respect to 2008 Census poverty thresholds.

- marital_status - Marital status of respondent.

- rent_or_own - Housing situation of respondent.

- employment_status - Employment status of respondent.

- hhs_geo_region - Respondent's residence using a 10-region geographic classification defined by the U.S. Dept. of Health and Human Services. Values are represented as short random character strings.
census_msa - Respondent's residence within metropolitan statistical areas (MSA) as defined by the U.S. Census.

- household_adults - Number of other adults in household, top-coded to 3.

- household_children - Number of children in household, top-coded to 3.

- employment_industry - Type of industry respondent is employed in. Values are represented as short random character strings.

- employment_occupation - Type of occupation of respondent. Values are represented as short random character strings.



In [120]:
submission_format = pd.read_csv('data/submission_format.csv')  
submission_format.head(3)

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.5,0.7
1,26708,0.5,0.7
2,26709,0.5,0.7


submission_format в целом для нас как я понял это бесполезный файл, с ним работать не прийдется. Это файл пример для сдачи как я понял

In [121]:
training_set_features = pd.read_csv('data/training_set_features.csv')  
training_set_labels = pd.read_csv('data/training_set_labels.csv')  
print(len(training_set_features), len(training_set_labels))
len_tr_set = len(training_set_features)

26707 26707


In [122]:
training_set_features.head(3)

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo


In [123]:
training_set_labels.head(3)

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0


In [124]:
training_set_labels.isna().sum()

respondent_id       0
h1n1_vaccine        0
seasonal_vaccine    0
dtype: int64

In [125]:
training_set_features.isna().sum()

respondent_id                      0
h1n1_concern                      92
h1n1_knowledge                   116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_h1n1                2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_h1n1_vacc_effective      391
opinion_h1n1_risk                388
opinion_h1n1_sick_from_vacc      395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

явно видно что у нас большое количество фичей с Nan значениеми, так что надо от них избавиться

In [126]:
for i in training_set_features.columns:
    t = training_set_features[i].isna().sum() / len_tr_set
    if t < 0.03: 
        print(f"{i} ---- 1 ---- {training_set_features[i].isna().sum() / len_tr_set:.3f}%")
    elif t < 0.1:
        print(f"{i} ---- 2 ---- {training_set_features[i].isna().sum() / len_tr_set:.3f}%")
    else:
        print(f"{i} ---- 3 ---- {training_set_features[i].isna().sum() / len_tr_set:.3f}% !!!!!!!!!!!!")

respondent_id ---- 1 ---- 0.000%
h1n1_concern ---- 1 ---- 0.003%
h1n1_knowledge ---- 1 ---- 0.004%
behavioral_antiviral_meds ---- 1 ---- 0.003%
behavioral_avoidance ---- 1 ---- 0.008%
behavioral_face_mask ---- 1 ---- 0.001%
behavioral_wash_hands ---- 1 ---- 0.002%
behavioral_large_gatherings ---- 1 ---- 0.003%
behavioral_outside_home ---- 1 ---- 0.003%
behavioral_touch_face ---- 1 ---- 0.005%
doctor_recc_h1n1 ---- 2 ---- 0.081%
doctor_recc_seasonal ---- 2 ---- 0.081%
chronic_med_condition ---- 2 ---- 0.036%
child_under_6_months ---- 2 ---- 0.031%
health_worker ---- 2 ---- 0.030%
health_insurance ---- 3 ---- 0.460% !!!!!!!!!!!!
opinion_h1n1_vacc_effective ---- 1 ---- 0.015%
opinion_h1n1_risk ---- 1 ---- 0.015%
opinion_h1n1_sick_from_vacc ---- 1 ---- 0.015%
opinion_seas_vacc_effective ---- 1 ---- 0.017%
opinion_seas_risk ---- 1 ---- 0.019%
opinion_seas_sick_from_vacc ---- 1 ---- 0.020%
age_group ---- 1 ---- 0.000%
education ---- 2 ---- 0.053%
race ---- 1 ---- 0.000%
sex ---- 1 ---- 0.000

employment_occupation и employment_industry, health_insurance  тк у них +- 50% Nan я удаляю

In [127]:
df = pd.merge(training_set_features, training_set_labels, on='respondent_id', how='inner')
df.head(5)


Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_vaccine,seasonal_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0,1
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,,0,1
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0,0


In [128]:
df = df.drop(columns=['respondent_id', 'employment_occupation', 'employment_industry', 'health_insurance'])
df.head(3)

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,h1n1_vaccine,seasonal_vaccine
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,0,0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,0,1
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,0,0


In [129]:
df['h1n1_vaccine'].value_counts()

h1n1_vaccine
0    21033
1     5674
Name: count, dtype: int64

In [130]:
df.isna().sum()

h1n1_concern                     92
h1n1_knowledge                  116
behavioral_antiviral_meds        71
behavioral_avoidance            208
behavioral_face_mask             19
behavioral_wash_hands            42
behavioral_large_gatherings      87
behavioral_outside_home          82
behavioral_touch_face           128
doctor_recc_h1n1               2160
doctor_recc_seasonal           2160
chronic_med_condition           971
child_under_6_months            820
health_worker                   804
opinion_h1n1_vacc_effective     391
opinion_h1n1_risk               388
opinion_h1n1_sick_from_vacc     395
opinion_seas_vacc_effective     462
opinion_seas_risk               514
opinion_seas_sick_from_vacc     537
age_group                         0
education                      1407
race                              0
sex                               0
income_poverty                 4423
marital_status                 1408
rent_or_own                    2042
employment_status           

In [None]:
target_col = ['h1n1_vaccine', 'seasonal_vaccine']  # или 'seasonal_vaccine'

ordinal_cols = [
    'h1n1_concern','h1n1_knowledge',
    'opinion_h1n1_risk','opinion_h1n1_vacc_effective',
    'opinion_seas_risk','opinion_seas_vacc_effective',
    'opinion_h1n1_sick_from_vacc','opinion_seas_sick_from_vacc'
]
bool_cols = [
    'behavioral_antiviral_meds','behavioral_avoidance','behavioral_face_mask',
    'behavioral_wash_hands','behavioral_large_gatherings','behavioral_outside_home',
    'behavioral_touch_face','doctor_recc_h1n1','doctor_recc_seasonal',
    'chronic_med_condition','child_under_6_months','health_worker','health_insurance'
]
cat_nominal = [
    'age_group','education','race','sex','income_poverty',
    'marital_status','rent_or_own','employment_status',
    'hhs_geo_region','census_msa'
]


# новый признак"обе рекомендации врача"
df['doctor_recc_both'] = ((df['doctor_recc_h1n1']==1) & (df['doctor_recc_seasonal']==1)).astype('float')
# новый признак "знание × беспокойство"
df['knowledge_x_concern'] = df['h1n1_knowledge'] * df['h1n1_concern']


In [161]:
df_free_nan = df.copy()
df_free_nan = df_free_nan.dropna()

y_h1n1_vaccine= df_free_nan['h1n1_vaccine'].astype(int)
y_seasonal_vaccine = df_free_nan['seasonal_vaccine'].astype(int)
df_free_nan = df_free_nan.drop(columns=['h1n1_vaccine','seasonal_vaccine'])
X_free_nan = df_free_nan

In [162]:
X_free_nan

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,doctor_recc_both,knowledge_x_concern
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,0.0,0.0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,0.0,6.0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,0.0,1.0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,0.0,2.0
5,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,atmpeygn,"MSA, Principle City",2.0,3.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26700,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,"> $75,000",Married,Own,Not in Labor Force,lzgpxyit,"MSA, Principle City",1.0,0.0,0.0,3.0
26701,2.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,"> $75,000",Not Married,Rent,Not in Labor Force,fpwskwrf,"MSA, Principle City",3.0,0.0,0.0,4.0
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,0.0,0.0
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,"<= $75,000, Above Poverty",Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,1.0,2.0


In [148]:
X_free_nan.isna().sum()

h1n1_concern                   0
h1n1_knowledge                 0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_h1n1               0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
opinion_h1n1_vacc_effective    0
opinion_h1n1_risk              0
opinion_h1n1_sick_from_vacc    0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
income_poverty                 0
marital_status                 0
rent_or_own                    0
employment_status              0
hhs_geo_region                 0
census_msa                     0
household_

In [149]:
#надо сделать два набора данных для катбуста и хдбуста

In [142]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve

# Индексы категориальных колонок для CatBoost
cat_features = [X.columns.get_loc(c) for c in cat_nominal if c in X.columns]

# Стратифицированная K-fold валидация
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=314)


In [None]:

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=314)
pruner = MedianPruner(n_warmup_steps=3)

def objective(trial):
    params = {
        'loss_function':'Logloss','eval_metric':'AUC',
        'task_type':'GPU','devices':'0','bootstrap_type':'Bayesian',
        'border_count':96,'allow_writing_files':False,
        'verbose':False,'early_stopping_rounds':150,'gpu_ram_part':0.9,
        'iterations': trial.suggest_int('iterations', 250, 600),
        'depth': 4,
        'learning_rate': 0.03,
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 3, 10),
        'random_strength': trial.suggest_float('random_strength', 0.5, 1.5),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 0.6),
        'random_state':314
    }
    scores=[]
    for tr, va in cv.split(X, y):
        train_pool = Pool(X.iloc[tr], y.iloc[tr], cat_features=cat_features)
        valid_pool = Pool(X.iloc[va], y.iloc[va], cat_features=cat_features)
        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=valid_pool)
        proba = model.predict_proba(valid_pool)[:,1]
        scores.append(roc_auc_score(y.iloc[va], proba))
    return float(np.mean(scores))

study = optuna.create_study(direction='maximize', pruner=pruner)
study.optimize(objective, timeout=1800)  # 30 минут максимум
print("Best:", study.best_trial.value, study.best_trial.params)


[I 2025-10-22 17:28:37,228] A new study created in memory with name: no-name-4e042d77-4e5b-4b77-9032-9215ac3d8a79
Default metric period is 5 because AUC is/are not implemented for GPU
[W 2025-10-22 17:28:43,101] Trial 0 failed with parameters: {'iterations': 294, 'l2_leaf_reg': 9.416406100709356, 'random_strength': 1.27814777813718, 'bagging_temperature': 0.5828983457094133} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "c:\Users\erik0\AppData\Local\Programs\Python\Python310\lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\erik0\AppData\Local\Temp\ipykernel_84188\2263429841.py", line 28, in objective
    model.fit(train_pool, eval_set=valid_pool)
  File "c:\Users\erik0\AppData\Local\Programs\Python\Python310\lib\site-packages\catboost\core.py", line 5245, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, N

KeyboardInterrupt: 

In [144]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,stratify=y, random_state=314)

In [145]:
bast_catboost_peram = {'iterations': 494, 
                       'l2_leaf_reg': 4.166584533331385, 
                       'random_strength': 1.3961843651116557,
                         'bagging_temperature': 0.5731602631072922}


In [None]:
X_cat

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,opinion_seas_risk_is_na,opinion_seas_vacc_effective_is_na,opinion_h1n1_sick_from_vacc_is_na,opinion_seas_sick_from_vacc_is_na,concern_risk_sum,concern_risk_mean,vacc_effective_sum,doctor_recc_both,knowledge_x_concern,opinion_na_count
0,1.0,0.0,,,,,,,,,...,0.0,0.0,0.0,0.0,3.0,1.000000,5.0,0.0,0.0,0
1,3.0,2.0,,,,,,,,,...,0.0,0.0,0.0,0.0,9.0,3.000000,9.0,0.0,6.0,0
2,1.0,1.0,,,,,,,,,...,0.0,0.0,0.0,0.0,3.0,1.000000,7.0,0.0,1.0,0
3,1.0,1.0,,,,,,,,,...,0.0,0.0,0.0,0.0,8.0,2.666667,8.0,0.0,1.0,0
4,2.0,1.0,,,,,,,,,...,0.0,0.0,0.0,0.0,6.0,2.000000,6.0,0.0,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,,,,,,,,,...,0.0,0.0,0.0,0.0,5.0,1.666667,8.0,0.0,0.0,0
26703,1.0,2.0,,,,,,,,,...,0.0,0.0,0.0,0.0,4.0,1.333333,9.0,0.0,2.0,0
26704,2.0,2.0,,,,,,,,,...,0.0,0.0,0.0,0.0,10.0,3.333333,9.0,0.0,4.0,0
26705,1.0,1.0,,,,,,,,,...,0.0,0.0,0.0,0.0,3.0,1.000000,5.0,0.0,1.0,0


In [None]:
idx_train, idx_temp = train_test_split(X.index, test_size=0.4, stratify=y, random_state=314)
idx_val, idx_test = train_test_split(idx_temp, test_size=0.5, stratify=y.loc[idx_temp], random_state=314)

Xcat_train, y_train = X_cat.loc[idx_train], y.loc[idx_train]
Xcat_val,   y_val   = X_cat.loc[idx_val],   y.loc[idx_val]
Xcat_test,  y_test  = X_cat.loc[idx_test],  y.loc[idx_test]

Xxgb_train = X_xgb.loc[idx_train]
Xxgb_val   = X_xgb.loc[idx_val]
Xxgb_test  = X_xgb.loc[idx_test]

In [None]:
posw = (y_train==0).sum() / (y_train==1).sum()

params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=4,
    min_child_weight=7,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=2.0,
    reg_alpha=0.4,
    gamma=1.0,
    eta=0.04,               # аналог learning_rate
    n_estimators=1500,      # с ранней остановкой
    tree_method='hist',     # быстро на CPU
    n_jobs=-1,
    random_state=314,
    scale_pos_weight=posw   # для дисбаланса
)

xgb = XGBClassifier(**params)
xgb.fit(Xcat_train, y_train)


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:age_group: object, education: object, race: object, sex: object, income_poverty: object, marital_status: object, rent_or_own: object, employment_status: object, hhs_geo_region: object, census_msa: object

In [None]:
y_proba = xgb.predict_proba(X_test)[:, 1]
ths = np.linspace(0.1, 0.9, 100)
f1s = [f1_score(y_test, (y_proba >= t).astype(int)) for t in ths]
best_t = ths[int(np.argmax(f1s))]
print("Best threshold:", best_t, "F1:", max(f1s), "AUC:", roc_auc_score(y_test, y_proba, average="macro"))