In [59]:

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, \
    HistGradientBoostingClassifier, StackingClassifier
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder



In [38]:
# Location of the input CSV, relative to the notebook's working directory.
DATA_PATH = "data/data.csv"

# Load the whole dataset into memory and preview the first rows.
# NOTE(review): the preview shows "?" placeholders (e.g. in workclass and
# occupation) that read_csv keeps as ordinary strings, so isna() does not
# count them. Confirm whether they should be parsed via na_values="?".
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90.0,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82.0,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66.0,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54.0,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41.0,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [39]:
# Print each column's dtype and non-null count to spot columns with gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             32048 non-null  float64
 1   workclass       30458 non-null  object 
 2   fnlwgt          32561 non-null  int64  
 3   education       32330 non-null  object 
 4   education.num   32561 non-null  int64  
 5   marital.status  32561 non-null  object 
 6   occupation      32561 non-null  object 
 7   relationship    32561 non-null  object 
 8   race            32561 non-null  object 
 9   sex             32561 non-null  object 
 10  capital.gain    32561 non-null  int64  
 11  capital.loss    32561 non-null  int64  
 12  hours.per.week  32561 non-null  int64  
 13  native.country  32561 non-null  object 
 14  income          32561 non-null  object 
dtypes: float64(1), int64(5), object(9)
memory usage: 3.7+ MB


Структура следующая:
age — Возраст
workclass — Тип занятости
fnlwgt — "примерная оценка количества людей, которое представляет каждая строка данных"
education — Уровень образования
education.num — Числовое представление уровня образования
marital.status — Семейное положение
occupation — Тип профессии
relationship — Статус отношений в семье
race — Расовая группа
sex — Пол
capital.gain — Капитальные прибыли
capital.loss — Капитальные убытки
hours.per.week — Часы работы в неделю
native.country — Страна происхождения
income — Доход

In [40]:
# Summary statistics for every column; include='all' adds the non-numeric
# columns (unique / top / freq) next to the numeric ones (mean / std / quantiles).
df.describe(include='all')

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
count,32048.0,30458,32561.0,32330,32561.0,32561,32561,32561,32561,32561,32561.0,32561.0,32561.0,32561,32561
unique,,9,,16,,7,15,6,5,2,,,,42,2
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,21258,,10431,,14976,4140,13193,27816,21790,,,,29170,24720
mean,38.563311,,189778.4,,10.080679,,,,,,1077.648844,87.30383,40.437456,,
std,13.62888,,105550.0,,2.57272,,,,,,7385.292085,402.960219,12.347429,,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117827.0,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178356.0,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,237051.0,,12.0,,,,,,0.0,0.0,45.0,,


Типы признаков:
age — числовой признак
workclass — категориальный признак
fnlwgt — числовой признак
education — категориальный признак
education.num — числовой признак
marital.status — категориальный признак
occupation — категориальный признак
relationship — категориальный признак
race — категориальный признак
sex — категориальный признак
capital.gain — числовой признак
capital.loss — числовой признак
hours.per.week — числовой признак
native.country — категориальный признак
income — целевой признак, категориальный: <=50K или >50K.

In [47]:
# Count missing (NaN) values per column.
df.isna().sum()

age                513
workclass         2103
fnlwgt               0
education          231
education.num        0
marital.status       0
occupation           0
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country       0
income               0
dtype: int64

In [42]:
# Separate the target column from the feature matrix.
X = df.drop(columns=['income'])
y = df['income']

# Route columns by dtype: numeric columns get median imputation, object
# (string) columns get most-frequent imputation followed by one-hot encoding.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

preprocess = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# Stratified 60 / 20 / 20 split: hold out 40 % of the rows first, then cut
# that hold-out in half to obtain the validation and test parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

In [43]:
def eval_model(name, model, X_train, y_train, X_valid, y_valid):
    """Fit ``model`` on the training split and score it on the validation split.

    The estimator is fitted in place, its validation predictions are scored
    with accuracy and weighted / macro F1, and a short report is printed.

    Parameters
    ----------
    name : label printed in the report and stored in the result.
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : data passed to ``model.fit``.
    X_valid, y_valid : data used for prediction and scoring.

    Returns
    -------
    dict
        Keys ``name``, ``model`` (the same, now fitted, object),
        ``accuracy``, ``f1_weighted`` and ``f1_macro``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)

    # Collect all three metrics in the order they are reported and returned.
    scores = {"accuracy": accuracy_score(y_valid, predictions)}
    for averaging in ("weighted", "macro"):
        scores[f"f1_{averaging}"] = f1_score(y_valid, predictions, average=averaging)

    accuracy = scores["accuracy"]
    weighted_f1 = scores["f1_weighted"]
    macro_f1 = scores["f1_macro"]

    print(f"{name}:")
    print(f"  accuracy      = {accuracy:.4f}")
    print(f"  f1_weighted   = {weighted_f1:.4f}")
    print(f"  f1_macro      = {macro_f1:.4f}")
    print()

    return {"name": name, "model": model, **scores}

In [44]:
def make_model_pipeline(estimator):
    """Wrap ``estimator`` in a pipeline that owns a private preprocessing copy.

    ``Pipeline`` stores the step objects it is given without copying them, so
    passing the same ``preprocess`` instance to several pipelines would make
    them share one transformer: fitting any pipeline would re-fit the
    preprocessing used by all the others. Cloning gives every pipeline an
    independent, unfitted copy with identical parameters.

    Parameters
    ----------
    estimator : the final classifier step.

    Returns
    -------
    Pipeline with steps ``"preprocess"`` and ``"model"``.
    """
    return Pipeline([
        ("preprocess", clone(preprocess)),
        ("model", estimator),
    ])


# Baseline ensembles, all seeded for reproducibility.
rf = make_model_pipeline(
    RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
)
et = make_model_pipeline(
    ExtraTreesClassifier(n_estimators=100, random_state=42, n_jobs=-1)
)
gb = make_model_pipeline(GradientBoostingClassifier(random_state=42))
hgb = make_model_pipeline(HistGradientBoostingClassifier(random_state=42))

# Report label -> pipeline, in the order the baselines are evaluated.
baseline_models = {
    "RandomForest (bagging)": rf,
    "ExtraTrees (bagging)": et,
    "GradientBoosting (boosting)": gb,
    "HistGradientBoosting (boosting)": hgb,
}

# Fit every baseline on the training split and score it on the validation split.
models_results = []
for label, pipeline in baseline_models.items():
    models_results.append(
        eval_model(label, pipeline, X_train, y_train, X_valid, y_valid)
    )

RandomForest (bagging):
  accuracy      = 0.8480
  f1_weighted   = 0.8428
  f1_macro      = 0.7786

ExtraTrees (bagging):
  accuracy      = 0.8274
  f1_weighted   = 0.8230
  f1_macro      = 0.7524

GradientBoosting (boosting):
  accuracy      = 0.8578
  f1_weighted   = 0.8505
  f1_macro      = 0.7867

HistGradientBoosting (boosting):
  accuracy      = 0.8655
  f1_weighted   = 0.8609
  f1_macro      = 0.8041



In [45]:
# Pick the baseline with the highest weighted F1 on the validation split
# (on ties, max() keeps the first one in evaluation order).
best_result = max(models_results, key=lambda entry: entry["f1_weighted"])
best_model = best_result["model"]

print("Лучшая модель на валидации:")
print(best_result["name"])
for metric in ("accuracy", "f1_weighted", "f1_macro"):
    print(f"{metric} = {best_result[metric]:.4f}")


Лучшая модель на валидации:
HistGradientBoosting (boosting)
accuracy = 0.8655
f1_weighted = 0.8609
f1_macro = 0.8041


In [46]:
# Make sure the selected pipeline is fitted on the training split before
# measuring how much shuffling each raw input column hurts it on validation.
best_model.fit(X_train, y_train)

r = permutation_importance(
    best_model, X_valid, y_valid,
    n_repeats=10, random_state=42, n_jobs=-1,
)

# Importances are reported per original input column, because the whole
# pipeline (preprocessing included) is the estimator being permuted against.
feature_names = X_valid.columns

perm_imp = pd.DataFrame({
    "feature": feature_names,
    "importance_mean": r.importances_mean,
    "importance_std": r.importances_std,
})
perm_imp = perm_imp.sort_values("importance_mean", ascending=False)

perm_imp.head(20)


len(feature_names) = 14
len(importances_mean) = 14


Unnamed: 0,feature,importance_mean,importance_std
5,marital.status,0.048679,0.002242
10,capital.gain,0.045623,0.001672
4,education.num,0.03059,0.003222
0,age,0.01637,0.00159
6,occupation,0.013621,0.003037
11,capital.loss,0.01101,0.001257
12,hours.per.week,0.005851,0.001469
7,relationship,0.00476,0.001165
2,fnlwgt,0.001797,0.001086
1,workclass,0.001705,0.000945


In [61]:
# Pipelines to tune, keyed by the short name used in the reports.
candidates = dict(
    RandomForest=rf,
    ExtraTrees=et,
    GradientBoosting=gb,
    HistGradientBoosting=hgb,
)

# Each model is tuned over the same ensemble sizes: 50, 60, ..., 300.
# HistGradientBoostingClassifier takes that size as `max_iter`; the other
# three estimators take it as `n_estimators`.
param_grids = {
    model_name: {f"model__{size_param}": list(range(50, 301, 10))}
    for model_name, size_param in (
        ("RandomForest", "n_estimators"),
        ("ExtraTrees", "n_estimators"),
        ("GradientBoosting", "n_estimators"),
        ("HistGradientBoosting", "max_iter"),
    )
}

In [52]:
def tune_model(name, base_model, param_grid, X_train, y_train, X_valid, y_valid):
    """Grid-search ``base_model`` on the training split and score the winner.

    Runs a 3-fold ``GridSearchCV`` optimising weighted F1 over ``param_grid``,
    then evaluates the best estimator found on the validation split and
    prints a short report.

    Parameters
    ----------
    name : label printed in the report and stored in the result.
    base_model : estimator handed to ``GridSearchCV``.
    param_grid : parameter grid handed to ``GridSearchCV``.
    X_train, y_train : data the search is fitted on.
    X_valid, y_valid : data the best estimator is scored on.

    Returns
    -------
    dict
        Keys ``name``, ``best_params``, ``cv_f1_weighted``,
        ``valid_accuracy``, ``valid_f1_weighted``, ``valid_f1_macro`` and
        ``best_model``.
    """
    search = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        scoring="f1_weighted",
        cv=3,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    tuned = search.best_estimator_
    valid_pred = tuned.predict(X_valid)

    summary = {
        "name": name,
        "best_params": search.best_params_,
        "cv_f1_weighted": search.best_score_,
        "valid_accuracy": accuracy_score(y_valid, valid_pred),
        "valid_f1_weighted": f1_score(y_valid, valid_pred, average="weighted"),
        "valid_f1_macro": f1_score(y_valid, valid_pred, average="macro"),
        "best_model": tuned,
    }

    print(f"\n=== GridSearch for {name} ===")
    print("best_params:", summary["best_params"])
    print(f"best_cv_score (f1_weighted): {summary['cv_f1_weighted']:.4f}")
    print(f"valid accuracy = {summary['valid_accuracy']:.4f}")
    print(f"valid f1_weighted = {summary['valid_f1_weighted']:.4f}")
    print(f"valid f1_macro    = {summary['valid_f1_macro']:.4f}")

    return summary

In [53]:
tuned_results = []

# Tune every candidate on its own grid. Results are appended one by one, so
# whatever was tuned before a failing model stays available in the list.
for name, model in candidates.items():
    tuned_results.append(
        tune_model(
            name=name,
            base_model=model,
            param_grid=param_grids[name],
            X_train=X_train,
            y_train=y_train,
            X_valid=X_valid,
            y_valid=y_valid,
        )
    )


=== GridSearch for RandomForest ===
best_params: {'model__n_estimators': 180}
best_cv_score (f1_weighted): 0.8512
valid accuracy = 0.8489
valid f1_weighted = 0.8437
valid f1_macro    = 0.7798

=== GridSearch for ExtraTrees ===
best_params: {'model__n_estimators': 260}
best_cv_score (f1_weighted): 0.8273
valid accuracy = 0.8272
valid f1_weighted = 0.8230
valid f1_macro    = 0.7526

=== GridSearch for GradientBoosting ===
best_params: {'model__n_estimators': 290}
best_cv_score (f1_weighted): 0.8685
valid accuracy = 0.8659
valid f1_weighted = 0.8602
valid f1_macro    = 0.8017


ValueError: Invalid parameter 'n_estimators' for estimator HistGradientBoostingClassifier(random_state=42). Valid parameters are: ['categorical_features', 'class_weight', 'early_stopping', 'interaction_cst', 'l2_regularization', 'learning_rate', 'loss', 'max_bins', 'max_depth', 'max_features', 'max_iter', 'max_leaf_nodes', 'min_samples_leaf', 'monotonic_cst', 'n_iter_no_change', 'random_state', 'scoring', 'tol', 'validation_fraction', 'verbose', 'warm_start'].

In [54]:
# The tuned model with the highest weighted F1 on the validation split wins.
best_tuned = max(tuned_results, key=lambda entry: entry["valid_f1_weighted"])
best_model_final = best_tuned["best_model"]

print("\n=== Best tuned model on validation ===")
print("Classifier:", best_tuned["name"])
print("Best params:", best_tuned["best_params"])
# (report label, result key) pairs, printed in this order.
for label, key in (
    ("CV f1_weighted", "cv_f1_weighted"),
    ("Valid accuracy", "valid_accuracy"),
    ("Valid f1_weighted", "valid_f1_weighted"),
    ("Valid f1_macro", "valid_f1_macro"),
):
    print(f"{label} = {best_tuned[key]:.4f}")


=== Best tuned model on validation ===
Classifier: GradientBoosting
Best params: {'model__n_estimators': 290}
CV f1_weighted = 0.8685
Valid accuracy = 0.8659
Valid f1_weighted = 0.8602
Valid f1_macro = 0.8017


In [55]:
# Model selection is finished, so the validation rows can join the training
# data for the final fit; the test split stays untouched until scoring.
X_train_full = pd.concat([X_train, X_valid], axis=0)
y_train_full = pd.concat([y_train, y_valid], axis=0)

best_tuned = max(tuned_results, key=lambda d: d["valid_f1_weighted"])
best_single_name = best_tuned["name"]

print("Лучшая одиночная модель по валидации:", best_single_name)
print("Лучшие параметры:", best_tuned["best_params"])

# Refit an unfitted copy rather than the object stored in `tuned_results`.
# Fitting that object in place would silently replace the model whose
# validation scores were recorded with one that has already seen the
# validation rows. The clone keeps the same hyper-parameters and seed.
best_single_model = clone(best_tuned["best_model"])
best_single_model.fit(X_train_full, y_train_full)

y_test_pred_single = best_single_model.predict(X_test)
acc_test_single = accuracy_score(y_test, y_test_pred_single)
f1w_test_single = f1_score(y_test, y_test_pred_single, average="weighted")
f1m_test_single = f1_score(y_test, y_test_pred_single, average="macro")

print("\n=== Single best model on TEST ===")
print(f"accuracy      = {acc_test_single:.4f}")
print(f"f1_weighted   = {f1w_test_single:.4f}")
print(f"f1_macro      = {f1m_test_single:.4f}")

Лучшая одиночная модель по валидации: GradientBoosting
Лучшие параметры: {'model__n_estimators': 290}

=== Single best model on TEST ===
accuracy      = 0.8678
f1_weighted   = 0.8640
f1_macro      = 0.8092


In [56]:
# Number of best tuned models that feed the stacking ensemble.
top_k = 3

# Rank the tuned models by validation weighted F1, best first.
sorted_results = sorted(
    tuned_results,
    key=lambda entry: entry["valid_f1_weighted"],
    reverse=True,
)

# StackingClassifier expects (name, estimator) pairs.
base_estimators = []
for entry in sorted_results[:top_k]:
    base_estimators.append((entry["name"], entry["best_model"]))

print("\nБазовые модели для стекинга:")
for estimator_name, _ in base_estimators:
    print(" -", estimator_name)



Базовые модели для стекинга:
 - GradientBoosting
 - RandomForest
 - ExtraTrees


In [60]:
# Logistic regression combines the base models' predictions into the final one.
meta_learner = LogisticRegression(max_iter=1000)

stack_clf = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_learner,
    n_jobs=-1,
)
stack_clf.fit(X_train_full, y_train_full)

# Score the ensemble on the held-out test split.
y_test_pred_stack = stack_clf.predict(X_test)
acc_test_stack = accuracy_score(y_test, y_test_pred_stack)
f1w_test_stack, f1m_test_stack = (
    f1_score(y_test, y_test_pred_stack, average=averaging)
    for averaging in ("weighted", "macro")
)

print("\n=== Stacking model on TEST ===")
print(f"accuracy      = {acc_test_stack:.4f}")
print(f"f1_weighted   = {f1w_test_stack:.4f}")
print(f"f1_macro      = {f1m_test_stack:.4f}")



=== Stacking model on TEST ===
accuracy      = 0.8686
f1_weighted   = 0.8650
f1_macro      = 0.8108


In [62]:
print("\n=== Сравнение одиночной модели и стекинга (TEST) ===")

# One row per model: (label padded to a common width, accuracy, weighted F1, macro F1).
test_scores = [
    ("Single best", acc_test_single, f1w_test_single, f1m_test_single),
    ("Stacking   ", acc_test_stack, f1w_test_stack, f1m_test_stack),
]
for label, acc, f1w, f1m in test_scores:
    print(f"{label}: accuracy={acc:.4f}, f1_weighted={f1w:.4f}, f1_macro={f1m:.4f}")


=== Сравнение одиночной модели и стекинга (TEST) ===
Single best: accuracy=0.8678, f1_weighted=0.8640, f1_macro=0.8092
Stacking   : accuracy=0.8686, f1_weighted=0.8650, f1_macro=0.8108
