In [4]:
import numpy as np
import pandas as pd

import lightgbm as lgb

from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import BaggingClassifier


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform, randint

In [5]:
train = pd.read_csv("train_processed.csv")
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13453 entries, 0 to 13452
Data columns (total 39 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          13453 non-null  int64  
 1   feature_01  13453 non-null  float64
 2   feature_02  11481 non-null  float64
 3   feature_03  13453 non-null  int64  
 4   feature_04  11433 non-null  float64
 5   feature_05  13453 non-null  int64  
 6   feature_06  11403 non-null  float64
 7   feature_07  13453 non-null  int64  
 8   feature_08  13453 non-null  int64  
 9   feature_09  13453 non-null  float64
 10  feature_10  13453 non-null  float64
 11  feature_11  13453 non-null  int64  
 12  feature_12  13453 non-null  int64  
 13  feature_13  13453 non-null  int64  
 14  feature_14  11414 non-null  float64
 15  feature_15  13453 non-null  float64
 16  feature_16  13453 non-null  float64
 17  feature_17  13453 non-null  float64
 18  feature_18  13453 non-null  float64
 19  feature_19  13453 non-nul

In [6]:
train

Unnamed: 0,id,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,...,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,target
0,6739,63.649596,156.0,966898,966930.0,386,,22,39202,86.700831,...,117.994612,0.668238,0.815287,,,Very_High,Medium,Cat_039,unknown,0
1,13633,1077.950732,1122.0,2401744,2401760.0,106,0.015,11,-999,103.129709,...,123.397491,0.868849,0.987009,0.023715,-0.046797,Very_High,Medium,Cat_033,Type_A,0
2,17450,775.697727,756.0,152117,152274.0,144,0.024,12,15517,94.270979,...,115.789506,0.579598,0.588482,0.031276,0.062870,High,Medium,Cat_000,Type_D,0
3,19017,1249.671295,,1154994,1155025.0,168,0.016,20,-999,91.376335,...,122.521749,0.508570,0.402647,0.017443,0.079964,Very_High,Medium,Cat_017,Type_A,0
4,12119,1297.659733,1296.0,467888,467902.0,97,0.008,14,9693,79.828246,...,103.576787,0.882215,0.849826,0.028379,0.003948,Very_High,Medium,Cat_019,Type A+,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13448,1293,809.689074,796.0,1388292,1388304.0,34,0.007,4,4409,103.023377,...,121.222489,1.016905,0.928651,0.011469,-0.028761,Very_High,Medium,Cat_043,unknown,1
13449,5148,43.010153,216.0,2087881,2087946.0,6515,0.257,123,684236,44.059624,...,120.892421,0.621096,0.517800,0.251611,0.186348,Medium,Low,Cat_048,type_b_special,1
13450,4528,749.519408,740.0,2195695,2195707.0,80,,17,9155,100.935469,...,121.705560,0.664470,0.603057,,,High,Medium,Cat_028,Type A+,0
13451,5817,1146.829348,1187.0,9228,9246.0,770,0.059,87,79690,85.636994,...,132.534438,0.186090,0.048947,0.062652,0.029598,Very_High,Medium,Cat_017,type_b_special,0


In [7]:
train = train.replace("unknown", np.nan)

In [8]:
def create_auto_preprocessor(X: pd.DataFrame):
    """
    Автоматически создает препроцессор с обработкой NaN
    """

    # Автоматически определяем типы колонок
    numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = X.select_dtypes(
        include=["object", "category"]
    ).columns.tolist()

    print(f"Числовые колонки: {numeric_features}")
    print(f"Категориальные колонки: {categorical_features}")

    # Проверяем наличие NaN
    numeric_with_nan = [
        col for col in numeric_features if X[col].isnull().any()
    ]
    categorical_with_nan = [
        col for col in categorical_features if X[col].isnull().any()
    ]

    print(f"Числовые колонки с NaN: {numeric_with_nan}")
    print(f"Категориальные колонки с NaN: {categorical_with_nan}")

    # Создаем трансформеры
    transformers = []

    transformers.append(
        (
            "num",
            Pipeline(
                [
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler()),
                ]
            ),
            numeric_features,
        )
    )

    transformers.append(
        (
            "cat",
            Pipeline(
                [
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    (
                        "encoder",
                        OneHotEncoder(handle_unknown="ignore"),
                    ),
                ]
            ),
            categorical_features,
        )
    )

    preprocessor = ColumnTransformer(transformers=transformers)
    return preprocessor

In [9]:
X = train.drop("target", axis=1)
Y = train["target"]

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=0.8, test_size=0.2, random_state=42
)

In [10]:
preprocessor = create_auto_preprocessor(X)

Числовые колонки: ['id', 'feature_01', 'feature_02', 'feature_03', 'feature_04', 'feature_05', 'feature_06', 'feature_07', 'feature_08', 'feature_09', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31', 'feature_32', 'feature_33']
Категориальные колонки: ['feature_34', 'feature_35', 'feature_36', 'feature_37']
Числовые колонки с NaN: ['feature_02', 'feature_04', 'feature_06', 'feature_14', 'feature_32', 'feature_33']
Категориальные колонки с NaN: ['feature_37']


In [None]:
import lightgbm as lgb
from sklearn.ensemble import BaggingClassifier

bagging_lgb = BaggingClassifier(
    estimator=lgb.LGBMClassifier(
        n_estimators=50,
        learning_rate=0.1,
        max_depth=6,
        reg_alpha=0.1,
        reg_lambda=3,
        random_state=42,
    ),
    n_estimators=10,  # 10 моделей в ансамбле
    max_samples=0.8,  # 80% данных для каждой модели
    max_features=0.8,  # 80% признаков для каждой модели
    bootstrap=True,  # Выборка с возвращением
    bootstrap_features=False,  # Без возвращения для признаков
    random_state=42,
    n_jobs=-1,  # Использовать все ядра
)


ml_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", bagging_lgb),
    ]
)

In [16]:
ml_pipeline.fit(X_train, Y_train)

# Оценка модели
train_score = ml_pipeline.score(X_train, Y_train)
test_score = ml_pipeline.score(X_test, Y_test)

print(f"Точность на тренировочных данных: {train_score:.3f}")
print(f"Точность на тестовых данных: {test_score:.3f}")

[LightGBM] [Info] Number of positive: 3959, number of negative: 6803
[LightGBM] [Info] Number of positive: 3959, number of negative: 6803
[LightGBM] [Info] Number of positive: 3959, number of negative: 6803
[LightGBM] [Info] Number of positive: 3959, number of negative: 6803
[LightGBM] [Info] Number of positive: 3959, number of negative: 6803
[LightGBM] [Info] Number of positive: 3959, number of negative: 6803
[LightGBM] [Info] Number of positive: 3959, number of negative: 6803
[LightGBM] [Info] Number of positive: 3959, number of negative: 6803
[LightGBM] [Info] Number of positive: 3959, number of negative: 6803
[LightGBM] [Info] Number of positive: 3959, number of negative: 6803
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.091270 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6339
[LightGBM] [Info] Number of data points in the train set: 10762, number of used features: 80
[LightGBM] [Info] Aut



Точность на тренировочных данных: 0.838
Точность на тестовых данных: 0.799




In [49]:
test = pd.read_csv("test_processed.csv")

In [50]:
test_pred = ml_pipeline.predict(test)



In [51]:
res_df = pd.DataFrame()

res_df["id"] = test["id"]
res_df["target"] = test_pred

res_df.to_csv("test_result.csv", index=False)