In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# 1. Загрузка данных (как в оригинале)
train = pd.read_csv('/kaggle/input/santander-customer-transaction-prediction/train.csv')
test = pd.read_csv('/kaggle/input/santander-customer-transaction-prediction/test.csv')
sample_submission = pd.read_csv('/kaggle/input/santander-customer-transaction-prediction/sample_submission.csv')

# 2. Подготовка данных
X = train.drop('target', axis=1)
y = train['target']
X_test = test

# 3. Разбиение
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 4. Удаление ID_code
X_train_clean = X_train.drop('ID_code', axis=1)
X_val_clean = X_val.drop('ID_code', axis=1)
X_test_clean = X_test.drop('ID_code', axis=1)

# 5. Оптимизированная модель
model = lgb.LGBMClassifier(
    n_estimators=400,              # уменьшили с 1000 до 400
    learning_rate=0.1,             # увеличили LR для скорости
    max_depth=3,                 # упростили деревья
    num_leaves=8,               # меньше листьев
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1,
    scale_pos_weight=2.0
)

# 6. Исправленный callback
callbacks = [lgb.early_stopping(stopping_rounds=20, verbose=False)]  # исправлено!


# 7. Обучение
model.fit(
    X_train_clean, y_train,
    eval_set=[(X_val_clean, y_val)],
    callbacks=callbacks
)

# 8. Прогноз и оценка
y_val_pred = model.predict_proba(X_val_clean)[:, 1]
print(f"AUC на валидации: {roc_auc_score(y_val, y_val_pred):.4f}")


# 9. Сабмишен
y_test_pred = model.predict_proba(X_test_clean)[:, 1]
submission = sample_submission.copy()
submission['target'] = y_test_pred
submission.to_csv('submission.csv', index=False)
print("Сабмишен сохранён как 'submission.csv'")


  if entities is not ():


AUC на валидации: 0.8814
Сабмишен сохранён как 'submission.csv'


In [2]:
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# 1. Загрузка данных (как в оригинале)
train = pd.read_csv('/kaggle/input/santander-customer-transaction-prediction/train.csv')
test = pd.read_csv('/kaggle/input/santander-customer-transaction-prediction/test.csv')
sample_submission = pd.read_csv('/kaggle/input/santander-customer-transaction-prediction/sample_submission.csv')


In [3]:
print(f"Всего образцов: {len(train)}")
print(f"Класс 0 (не совершат): {sum(train['target'] == 0)} ({sum(train['target'] == 0)/len(train):.1%})")
print(f"Класс 1 (совершат):   {sum(train['target'] == 1)} ({sum(train['target'] == 1)/len(train):.1%})")


Всего образцов: 200000
Класс 0 (не совершат): 179902 (90.0%)
Класс 1 (совершат):   20098 (10.0%)


In [4]:
features = train.drop(['ID_code', 'target'], axis=1)
print(f"Среднее по всем признакам: {features.mean().mean():.3f}")
print(f"Стд. отклонение: {features.std().mean():.3f}")
print(f"Мин: {features.min().min():.3f}, Макс: {features.max().max():.3f}")


Среднее по всем признакам: 6.767
Стд. отклонение: 4.495
Мин: -90.252, Макс: 74.032


In [5]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# 1. Загрузка данных
train = pd.read_csv('/kaggle/input/santander-customer-transaction-prediction/train.csv')
test = pd.read_csv('/kaggle/input/santander-customer-transaction-prediction/test.csv')
sample_submission = pd.read_csv('/kaggle/input/santander-customer-transaction-prediction/sample_submission.csv')

# 2. Подготовка данных
X = train.drop(['ID_code', 'target'], axis=1)
y = train['target']
X_test = test.drop('ID_code', axis=1)


# 3. Обработка выбросов (более щадящая)
for col in X.columns:
    low = np.percentile(X[col], 5)   # 5% вместо 1%
    high = np.percentile(X[col], 95)  # 95% вместо 99%
    X[col] = np.clip(X[col], low, high)
    X_test[col] = np.clip(X_test[col], low, high)


# 4. Улучшенная модель
def train_lgb_model(X_train, y_train, X_val, y_val, random_state):
    model = lgb.LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.03,
        max_depth=6,
        num_leaves=31,
        subsample=0.7,
        colsample_bytree=0.7,
        random_state=random_state,
        verbose=-1,
        scale_pos_weight=10.0,      # Эксперимент: 10.0
        reg_alpha=0.5,             # Усиленная L1
        reg_lambda=0.5,          # Усиленная L2
        min_child_samples=15,
        min_split_gain=0.1
    )
    callbacks = [lgb.early_stopping(stopping_rounds=50, verbose=False)]
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=callbacks
    )
    return model

# 5. Кросс‑валидация (3 фолда вместо 5)
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
seeds = [42, 101, 2023]
val_predictions = []
test_predictions = []

for seed in seeds:
    print(f"\nОбучение модели с random_state={seed}")
    fold_val_preds = np.zeros(len(X))
    fold_test_preds = np.zeros(len(X_test))
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_tr, X_vl = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_vl = y.iloc[train_idx], y.iloc[val_idx]
        
        model = train_lgb_model(X_tr, y_tr, X_vl, y_vl, seed)
        
        # Прогноз на валидацию
        fold_val_preds[val_idx] = model.predict_proba(X_vl)[:, 1]
        # Прогноз на тест
        fold_test_preds += model.predict_proba(X_test)[:, 1] / 3
    
    
    val_predictions.append(fold_val_preds)
    test_predictions.append(fold_test_preds)

# 6. Усреднение ансамбля
val_ensemble = np.mean(val_predictions, axis=0)
test_ensemble = np.mean(test_predictions, axis=0)

# 7. Итоговая оценка
print(f"\nAUC ансамбля (3 фолда × 3 модели): {roc_auc_score(y, val_ensemble):.4f}")

# 8. Сабмишен
submission = sample_submission.copy()
submission['target'] = test_ensemble
submission.to_csv('submission_ensemble.csv', index=False)
print("Сабмишен сохранён как 'submission_ensemble.csv'")



Обучение модели с random_state=42

Обучение модели с random_state=101

Обучение модели с random_state=2023

AUC ансамбля (3 фолда × 3 модели): 0.6786
Сабмишен сохранён как 'submission_ensemble.csv'


In [6]:
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# 1. Загрузка данных
train = pd.read_csv('/kaggle/input/santander-customer-transaction-prediction/train.csv')
test = pd.read_csv('/kaggle/input/santander-customer-transaction-prediction/test.csv')
sample_submission = pd.read_csv('/kaggle/input/santander-customer-transaction-prediction/sample_submission.csv')


# 2. Подготовка данных
X = train.drop('target', axis=1)
y = train['target']
X_test = test

# 3. Разбиение
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 4. Удаление ID_code
X_train_clean = X_train.drop('ID_code', axis=1)
X_val_clean = X_val.drop('ID_code', axis=1)
X_test_clean = X_test.drop('ID_code', axis=1)


# 5. Оптимизированная модель
model = lgb.LGBMClassifier(
    n_estimators=400,
    learning_rate=0.1,
    max_depth=3,
    num_leaves=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1,
    scale_pos_weight=2.0,        # Критическое исправление!
    min_child_samples=25,       # Ниже дефолта (20) → лучше ловит класс 1
    min_split_gain=0.02          # Фильтрует слабые расщепления
)

# 6. Callback
callbacks = [lgb.early_stopping(stopping_rounds=20, verbose=False)]

# 7. Обучение
model.fit(
    X_train_clean, y_train,
    eval_set=[(X_val_clean, y_val)],
    callbacks=callbacks
)

# 8. Прогноз и оценка
y_val_pred = model.predict_proba(X_val_clean)[:, 1]
print(f"AUC на валидации: {roc_auc_score(y_val, y_val_pred):.4f}")

# 9. Сабмишен
y_test_pred = model.predict_proba(X_test_clean)[:, 1]
submission = sample_submission.copy()
submission['target'] = y_test_pred
submission.to_csv('submission.csv', index=False)
print("Сабмишен сохранён как 'submission.csv'")


AUC на валидации: 0.8807
Сабмишен сохранён как 'submission.csv'


In [7]:
print("Класс 0:", (y == 0).sum())
print("Класс 1:", (y == 1).sum())
print("Соотношение:", (y == 0).sum() / (y == 1).sum())


Класс 0: 179902
Класс 1: 20098
Соотношение: 8.951238929246692


In [8]:
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import optuna

# 1. Загрузка данных (как в вашем коде)
train = pd.read_csv('/kaggle/input/santander-customer-transaction-prediction/train.csv')
test = pd.read_csv('/kaggle/input/santander-customer-transaction-prediction/test.csv')
sample_submission = pd.read_csv('/kaggle/input/santander-customer-transaction-prediction/sample_submission.csv')

X = train.drop('target', axis=1)
y = train['target']
X_test = test

# 2. Разбиение данных
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train_clean = X_train.drop('ID_code', axis=1)
X_val_clean = X_val.drop('ID_code', axis=1)
X_test_clean = X_test.drop('ID_code', axis=1)

# 3. Функция для Optuna
def objective(trial):
    # Пространство гиперпараметров для поиска
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 600),
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.2),
        'max_depth': trial.suggest_int('max_depth', 2, 5),
        'num_leaves': trial.suggest_int('num_leaves', 6, 12),
        'subsample': trial.suggest_float('subsample', 0.7, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 0.9),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 3.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 15, 30),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 0.1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 0.3),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 0.3)
    }
    
    # Модель с текущими параметрами
    model = lgb.LGBMClassifier(
        **params,
        random_state=42,
        verbose=-1
    )
    
    callbacks = [lgb.early_stopping(stopping_rounds=20, verbose=False)]
    
    model.fit(
        X_train_clean, y_train,
        eval_set=[(X_val_clean, y_val)],
        callbacks=callbacks
    )
    
    # Прогноз и оценка AUC
    y_val_pred = model.predict_proba(X_val_clean)[:, 1]
    auc = roc_auc_score(y_val, y_val_pred)
    
    return auc  # Optuna максимизирует это значение


# 4. Запуск оптимизации
study = optuna.create_study(direction='maximize')  # Максимизируем AUC
study.optimize(objective, n_trials=50)  # 50 итераций поиска

# 5. Лучшие параметры
print("Лучшие параметры:")
print(study.best_params)
print(f"Лучший AUC: {study.best_value:.4f}")


# 6. Обучение финальной модели на лучших параметрах
best_params = study.best_params
final_model = lgb.LGBMClassifier(**best_params, random_state=42, verbose=-1)
callbacks = [lgb.early_stopping(stopping_rounds=20, verbose=False)]

final_model.fit(
    X_train_clean, y_train,
    eval_set=[(X_val_clean, y_val)],
    callbacks=callbacks
)

# 7. Прогноз на тестовых данных
y_test_pred = final_model.predict_proba(X_test_clean)[:, 1]
submission = sample_submission.copy()
submission['target'] = y_test_pred
submission.to_csv('submission_optimized.csv', index=False)
print("Сабмишен сохранён как 'submission_optimized.csv'")


[32m[I 2025-12-19 14:00:17,658][0m A new study created in memory with name: no-name-fbe6df60-7884-4dc3-887f-764877355d68[0m
[32m[I 2025-12-19 14:00:37,473][0m Trial 0 finished with value: 0.8875845688179448 and parameters: {'n_estimators': 317, 'learning_rate': 0.1756068416157645, 'max_depth': 4, 'num_leaves': 9, 'subsample': 0.7572410491710418, 'colsample_bytree': 0.7781443928066546, 'scale_pos_weight': 2.2433480137199853, 'min_child_samples': 17, 'min_split_gain': 0.018806534165036238, 'reg_alpha': 0.1261631899632308, 'reg_lambda': 0.0934918291671601}. Best is trial 0 with value: 0.8875845688179448.[0m
[32m[I 2025-12-19 14:01:06,331][0m Trial 1 finished with value: 0.884600863110794 and parameters: {'n_estimators': 577, 'learning_rate': 0.07972193636048515, 'max_depth': 3, 'num_leaves': 12, 'subsample': 0.7465509979959813, 'colsample_bytree': 0.8473216454781287, 'scale_pos_weight': 1.58514790023857, 'min_child_samples': 29, 'min_split_gain': 0.055173960583834974, 'reg_alpha':

Лучшие параметры:
{'n_estimators': 494, 'learning_rate': 0.15107950126304337, 'max_depth': 5, 'num_leaves': 7, 'subsample': 0.8799379556368508, 'colsample_bytree': 0.8118393226565687, 'scale_pos_weight': 1.8795314651386235, 'min_child_samples': 20, 'min_split_gain': 0.012722550909687174, 'reg_alpha': 0.028944999664103868, 'reg_lambda': 0.2453631742032807}
Лучший AUC: 0.8923
Сабмишен сохранён как 'submission_optimized.csv'
