In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
import xgboost as xgb
import lightgbm as lgb
import optuna

In [2]:
# Загрузка данных
train = pd.read_csv('/kaggle/input/playground-series-s5e11/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv')

In [3]:
train.head()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593994 entries, 0 to 593993
Data columns (total 13 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    593994 non-null  int64  
 1   annual_income         593994 non-null  float64
 2   debt_to_income_ratio  593994 non-null  float64
 3   credit_score          593994 non-null  int64  
 4   loan_amount           593994 non-null  float64
 5   interest_rate         593994 non-null  float64
 6   gender                593994 non-null  object 
 7   marital_status        593994 non-null  object 
 8   education_level       593994 non-null  object 
 9   employment_status     593994 non-null  object 
 10  loan_purpose          593994 non-null  object 
 11  grade_subgrade        593994 non-null  object 
 12  loan_paid_back        593994 non-null  float64
dtypes: float64(5), int64(2), object(6)
memory usage: 58.9+ MB


In [5]:
train.describe()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,loan_paid_back
count,593994.0,593994.0,593994.0,593994.0,593994.0,593994.0,593994.0
mean,296996.5,48212.202976,0.120696,680.916009,15020.297629,12.356345,0.79882
std,171471.442235,26711.942078,0.068573,55.424956,6926.530568,2.008959,0.400883
min,0.0,6002.43,0.011,395.0,500.09,3.2,0.0
25%,148498.25,27934.4,0.072,646.0,10279.62,10.99,1.0
50%,296996.5,46557.68,0.096,682.0,15000.22,12.37,1.0
75%,445494.75,60981.32,0.156,719.0,18858.58,13.68,1.0
max,593993.0,393381.74,0.627,849.0,48959.95,20.99,1.0


In [6]:
train.columns

Index(['id', 'annual_income', 'debt_to_income_ratio', 'credit_score',
       'loan_amount', 'interest_rate', 'gender', 'marital_status',
       'education_level', 'employment_status', 'loan_purpose',
       'grade_subgrade', 'loan_paid_back'],
      dtype='object')

income_to_loan — во сколько раз доход больше суммы кредита.
Пример: доход 50 000 ₽, кредит 25 000 ₽ → income_to_loan = 2.
Чем выше — тем лучше (заёмщик платёжеспособен).

payment_capacity — сколько денег остаётся после учёта долгов.
Формула: доход × (1 − доля долгов).
Пример: доход 60 000 ₽, долги 20% → 60 000 × 0.8 = 48 000 ₽.

interest_burden — сумма процентов по кредиту за год.
Формула: сумма кредита × ставка / 100.
Пример: кредит 10 000 ₽, ставка 15% → 10 000 × 15 / 100 = 1 500 ₽.

income_tier — уровень дохода (от 0 до 4).
Как: делит всех заёмщиков на 5 равных групп по доходу.
Зачем: модели проще работать с категориями, чем с числами.

credit_score_tier — уровень кредитного рейтинга (от 0 до 3).
Аналогично: разбивает заёмщиков на группы по кредитному рейтингу.

high_interest — 1, если ставка >14%, иначе 0.
Зачем: выделяет рискованные кредиты.

low_debt_ratio — 1, если долги <10% от дохода, иначе 0.
Зачем: отмечает заёмщиков с низкой долговой нагрузкой.

good_credit — 1, если кредитный рейтинг >700, иначе 0.
Зачем: выделяет надёжных заёмщиков.

risk_score — общий показатель риска (от 0 до 1).
Формула:

40% — доля долгов (debt_to_income_ratio),

30% — кредитный рейтинг (чем ниже, тем хуже),

30% — ставка по кредиту.
Чем выше — тем рискованнее заёмщик.

In [7]:
# 1. Feature Engineering
def create_features(df):
    df['income_to_loan'] = df['annual_income'] / df['loan_amount']
    df['payment_capacity'] = df['annual_income'] * (1 - df['debt_to_income_ratio'])
    df['interest_burden'] = df['loan_amount'] * df['interest_rate'] / 100
    
    df['income_tier'] = pd.cut(df['annual_income'], bins=5, labels=False)
    df['credit_score_tier'] = pd.cut(df['credit_score'], bins=4, labels=False)
    
    df['high_interest'] = (df['interest_rate'] > 14).astype(int)
    df['low_debt_ratio'] = (df['debt_to_income_ratio'] < 0.1).astype(int)
    df['good_credit'] = (df['credit_score'] > 700).astype(int)
    
    df['risk_score'] = (
        df['debt_to_income_ratio'] * 0.4 +
        (850 - df['credit_score']) / 100 * 0.3 +
        df['interest_rate'] / 20 * 0.3
    )
    return df

train = create_features(train)
test = create_features(test)

In [8]:
print("Новые столбцы в train:")
print(train.columns.tolist())


Новые столбцы в train:
['id', 'annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate', 'gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose', 'grade_subgrade', 'loan_paid_back', 'income_to_loan', 'payment_capacity', 'interest_burden', 'income_tier', 'credit_score_tier', 'high_interest', 'low_debt_ratio', 'good_credit', 'risk_score']


In [9]:
# Разделение признаков и целевой переменной
X = train.drop(['id', 'loan_paid_back'], axis=1)
y = train['loan_paid_back']

In [10]:
# Определение категориальных и числовых столбцов
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

In [11]:
# Предобработка (для XGBoost и LGBM)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_cols)
    ])

In [12]:
# -----------------------------
# 1. XGBoost: Оптимизация и предсказание
# -----------------------------

def optimize_xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5)
    }
    xgb_opt = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', xgb.XGBClassifier(**params, use_label_encoder=False, eval_metric='logloss', random_state=42))
    ])
    scores = cross_val_score(xgb_opt, X, y, cv=5, scoring='f1')
    return scores.mean()

print("Запуск оптимизации XGBoost...")
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(optimize_xgb, n_trials=10)


print("Лучшие параметры XGB:", study_xgb.best_params)

print("Лучший F1 (XGB):", study_xgb.best_value)

[I 2025-11-17 16:17:49,684] A new study created in memory with name: no-name-5a21f535-6c7d-4ac2-a601-9cf30a9e506f


Запуск оптимизации XGBoost...


[I 2025-11-17 16:18:45,943] Trial 0 finished with value: 0.9426976863426685 and parameters: {'n_estimators': 216, 'max_depth': 6, 'learning_rate': 0.13346551088789424, 'subsample': 0.9263506649109616, 'colsample_bytree': 0.8517172692335223, 'gamma': 1.2178979110914578, 'reg_alpha': 2.0805093487234094, 'reg_lambda': 2.973391358894572}. Best is trial 0 with value: 0.9426976863426685.
[I 2025-11-17 16:19:45,551] Trial 1 finished with value: 0.9423204119546709 and parameters: {'n_estimators': 391, 'max_depth': 5, 'learning_rate': 0.10286502365148718, 'subsample': 0.9812376997692933, 'colsample_bytree': 0.8986973987383988, 'gamma': 3.956989588888492, 'reg_alpha': 1.8153796977627263, 'reg_lambda': 2.8672390669819676}. Best is trial 0 with value: 0.9426976863426685.
[I 2025-11-17 16:20:31,999] Trial 2 finished with value: 0.9423338960092046 and parameters: {'n_estimators': 213, 'max_depth': 4, 'learning_rate': 0.10959516578936086, 'subsample': 0.6875831918409192, 'colsample_bytree': 0.9688265

Лучшие параметры XGB: {'n_estimators': 216, 'max_depth': 6, 'learning_rate': 0.13346551088789424, 'subsample': 0.9263506649109616, 'colsample_bytree': 0.8517172692335223, 'gamma': 1.2178979110914578, 'reg_alpha': 2.0805093487234094, 'reg_lambda': 2.973391358894572}
Лучший F1 (XGB): 0.9426976863426685


In [13]:
# Обучение финальной модели XGBoost
xgb_final = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(**study_xgb.best_params, use_label_encoder=False, eval_metric='logloss', random_state=42))
])
xgb_final.fit(X, y)

In [14]:
# Предсказание на test
pred_xgb = xgb_final.predict(test.drop('id', axis=1))

# Сохранение сабмишена
submission_xgb = pd.DataFrame({'id': test['id'], 'loan_paid_back': pred_xgb})
submission_xgb.to_csv('submission_xgb.csv', index=False)
print("Сабмишен XGBoost сохранён: submission_xgb.csv")

Сабмишен XGBoost сохранён: submission_xgb.csv


In [15]:
def optimize_lgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'num_leaves': trial.suggest_int('num_leaves', 20, 60),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5)
    }
    lgb_opt = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', lgb.LGBMClassifier(**params, random_state=42, verbose=-1))
    ])
    scores = cross_val_score(lgb_opt, X, y, cv=5, scoring='f1')
    return scores.mean()

print("\nЗапуск оптимизации LightGBM...")
study_lgb = optuna.create_study(direction='maximize')
study_lgb.optimize(optimize_lgb, n_trials=10)

print("Лучшие параметры LGB:", study_lgb.best_params)
print("Лучший F1 (LGB):", study_lgb.best_value)

[I 2025-11-17 16:28:08,676] A new study created in memory with name: no-name-e1ddccb9-a453-419d-a6bc-90675e15ce37



Запуск оптимизации LightGBM...


[I 2025-11-17 16:29:15,540] Trial 0 finished with value: 0.9423881010277773 and parameters: {'n_estimators': 372, 'max_depth': 3, 'learning_rate': 0.08773099775225433, 'subsample': 0.8770798904920323, 'colsample_bytree': 0.7628560382436181, 'num_leaves': 36, 'reg_alpha': 0.19946184256085742, 'reg_lambda': 4.761184559584259}. Best is trial 0 with value: 0.9423881010277773.
[I 2025-11-17 16:30:10,341] Trial 1 finished with value: 0.9428169646302142 and parameters: {'n_estimators': 304, 'max_depth': 3, 'learning_rate': 0.19695841604396142, 'subsample': 0.6680198814450872, 'colsample_bytree': 0.9974849833363107, 'num_leaves': 34, 'reg_alpha': 0.6609099194945961, 'reg_lambda': 2.8798626942521626}. Best is trial 1 with value: 0.9428169646302142.
[I 2025-11-17 16:31:05,539] Trial 2 finished with value: 0.9427674688596775 and parameters: {'n_estimators': 297, 'max_depth': 3, 'learning_rate': 0.1764830835893819, 'subsample': 0.9085102968215079, 'colsample_bytree': 0.7287400341022335, 'num_leave

Лучшие параметры LGB: {'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.15557292976756287, 'subsample': 0.9231470778833313, 'colsample_bytree': 0.7181747056294457, 'num_leaves': 39, 'reg_alpha': 4.092708233977384, 'reg_lambda': 0.6671498043830093}
Лучший F1 (LGB): 0.943058835882692


In [16]:
# Обучение финальной модели LightGBM
lgb_final = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', lgb.LGBMClassifier(**study_lgb.best_params, random_state=42, verbose=-1))
])
lgb_final.fit(X, y)

In [17]:
# Предсказание на test
pred_lgb = lgb_final.predict(test.drop('id', axis=1))


# Сохранение сабмишена
submission_lgb = pd.DataFrame({'id': test['id'], 'loan_paid_back': pred_lgb})
submission_lgb.to_csv('submission_lgb.csv', index=False)
print("Сабмишен LightGBM сохранён: submission_lgb.csv")

Сабмишен LightGBM сохранён: submission_lgb.csv
