# Установка зависимостей

Версия Python: 3.11.5

In [None]:
%pip install pandas==2.3.0

In [None]:
%pip install numpy==1.26.2

In [None]:
%pip install scikit-learn==1.7.0

In [None]:
%pip install matplotlib==3.10.3

In [None]:
%pip install seaborn==0.13.2

In [None]:
%pip install catboost==1.2.8

Collecting catboost==1.2.8
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
%pip install xgboost==3.0.2

# Обработка данных

Импорт зависимостей и соединение данных main_df и mcc_operations_df

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor, Pool, cv
import xgboost as xgb
from scipy.stats import randint, uniform

# Load the target table and the train/test parts of the main and MCC-operations tables.
target = pd.read_csv('train_target.csv')
train_main = pd.read_parquet('train_main_df.parquet')
test_main = pd.read_parquet('test_main_df.parquet')
test_mcc = pd.read_parquet('test_mcc_operations_df.parquet')
train_mcc = pd.read_parquet('train_mcc_operations_df.parquet')

# Columns kept from the MCC-operations table (chosen by the authors as the most important ones).
mcc_features = [
    'sum_most_popular_mcc_5699_12m',
    'sum_most_popular_mcc_5699_6m',
    'sum_most_popular_mcc_6012_3m',
    'sum_most_popular_mcc_6012_6m',
    'sum_most_popular_mcc_5814_12m',
    'sum_most_popular_mcc_5422_3m',
    'cnt_most_popular_mcc_9222_6m',
    'sum_most_popular_mcc_4814_6m',
    'cnt_most_popular_mcc_9222_12m',
    'sum_most_popular_mcc_5993_6m',
    'cnt_most_popular_mcc_5422_3m',
    'cnt_most_popular_mcc_5422_6m',
    'sum_most_popular_mcc_5993_3m',
    'cnt_most_popular_mcc_5814_1m',
    'sum_most_popular_mcc_6011_3m',
]

# Left join on the row index: every row of the main table is kept and the
# selected MCC columns are attached where the index matches.
# NOTE(review): assumes both tables share the same index — confirm against the data.
index_left_join = dict(left_index=True, right_index=True, how='left')
train = train_main.merge(train_mcc[mcc_features], **index_left_join)
test = test_main.merge(test_mcc[mcc_features], **index_left_join)

## Предварительный анализ признаков

Вывод дисперсии признаков для их последующего отбора

In [None]:
# Variance of every numeric column of the training frame.
variances = train.select_dtypes(include=['number']).var()

# Split off the two tails that are reported below.
high_variance_features = variances.loc[variances.gt(0.95)]   # variance above 0.95
low_variance_features = variances.loc[variances.lt(0.01)]    # variance below 0.01

# Report both groups.
print("Признаки с дисперсией больше 0.95:", high_variance_features, sep="\n")
print("\nПризнаки с дисперсией меньше 0.01:", low_variance_features, sep="\n")

# We wanted to drop some of the features with variance above 0.95, but they
# affected the result, so there was no time left to settle this.

Признаки с дисперсией больше 0.95:
app_family_cnt                                  1.203671e+00
app_income_app                                  2.351563e+11
avg_dep_avg_balance_12month_amt                 1.106580e+14
avg_dep_avg_balance_12month_amt_term            3.257982e+14
avg_dep_avg_balance_12month_amt_term_savings    1.222734e+14
                                                    ...     
cnt_most_popular_mcc_5422_3m                    1.661279e+00
cnt_most_popular_mcc_5422_6m                    4.870918e+00
sum_most_popular_mcc_5993_3m                    2.505329e+07
cnt_most_popular_mcc_5814_1m                    9.671481e+01
sum_most_popular_mcc_6011_3m                    3.390777e+11
Length: 259, dtype: float64

Признаки с дисперсией меньше 0.01:
app_real_estate_ind     0.005932
cnt_account_5y          0.005297
used_car_flg            0.003835
zp_flag_2month          0.000000
zp_flag_3month          0.000000
zp_flag_6month          0.000000
zp_flag_9month          0.000000

Построение матрицы корреляций для отбора признаков

In [None]:
# Absolute pairwise correlations between all numeric columns of the training frame.
numeric_train = train.select_dtypes(include=['number'])
corr_matrix = numeric_train.corr().abs()

In [None]:
threshold = 0.8

# Find feature pairs whose absolute correlation exceeds the threshold.
# Only the strict upper triangle is scanned, so each unordered pair is reported
# once (the matrix is symmetric) and the diagonal is skipped. The comparison is
# done on the whole array at once instead of n*n scalar .loc lookups.
# NaN correlations compare as False and are therefore never reported.
corr_values = corr_matrix.to_numpy()
row_idx, col_idx = np.where(np.triu(corr_values > threshold, k=1))
column_names = corr_matrix.columns
high_corr_pairs = [(column_names[r], column_names[c]) for r, c in zip(row_idx, col_idx)]

# Print the result; the message shows the threshold actually used.
print(f"Признаки с высокой корреляцией (> {threshold}):")
for pair in high_corr_pairs:
    print(pair)


Признаки с высокой корреляцией (> 0.8):
('avg_dep_avg_balance_12month_amt', 'avg_dep_avg_balance_12month_amt_term')
('avg_dep_avg_balance_12month_amt', 'avg_dep_avg_balance_12month_amt_term_savings')
('avg_dep_avg_balance_12month_amt', 'avg_dep_avg_balance_1month_amt')
('avg_dep_avg_balance_12month_amt', 'avg_dep_avg_balance_1month_amt_term')
('avg_dep_avg_balance_12month_amt', 'avg_dep_avg_balance_1month_amt_term_savings')
('avg_dep_avg_balance_12month_amt', 'avg_dep_avg_balance_3month_amt')
('avg_dep_avg_balance_12month_amt', 'avg_dep_avg_balance_3month_amt_term')
('avg_dep_avg_balance_12month_amt', 'avg_dep_avg_balance_3month_amt_term_savings')
('avg_dep_avg_balance_12month_amt', 'avg_dep_avg_balance_6month_amt')
('avg_dep_avg_balance_12month_amt', 'avg_dep_avg_balance_6month_amt_term')
('avg_dep_avg_balance_12month_amt', 'avg_dep_avg_balance_6month_amt_term_savings')
('avg_dep_avg_balance_12month_amt', 'avg_dep_avg_balance_fact_12month_amt')
('avg_dep_avg_balance_12month_amt', 'avg

## Работа над признаками

Обработка main_df

In [None]:

# Columns to one-hot encode.
categorical_features = ['salary_flg', 'gender_nm', 'current_work_experience_nm', 'savings_service_model_cd']

# drop='first' removes one level per feature to avoid the dummy-variable trap.
# handle_unknown='ignore' encodes categories that occur only in test as all
# zeros instead of raising, now that the encoder is no longer re-fitted on test.
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')

# Learn the category set on train only and encode train.
encoded_features = encoder.fit_transform(train[categorical_features])

# Names of the generated indicator columns.
feature_names = encoder.get_feature_names_out(categorical_features)

# Reuse train's index so that the column-wise concat aligns row by row even
# when train does not carry a default RangeIndex.
encoded_df = pd.DataFrame(encoded_features, columns=feature_names, index=train.index)
train = pd.concat([train, encoded_df], axis=1)

# Test is encoded with the encoder fitted on train (transform, not fit_transform),
# so each indicator column means the same category in both splits.
encoded_test_features = encoder.transform(test[categorical_features])
encoded_test_df = pd.DataFrame(encoded_test_features, columns=feature_names, index=test.index)
test = pd.concat([test, encoded_test_df], axis=1)

Функция для создания синтетических признаков

In [None]:
def create_new_features(df : pd.DataFrame) -> pd.DataFrame :
    """Add derived balance and savings columns to ``df`` and return it.

    The frame is modified in place; the returned object is the same frame.
    New columns, in the order they are appended:

    * ``avg_balance_3m`` / ``avg_balance_6m`` / ``avg_balance_12m`` — row-wise
      mean of two neighbouring average-balance windows (1m+3m, 3m+6m, 6m+12m).
    * ``balance_change_1m`` / ``_3m`` / ``_6m`` — shorter window minus the next
      longer one (1m-3m, 3m-6m, 6m-12m).
    * ``savings_trend_3m`` / ``_6m`` — ``savings_sum_dep_now`` minus the 3m / 6m value.
    * ``large_deposit_flag`` — 1 where ``max_max_dep_income_amt`` exceeds the
      12-month average balance, otherwise 0.
    """
    bal_1m = 'avg_dep_avg_balance_1month_amt'
    bal_3m = 'avg_dep_avg_balance_3month_amt'
    bal_6m = 'avg_dep_avg_balance_6month_amt'
    bal_12m = 'avg_dep_avg_balance_12month_amt'

    # (new column, shorter window, longer window)
    window_pairs = (
        ('3m', '1m', bal_1m, bal_3m),
        ('6m', '3m', bal_3m, bal_6m),
        ('12m', '6m', bal_6m, bal_12m),
    )

    # Mean of two neighbouring windows.
    for mean_suffix, _, shorter, longer in window_pairs:
        if mean_suffix == '3m':
            df['avg_balance_3m'] = df[[shorter, longer]].mean(axis=1)
        elif mean_suffix == '6m':
            df['avg_balance_6m'] = df[[shorter, longer]].mean(axis=1)
        else:
            df['avg_balance_12m'] = df[[shorter, longer]].mean(axis=1)

    # Difference between a window and the next longer one.
    for _, change_suffix, shorter, longer in window_pairs:
        if change_suffix == '1m':
            df['balance_change_1m'] = df[shorter] - df[longer]
        elif change_suffix == '3m':
            df['balance_change_3m'] = df[shorter] - df[longer]
        else:
            df['balance_change_6m'] = df[shorter] - df[longer]

    # Savings trend relative to the current value.
    savings_now = 'savings_sum_dep_now'
    df['savings_trend_3m'] = df[savings_now] - df['savings_sum_dep_3m']
    df['savings_trend_6m'] = df[savings_now] - df['savings_sum_dep_6m']

    # Flag rows whose largest deposit income exceeds the 12-month average balance.
    exceeds_average = df['max_max_dep_income_amt'] > df[bal_12m]
    df['large_deposit_flag'] = np.where(exceeds_average, 1, 0)

    return df
# Apply the same feature engineering to both splits (train first, then test).
train, test = [create_new_features(frame) for frame in (train, test)]

Отбор числовых признаков, включая синтетические

In [None]:
# Names of all numeric columns of train at this point, as a plain list.
num_features = train.select_dtypes(include=['number']).columns.tolist()

Обработка тренировочной и тестовой выборки по квантилям

In [None]:
# Clip every numeric feature to the [1st, 99th] percentile range of the
# training data. The bounds are estimated on train only and then applied to
# test as well, so both splits share the same value range at inference time.
for col in num_features:
    q1 = train[col].quantile(0.01)
    q99 = train[col].quantile(0.99)
    train[col] = train[col].clip(q1, q99)
    # 'id' is copied from test into the submission file, so it must stay intact there.
    if col != 'id' and col in test.columns:
        test[col] = test[col].clip(q1, q99)

# Первоначальные модели

In [None]:
# Feature matrix and log1p-transformed target for the training data.
y_train = target['target'].pipe(np.log1p)
X_train = train.loc[:, num_features]

In [None]:
# Split into training and validation parts (30% validation).
# shuffle=False keeps the original row order in both parts.
all_features = train[num_features]
log_target = np.log1p(target['target'])
X_train, X_valid, y_train, y_valid = train_test_split(
    all_features,
    log_target,
    test_size=0.3,
    random_state=42,
    shuffle=False,
)

Линейная регрессия, Лассо регрессия, регрессионное дерево

In [None]:
# All models are fitted on the log1p-transformed target, so y_valid is in log
# space. mean_squared_log_error applies log1p itself and therefore needs BOTH
# arguments on the original scale; the validation target is converted back once.
y_valid_original = np.expm1(y_valid)

# Linear regression (NaNs replaced by 0 because the model cannot handle them).
model =  LinearRegression()
model.fit(X_train.fillna(0), y_train)
test_predict = model.predict(X_valid.fillna(0))
y_pred = np.expm1(test_predict)
y_pred = np.clip(y_pred, 0, None)  # MSLE is undefined for negative predictions
print("RMSLE:", np.sqrt(mean_squared_log_error(y_valid_original, y_pred)))

# Lasso regression.
lasso = Lasso(alpha=0.001, fit_intercept=True,  max_iter=500,  warm_start=True, random_state=None, selection='random')
lasso.fit(X_train.fillna(0), y_train)
test_predict = lasso.predict(X_valid.fillna(0))
y_pred = np.expm1(test_predict)
y_pred = np.clip(y_pred, 0, None)
print("RMSLE:", np.sqrt(mean_squared_log_error(y_valid_original, y_pred)))

# Decision tree; same back-transform and clipping as for the linear models.
tree = DecisionTreeRegressor(max_depth=20, min_samples_leaf=20, min_samples_split=300)
tree.fit(X_train, y_train)
predictions = tree.predict(X_valid)
y_pred = np.clip(np.expm1(predictions), 0, None)
rmsle = np.sqrt(mean_squared_log_error(y_valid_original, y_pred))
print(f'RMSLE: {rmsle}')

RMSLE: 5.185621076234474


  model = cd_fast.enet_coordinate_descent(


RMSLE: 5.270247554063773
RMSLE: 6.083623496112257


Подбор параметров для XGB

In [None]:
# Exhaustive grid for XGBoost: 8 hyperparameters with 3 values each,
# i.e. 3**8 = 6561 combinations, each evaluated with 3-fold CV.
param_grid = dict(
    max_depth=[5, 7, 9],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 120, 200],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.7, 0.8, 0.9],
    gamma=[0, 0.1, 0.2],
    reg_alpha=[0, 0.5, 1],
    reg_lambda=[0.5, 1, 1.5],
)

# Base estimator whose parameters are varied by the search.
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# The target is already log1p-transformed, so RMSE is measured in log space.
search_options = dict(
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, **search_options)

grid_search.fit(X_train, y_train)

print("Лучшие параметры:", grid_search.best_params_)

Подбор параметров для CatBoost

In [None]:
# Sampling distributions for the randomized CatBoost search.
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_dist = {
    'iterations': randint(400, 1000),
    'depth': randint(4, 16),
    'learning_rate': uniform(0.0001, 0.3),
    'random_strength': uniform(0.1, 10),
    'bagging_temperature': uniform(0, 0.5),
    'border_count': randint(10, 100),
    'loss_function': ['RMSE'],
}
model = CatBoostRegressor(verbose=0)

# y_train is already log1p-transformed, so RMSE on it is the RMSLE of the
# original target. Scoring with 'neg_mean_squared_log_error' would apply the
# log a second time and fail on negative log-space predictions; RMSE also
# matches the scoring used in the XGBoost grid search.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=5,
    cv=3,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    n_jobs=-1,
    refit=True
)

random_search.fit(X_train, y_train)

# Hand-written CatBoost configuration; it is only constructed here, not fitted.
cat_boost = CatBoostRegressor(
    bagging_temperature = np.float64(0.020584494295802447),
    border_count = 33,
    l2_leaf_reg = 4,
    random_strength = np.float64(10.022115592912174),
    random_state=42,
    iterations=500,
    learning_rate= np.float64(0.010233629752304298),
    depth=11,
    loss_function='RMSE',
    subsample=0.7
    )

print("Лучшие параметры:", random_search.best_params_)

# The result did not improve, but the model took longer to train.

XGBoost с лучшими параметрами

In [None]:
# X_train / y_train are NOT rebuilt from the full training set here: the model
# is scored on X_valid below, and fitting on all rows would include the
# validation rows in training (leakage). The training part of the
# train/validation split is used instead.
modelxgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=120,
    max_depth=7,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,
    reg_lambda=1.0,
    tree_method='hist',
    n_jobs=-1,
    random_state=42  # row/column subsampling is random; fix the seed for reproducibility
)
modelxgb.fit(X_train, y_train)
predictions = modelxgb.predict(X_valid[num_features])

# Back to the original scale; MSLE is undefined for negative predictions.
y_pred = np.clip(np.expm1(predictions), 0, None)

# y_valid is in log1p space, while mean_squared_log_error expects both
# arguments on the original scale, so the target is converted back too.
rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_valid), y_pred))
print(f'RMSLE: {rmsle}')

RMSLE: 5.943491292509077


Прототип Stacking Regressor'а

In [None]:
from sklearn.ensemble import StackingRegressor

# Base learners whose predictions are combined by a linear meta-model.
estimators = [
    ('xgb', xgb.XGBRegressor(max_depth=5, learning_rate=0.01)),
    ('cat', CatBoostRegressor(verbose=False))
]

# NOTE(review): X_train / y_train must be the training part of the split, not
# the full training set, otherwise the X_valid rows leak into fitting —
# verify what the preceding cells left in these variables.
model = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())
model.fit(X_train.fillna(0), y_train)
predictions = model.predict(X_valid.fillna(0))

# Back to the original scale; MSLE is undefined for negative predictions.
y_pred = np.clip(np.expm1(predictions), 0, None)

# y_valid is in log1p space, while mean_squared_log_error expects both
# arguments on the original scale, so the target is converted back too.
rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_valid), y_pred))
print(f'RMSLE: {rmsle}')

RMSLE: 6.034147511205053


## Анализ данных после обучения моделей

Оценка важности признаков для модели xgb

In [None]:
# Pair every feature with its XGBoost importance, most important first.
sorted_features = sorted(
    zip(X_train.columns, modelxgb.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print only the names whose importance is BELOW 0.001, formatted so that
# they can be pasted into a Python list.
low_importance_names = [name for name, score in sorted_features if score < 0.001]
for name in low_importance_names:
    print(f'"{name}", ')

# Features with importance < 0.001 are the ones to be dropped.

"sum_dep_income_12month_amt", 
"savings_avg_bro_1m", 
"cnt_save_5y", 
"savings_sum_bro_6m", 
"dep_avg_term_active", 
"current_work_experience_nm_12м+", 
"savings_sum_dep_3m", 
"max_term", 
"zp_inputs_all_24month", 
"savings_sum_dep_debet_3m", 
"savings_sum_dep_debet_9m", 
"sum_acc_now", 
"max_amt_dep_6m", 
"savings_sum_oms_2m", 
"savings_trend_3m", 
"max_dep_avg_balance_12month_amt_term_savings", 
"savings_sum_bro_12m", 
"cnt_most_popular_mcc_9222_6m", 
"max_amt_foreign_cur_5y", 
"max_max_dep_balance_amt", 
"balance_change_6m", 
"avg_dep_avg_balance_fact_6month_amt_term", 
"min_amt_term_g1y", 
"salary_flg_SZP", 
"avg_dep_avg_balance_3month_amt_term", 
"savings_sum_bro_debet_9m", 
"avg_sum_cls2op", 
"avg_balance_6m", 
"savings_sum_dep_debet_12m", 
"avg_dep_avg_balance_fact_3month_amt_term", 
"min_max_dep_balance_amt_term", 
"savings_sum_dep_9m", 
"avg_dep_avg_balance_fact_12month_amt_term", 
"cnt_most_popular_mcc_9222_12m", 
"avg_dep_avg_balance_12month_amt_term_savings", 
"cnt_prolong_

Обработка данных после анализа

In [None]:
# Features to be excluded from the dataset (chosen via feature importance).
notcool_features = [
    'income_verified',
    'zp_payments_1month',
    'zp_flag_12month',
    'savings_safe_acc_flg',
    'zp_flag_18month',
    'zp_flag_24month',
    'zp_flag_3month',
    'zp_flag_6month',
    'zp_flag_9month',
    'cnt_manage_5y',
    'app_real_estate_ind',
    'cnt_account_5y',
    'app_vehicle_ind',
    'vehicle_counrty_type_nm',
    'max_amt_foreign_cur_5y',
    'max_amt_dep_6m',
    'min_amt_term_g1y',
    "zp_first_month", "zp_mean_12month", "zp_mean_24month", "sum_amount_zp_12m"
]

numeric_columns = train.select_dtypes(include=['number']).columns

# Index.drop raises KeyError for labels that are absent. A listed name that is
# not a numeric column of train (e.g. a non-numeric or misspelled one) is
# reported and skipped instead of aborting the cell.
missing_features = [name for name in notcool_features if name not in numeric_columns]
if missing_features:
    print("Нет среди числовых столбцов train (пропущены):", missing_features)

num_features = list(numeric_columns.drop(notcool_features, errors='ignore'))


# Финальная модель

Обучение модели StackingRegressor

In [None]:
# Model import
from sklearn.ensemble import StackingRegressor

# Training data: the selected numeric features and the log1p-transformed target
# (the full training set, no validation hold-out).
X_train = train[num_features]
y_train = np.log1p(target['target'])

# Base learners: the tuned XGBoost configuration and a default CatBoost model.
xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=120,
    max_depth=7,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,
    reg_lambda=1.0,
    tree_method='hist',
    n_jobs=-1,
)
estimators = [
    ('xgb', xgb.XGBRegressor(**xgb_params)),
    ('cat', CatBoostRegressor(verbose=False)),
]

# A linear regression combines the base learners' predictions.
model = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())

# Fit the stacked model
model.fit(X_train, y_train)

# Predict on the test set (still in log1p space)
test_predict = model.predict(test[num_features])

# Convert back to the original scale and forbid negative values
y_pred = np.clip(np.expm1(test_predict), 0, None)

# Save the submission as CSV
submission = pd.DataFrame({'id': test['id'], 'target': y_pred})
submission.to_csv('submission.csv', index=False)