Эксперименты, Пайплайн обработки данных, Построение модели
-

Baseline: чистим, кодируем, нормируем, обучаем, оцениваем.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pickle
#from category_encoders import CatBoostEncoder

df = pd.read_csv('dataset/train_ver2.csv')
df = df.drop(['tipodom'], axis=1)
df_last_month = df[df['fecha_dato'] == "2016-05-28"]
products = [x for x in df.columns if '_ult1' in x]

def baseline_data_transform(df: pd.DataFrame, keep_user_ids=False):
    
    def remove_outliers(data: pd.DataFrame, outliers_columns: list):
        num_cols = outliers_columns
        threshold = 1.5
        #potential_outliers = pd.DataFrame()

        for col in num_cols:
            Q1 = data[col].quantile(0.25)
            Q3 = data[col].quantile(0.75)
            IQR = Q3 - Q1
            margin = threshold * IQR
            lower = Q1 - margin
            upper = Q3 + margin
            #potential_outliers[col] = ~data[col].between(lower, upper)
            data = data[data[col].between(lower, upper)]
        #return potential_outliers
        return data

    # берём только последний месяц
    df = df[df['fecha_dato'] == "2016-05-28"]

    # удалить столбцы
    if keep_user_ids:
        df = df.drop(['fecha_dato', 'ult_fec_cli_1t', 'conyuemp'], axis=1)
    else:
        df = df.drop(['fecha_dato', 'ncodpers', 'ult_fec_cli_1t', 'conyuemp'], axis=1)

    # удалить строки c пропусками
    for col in df.columns.to_list():
        if col not in ['renta']:
            df = df[df[col].notna()]
    
    # заполнить пропуски в ренте нулями
    df['renta'] = df['renta'].fillna(0.0)

    # age строки перевести в int, почистить от ' NA'
    #TODO
    df['age'] = df['age'].astype(int)

    # fecha_alta перевести в int число секунд с начала эпохи
    df['fecha_alta'] = pd.to_datetime(df['fecha_alta']).astype(int) / 10**9
    
    # antiguedad строки конвертировать в int, и почистить от значений типа '     NA', -999999.
    #TODO
    df = df[~df['antiguedad'] < 0]
    df['antiguedad'] = df['antiguedad'].astype(int)

    # indrel_1mes намешаны разные типы, надо привести к одному типу 2.0 '2.0' '2' итп
    #TODO
    # решил в baseline удалить столбец
    df = df.drop(['indrel_1mes'], axis=1)

    # убираем выбросы
    df = remove_outliers(df, ['age', 'renta', 'fecha_alta'])
    
    return df

data = baseline_data_transform(df_last_month)

  df = pd.read_csv('dataset/train_ver2.csv')


In [2]:
X_tr, X_val, y_tr, y_val = train_test_split(
    data.drop(products, axis=1),
    data[products],
    #stratify=data[products],
    random_state=42
)

categ_binary_vals = ['sexo', 'ind_nuevo', 'indrel', 'indresi', 'indext', 'indfall', 'ind_actividad_cliente']
categ_vals = ['ind_empleado', 'pais_residencia', 'tiprel_1mes', 'canal_entrada', 'cod_prov', 'nomprov', 'segmento'] # indrel_1mes
numeric_vals = ['age', 'fecha_alta', 'antiguedad', 'renta']

# категориальные признаки решил закодировать через признак renta чтобы не было утечки
def encode_cat_via_renta(df: pd.DataFrame):
    global_mean = df['renta'].mean()
    alpha = 100  # Чем больше alpha, тем сильнее сглаживание к global_mean
    for cat in categ_vals:
        # Формула: (mean_in_category * n_samples + global_mean * alpha) / (n_samples + alpha)
        category_stats = df.groupby(cat)['renta'].agg(['mean', 'count'])
        encoded_values = (
            (category_stats['mean'] * category_stats['count'] + global_mean * alpha) / 
            (category_stats['count'] + alpha)
        )
        # Заменяем категориальные значения на закодированные
        df[cat] = df[cat].map(encoded_values)
    scaler = StandardScaler()
    # нормируем
    df[categ_vals] = scaler.fit_transform(df[categ_vals])
    return df

X_tr = encode_cat_via_renta(X_tr)
X_val = encode_cat_via_renta(X_val)

preprocessor = ColumnTransformer([
        ('binary', OneHotEncoder(drop='if_binary'), categ_binary_vals),
        #('cat', ?, categ_vals),
        ('num', StandardScaler(), numeric_vals),
    ],
    verbose_feature_names_out=False,
    remainder='passthrough'
)

X_tr_transformed = preprocessor.fit_transform(X_tr, y_tr)
X_val_transformed = preprocessor.transform(X_val)

In [59]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

model = MultiOutputClassifier(RandomForestClassifier(
    n_estimators=50,  # Уменьшите, если долго (начните с 50)
    verbose=1,         # Вывод лога обучения
    n_jobs=-1          # Использовать все ядра CPU
))
model.fit(X_tr_transformed, y_tr)

predictions = model.predict(X_val_transformed)
print(classification_report(y_val, predictions))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    8.5s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    5.9s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   28.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   32.5s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   10.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   23.4s
[

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        23
           1       0.00      0.00      0.00         4
           2       0.78      0.81      0.79    132622
           3       0.00      0.00      0.00        71
           4       0.42      0.12      0.18     17118
           5       0.94      0.92      0.93      1784
           6       0.51      0.26      0.34      1858
           7       0.55      0.35      0.43     22299
           8       0.51      0.16      0.24      7582
           9       0.00      0.00      0.00        70
          10       0.00      0.00      0.00       242
          11       0.55      0.27      0.37      7327
          12       0.55      0.24      0.34     17592
          13       0.34      0.03      0.05      3341
          14       0.47      0.01      0.01      1055
          15       0.31      0.01      0.02      1638
          16       0.75      0.01      0.03       454
          17       0.33    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Смотрим f1-score:
- macro avg = 0.19 - модель плохо справляется с большинством классов.
- weighted avg = 0.50 - с учетом количества в каждом классе модель справляется лучше на классах покрупнее.

In [None]:
import pickle

# Сохраняем
with open('model_baseline_RandomForestClassification.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
import pickle
# Загружаем
with open('model_baseline_RandomForestClassification.pkl', 'rb') as f:
    model = pickle.load(f)

Залогируем baseline модель

In [None]:
import mlflow
from dotenv import load_dotenv
import os

load_dotenv()
mlflow_tracking_uri = os.getenv("MLFLOW_TRACKING_URI")
mlflow.set_tracking_uri(mlflow_tracking_uri)
mlflow.set_registry_uri(mlflow_tracking_uri)

EXPERIMENT_NAME = "spain_bank"
RUN_NAME = "baseline"

metrics = {'f1-score macro avg': 0.19,
           'f1-score weighted avg': 0.50}
signature = mlflow.models.infer_signature(X_val_transformed, predictions)

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
if not experiment_id:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME) 

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    # model_info = mlflow.sklearn.log_model(
    #     sk_model=model, 
    #     artifact_path='models',
    #     await_registration_for=60,
    #     signature=signature,
    #     registered_model_name='model_spain_bank_baseline',
    #     code_paths=['experiments.ipynb'],
    #     pip_requirements='requirements.txt'
    # )
    
    # к сожалению воркер виснет при логировании модели.
    # сохраним только метрики, а модель если что останется файлом .pkl
    mlflow.log_metrics(metrics)

🏃 View run baseline at: http://127.0.0.1:5000/#/experiments/1/runs/e8019caba6c544488ecbb8c8aa68584b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


Модель вышла размером 4.7 ГБ, и одно предсказание выполняется долго.

Обучим другую полегче:

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report

model = MultiOutputClassifier(
    LogisticRegression(max_iter=1000, solver="lbfgs", n_jobs=-1),
    n_jobs=-1
)
model.fit(X_tr_transformed, y_tr)

predictions = model.predict(X_val_transformed)
print(classification_report(y_val, predictions))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        23
           1       0.00      0.00      0.00         4
           2       0.74      0.82      0.78    132622
           3       0.00      0.00      0.00        71
           4       0.00      0.00      0.00     17118
           5       0.86      0.87      0.87      1784
           6       0.29      0.00      0.01      1858
           7       0.37      0.16      0.22     22299
           8       0.00      0.00      0.00      7582
           9       0.00      0.00      0.00        70
          10       0.00      0.00      0.00       242
          11       0.59      0.23      0.33      7327
          12       0.53      0.19      0.27     17592
          13       1.00      0.00      0.00      3341
          14       0.00      0.00      0.00      1055
          15       0.00      0.00      0.00      1638
          16       0.00      0.00      0.00       454
          17       0.47    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
# Сохраняем
with open('model_baseline_LogReg.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
# Загружаем
with open('model_baseline_LogReg.pkl', 'rb') as f:
    model = pickle.load(f)

In [6]:
import mlflow
from dotenv import load_dotenv
import os

load_dotenv()
mlflow_tracking_uri = os.getenv("MLFLOW_TRACKING_URI")
mlflow.set_tracking_uri(mlflow_tracking_uri)
mlflow.set_registry_uri(mlflow_tracking_uri)

EXPERIMENT_NAME = "spain_bank"
RUN_NAME = "baseline_logreg"

metrics = {'f1-score macro avg': 0.11,
           'f1-score weighted avg': 0.41}
signature = mlflow.models.infer_signature(X_val_transformed, predictions)

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
if not experiment_id:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME) 

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    model_info = mlflow.sklearn.log_model(
        sk_model=model, 
        artifact_path='models',
        await_registration_for=60,
        signature=signature,
        registered_model_name='model_spain_bank_baseline_logreg',
        code_paths=['experiments.ipynb'],
        pip_requirements='requirements.txt'
    )

    mlflow.log_metrics(metrics)

Successfully registered model 'model_spain_bank_baseline_logreg'.
2025/07/28 12:30:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 60 seconds for model version to finish creation. Model name: model_spain_bank_baseline_logreg, version 1
Created version '1' of model 'model_spain_bank_baseline_logreg'.


🏃 View run baseline_logreg at: http://127.0.0.1:5000/#/experiments/1/runs/aa24b8c063c3471eb1c0eaeef2dde3b0
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


Качество чуть хуже, но модель гораздо легче и быстрее.

In [None]:
# в продакшене понадобится информация для предсказаний и информативного вывода
def pack_prod_obj():
    products_spanish_to_ru_dict = {
        'ind_ahor_fin_ult1':	'Сберегательный счёт',
        'ind_aval_fin_ult1':	'Банковская гарантия',
        'ind_cco_fin_ult1':	'Текущие счета',
        'ind_cder_fin_ult1':	'Деривативный счёт',
        'ind_cno_fin_ult1':	'Зарплатный проект',
        'ind_ctju_fin_ult1':	'Детский счёт',
        'ind_ctma_fin_ult1':	'Особый счёт 3',
        'ind_ctop_fin_ult1':	'Особый счёт',
        'ind_ctpp_fin_ult1':	'Особый счёт 2',
        'ind_deco_fin_ult1':	'Краткосрочный депозит',
        'ind_deme_fin_ult1':	'Среднесрочный депозит',
        'ind_dela_fin_ult1':	'Долгосрочный депозит',
        'ind_ecue_fin_ult1':	'Цифровой счёт',
        'ind_fond_fin_ult1':	'Денежный средства',
        'ind_hip_fin_ult1':	'Ипотека',
        'ind_plan_fin_ult1':	'Пенсионный план',
        'ind_pres_fin_ult1':	'Кредит',
        'ind_reca_fin_ult1':	'Налоговый счёт',
        'ind_tjcr_fin_ult1':	'Кредитная карта',
        'ind_valo_fin_ult1':	'Ценные бумаги',
        'ind_viv_fin_ult1':	'Домашний счёт',
        'ind_nomina_ult1':	'Аккаунт для выплаты зарплаты',
        'ind_nom_pens_ult1':	'Аккаунт для пенсионных обязательств',
        'ind_recibo_ult1':	'Дебетовый аккаунт',
    }
    data_for_user_ids = baseline_data_transform(df_last_month, keep_user_ids=True)
    user_id_order = data_for_user_ids['ncodpers'].to_list()

    # data.drop(products, axis=1),
    # data[products],

    X_prod = encode_cat_via_renta(data.drop(products, axis=1))

    preprocessor = ColumnTransformer([
            ('binary', OneHotEncoder(drop='if_binary'), categ_binary_vals),
            #('cat', ?, categ_vals),
            ('num', StandardScaler(), numeric_vals),
        ],
        verbose_feature_names_out=False,
        remainder='passthrough'
    )
    X_prod_transformed = preprocessor.fit_transform(X_prod, data[products])

    prod_obj = {
        'products_spanish_to_ru_dict': products_spanish_to_ru_dict,
        'user_id_order': user_id_order,
        'products': products,
        'features': X_prod_transformed,
    }
    return prod_obj

prod_obj = pack_prod_obj()

# Сохраняем
with open('prod_obj.pkl', 'wb') as f:
    pickle.dump(prod_obj, f)

Advanced
-
TODO

Времени на эксперименты почти не осталось, поэтому отправляю текущее решение на проверку.

Эксперименты:
- перестроить таргеты (см. EDA)
- переделать датасет так, чтобы признаки n-го месяца лежали с таргетами n+1-го месяца;
- сгенерировать признак - динамика поля `renta` за последние n месяцев;
- ...

In [None]:
def remove_duplicates(data):
    feature_cols = data.columns.drop('customer_id').tolist()
    is_duplicated_features = data.duplicated(subset=feature_cols, keep=False)
    data = data[~is_duplicated_features].reset_index(drop=True)
    return data

def fill_missing_values(data):

    cols_with_nans = data.isnull().sum()
    cols_with_nans = cols_with_nans[cols_with_nans > 0].index.drop('end_date')

    for col in cols_with_nans:

        if data[col].dtype in [float, int]:
            fill_value = data[col].mean()
        elif data[col].dtype == 'object':
            fill_value = data[col].mode().iloc[0]

        data[col] = data[col].fillna(fill_value)

    return data

def remove_outliers(data):
    num_cols = data.select_dtypes(['float']).columns
    threshold = 1.5
    potential_outliers = pd.DataFrame()

    for col in num_cols:
        Q1 = data[col].quantile(0.25)
        Q3 = data[col].quantile(0.75)
        IQR = Q3 - Q1
        margin = threshold * IQR
        lower = Q1 - margin
        upper = Q3 + margin
        potential_outliers[col] = ~data[col].between(lower, upper)
    return potential_outliers