# ============Необходимые библиотеки и фреймворки============

In [8]:
import json
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau

import mlflow
import mlflow.pytorch

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, f1_score, confusion_matrix

# ================0. КОНФИГУРАЦИЯ MLFLOW================

In [9]:
# Input data directory and local output directory for saved artifacts.
DATA_PATH = Path("ML test task v3")
ARTIFACTS_PATH = Path("saved_models")
ARTIFACTS_PATH.mkdir(exist_ok=True)

# MLflow setup.
# The experiment name is defined once so that the experiment being activated
# and the experiment being looked up can never drift apart (previously the
# two calls used different spellings, so runs were logged to one experiment
# while another one was reported, or the lookup returned None).
EXPERIMENT_NAME = "Customer_Class_NN_Aggregated"
mlflow.set_experiment(EXPERIMENT_NAME)
print(f"MLflow Experiment: {mlflow.get_experiment_by_name(EXPERIMENT_NAME).name}")

MLflow Experiment: Customer_Class_NN_Aggregated


# ================1. ЗАГРУЗКА И ПОДГОТОВКА ДАННЫХ================

In [10]:
print("\n--- 1. Загрузка и подготовка данных ---")

# --- Read the raw inputs: macro indicators (CSV) and contracts (Parquet) ---
macro_df = pd.read_csv(DATA_PATH / "context_df.csv")
contracts_df = pd.read_parquet(DATA_PATH / "test_task.parquet")

# --- Macro table: snake_case headers, parsed dates, numeric percentages ---
macro_df = macro_df.rename(columns=lambda name: name.lower().replace(' ', '_'))
macro_df['context_data_from'] = pd.to_datetime(macro_df['context_data_from'])
percent_cols = [
    'inflation', 'key_rate',
    'deposit_1', 'deposit_3', 'deposit_6', 'deposit_12',
    'fa_delta', 'usd_delta', 'imoex_delta', 'rgbi_delta',
]
for col in percent_cols:
    # Strip the percent sign; anything that still fails to parse becomes NaN.
    without_sign = macro_df[col].str.replace('%', '')
    macro_df[col] = pd.to_numeric(without_sign, errors='coerce')

# Fill gaps from the previous row first, then from the next row for any
# leading gaps that have no previous value.
macro_df = macro_df.ffill().bfill()

# --- Contracts table: rename the date column, force integer class codes ---
contracts_df = contracts_df.rename(columns={'Договор Дата Заключения': 'contract_date'})
contracts_df['cus_class'] = contracts_df['cus_class'].astype(int)

# --- Attach to every contract the most recent macro row dated on or before
# --- the contract date (both inputs must be sorted by their key) ---
contracts_sorted = contracts_df.sort_values('contract_date')
macro_sorted = macro_df.sort_values('context_data_from')
merged_df = pd.merge_asof(
    contracts_sorted,
    macro_sorted,
    left_on='contract_date',
    right_on='context_data_from',
    direction='backward',
)
# Contracts without a matching macro row have NaN macro columns; drop them.
merged_df = merged_df.dropna(subset=macro_df.columns)

# --- Calendar features derived from the contract date ---
contract_date_parts = merged_df['contract_date'].dt
merged_df['day_of_year'] = contract_date_parts.dayofyear
merged_df['day_of_week'] = contract_date_parts.dayofweek
merged_df['month'] = contract_date_parts.month

# --- Агрегация классов в 3 группы ---
def aggregate_cus_class(c):
    """Map a raw customer class code to its aggregated group id.

    Returns 0 for the 'Base' codes, 1 for the 'Premium' codes and 2 ('Rare')
    for every other value.
    """
    group_members = (
        (0, (1, 5, 8, 10, 4)),                                   # 'Base'
        (1, (101, 102, 103, 104, 105, 106, 107, 108, 109)),      # 'Premium'
    )
    for group_id, members in group_members:
        if c in members:
            return group_id
    # Anything not listed above falls into the 'Rare' group.
    return 2

# Collapse the raw class codes into the aggregated group ids and report
# the resulting class shares.
merged_df['cus_class_agg'] = merged_df['cus_class'].map(aggregate_cus_class)
class_shares = merged_df['cus_class_agg'].value_counts(normalize=True)
print("Распределение классов после агрегации:\n", class_shares)

# --- Final feature matrix X and target vector y ---
features = [
    'quarter',
    'inflation',
    'key_rate',
    'deposit_1',
    'deposit_3',
    'deposit_6',
    'deposit_12',
    'fa_delta',
    'usd_delta',
    'imoex_delta',
    'rgbi_delta',
    'day_of_year',
    'day_of_week',
    'month',
]
y = merged_df['cus_class_agg']
X = merged_df[features]


--- 1. Загрузка и подготовка данных ---
Распределение классов после агрегации:
 cus_class_agg
0    0.709510
1    0.270753
2    0.019737
Name: proportion, dtype: float64


# ===========2. ПОДГОТОВКА ВЫБОРКИ ДЛЯ ОБУЧЕНИЯ=============

In [11]:
print("\n--- 2. Подготовка выборок для обучения ---")

BATCH_SIZE = 64

# --- Stratified 80/20 split so both parts keep the class proportions ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# --- Feature scaling: statistics come from the training part only ---
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- PyTorch tensors: float32 features, int64 class labels ---
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# --- Datasets and loaders; only the training loader shuffles ---
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)



--- 2. Подготовка выборок для обучения ---


# =========3. ПОДГОТОВКА МОДЕЛИ================

In [12]:
class Classifier(nn.Module):
    """Small fully connected network for multi-class classification.

    Two hidden stages (64 and 32 units), each consisting of
    Linear -> BatchNorm1d -> ReLU -> Dropout(0.3), followed by a linear
    layer that outputs one raw score (logit) per class.
    """

    def __init__(self, num_features, num_classes):
        super().__init__()
        stages = []
        width_in = num_features
        for width_out in (64, 32):
            stages.extend([
                nn.Linear(width_in, width_out),
                nn.BatchNorm1d(width_out),
                nn.ReLU(),
                nn.Dropout(0.3),
            ])
            width_in = width_out
        # Output head: no activation, the logits are returned as-is.
        stages.append(nn.Linear(width_in, num_classes))
        self.network = nn.Sequential(*stages)

    def forward(self, x):
        """Return the logits produced by the layer stack for input ``x``."""
        return self.network(x)

# --- Параметры обучения ---
# --- Training hyperparameters ---
N_EPOCHS = 100
LEARNING_RATE = 0.001
NUM_FEATURES = X_train.shape[1]
class_names = ['Base', 'Premium', 'Rare']
# The number of outputs follows the fixed set of aggregated groups rather
# than the labels that happen to be present in the data: if a group were
# missing, counting unique labels would shrink the output layer and make
# larger label ids invalid for the loss.
NUM_CLASSES = len(class_names)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Class weights to counter class imbalance (inverse class frequency) ---
# minlength keeps one entry per class even when the highest class id does not
# occur in the training labels, so the weight vector always has NUM_CLASSES
# entries as required by CrossEntropyLoss.
class_counts = np.bincount(y_train, minlength=NUM_CLASSES)
counts_tensor = torch.tensor(class_counts, dtype=torch.float32)
# A class with no training samples gets weight 0 instead of an infinite one.
class_weights = torch.where(
    counts_tensor > 0,
    1. / counts_tensor,
    torch.zeros_like(counts_tensor),
)
class_weights = class_weights.to(device)

# --- Model, loss, optimizer and learning-rate scheduler ---
model = Classifier(num_features=NUM_FEATURES, num_classes=NUM_CLASSES).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Cut the learning rate by 10x once the monitored loss has not improved
# for 5 consecutive scheduler steps.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)

# =========4. ВСПОМОГАТЕЛЬНАЯ ФУНКЦИЯ ДЛЯ ОЦЕНКИ================

In [13]:
def evaluate_and_log_pytorch(model, data_loader, device, class_names):
    """Evaluate the model, print metrics and log them to the active MLflow run.

    Prints a classification report, logs the macro-averaged F1 score as
    ``final_test_f1_macro`` and saves/logs a confusion-matrix heatmap as
    ``confusion_matrix.png`` (written relative to the current working
    directory).

    Args:
        model: Network to evaluate; it is switched to eval mode and left there.
        data_loader: Iterable yielding ``(inputs, labels)`` batches.
        device: Device the input batches are moved to before the forward pass.
        class_names: Display names of the classes; the name at position ``i``
            is used for integer class id ``i``.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for inputs, labels in data_loader:
            outputs = model(inputs.to(device))
            _, predicted = torch.max(outputs, 1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Pin the label set explicitly: without it, a class that appears neither
    # in the labels nor in the predictions makes classification_report fail
    # (name count mismatch) and shifts the confusion-matrix rows/columns
    # relative to the tick labels.
    label_ids = list(range(len(class_names)))

    print("\n--- Финальная оценка модели ---")
    print("\nClassification Report:")
    print(classification_report(
        all_labels,
        all_preds,
        labels=label_ids,
        target_names=class_names,
        zero_division=0,
    ))

    # zero_division=0 matches the report above and avoids undefined-metric
    # warnings for classes that were never predicted.
    f1_macro = f1_score(all_labels, all_preds, average='macro', zero_division=0)
    mlflow.log_metric("final_test_f1_macro", f1_macro)

    # --- Confusion matrix ---
    cm = confusion_matrix(all_labels, all_preds, labels=label_ids)
    cm_path = "confusion_matrix.png"
    fig = plt.figure(figsize=(8, 6))
    try:
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.savefig(cm_path)
        mlflow.log_artifact(cm_path)
    finally:
        # Release the figure even if saving or logging fails.
        plt.close(fig)
    print(f"Матрица ошибок сохранена и залогирована в MLflow как '{cm_path}'.")

# =========5. ЗАПУСК ОБУЧЕНИЯ================

In [14]:
print(f"\n--- 5. Запуск обучения на устройстве: {device} ---")

with mlflow.start_run() as run:
    hyperparams = {
        "epochs": N_EPOCHS,
        "learning_rate": LEARNING_RATE,
        "batch_size": BATCH_SIZE,
    }
    mlflow.log_params(hyperparams)

    for epoch in range(N_EPOCHS):
        # --- Training pass over all mini-batches ---
        model.train()
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()

        # --- Validation pass ---
        # NOTE: the loss that drives the scheduler is computed on test_loader,
        # the same data used for the final evaluation below.
        model.eval()
        with torch.no_grad():
            batch_losses = [
                criterion(model(val_inputs.to(device)), val_labels.to(device)).item()
                for val_inputs, val_labels in test_loader
            ]

        # Mean of the per-batch losses; the scheduler lowers the LR on plateau.
        avg_val_loss = sum(batch_losses) / len(test_loader)
        scheduler.step(avg_val_loss)

        # Print every 10th epoch; log the metric every epoch.
        if (epoch + 1) % 10 == 0:
            print(f"Эпоха [{epoch+1}/{N_EPOCHS}], Validation Loss: {avg_val_loss:.4f}")
        mlflow.log_metric("validation_loss", avg_val_loss, step=epoch)

    # --- Final evaluation and artifact persistence ---
    evaluate_and_log_pytorch(model, test_loader, device, class_names)

    scaler_path = ARTIFACTS_PATH / "scaler.pkl"
    mapping_path = ARTIFACTS_PATH / "class_mapping.json"

    with open(scaler_path, "wb") as f:
        pickle.dump(scaler, f)

    # Class id -> display name, stored next to the scaler.
    class_mapping = dict(enumerate(class_names))
    with open(mapping_path, "w") as f:
        json.dump(class_mapping, f)

    mlflow.pytorch.log_model(model, "model_pytorch")
    mlflow.log_artifact(str(scaler_path))
    mlflow.log_artifact(str(mapping_path))

    print("\nОбучение завершено. Модель и артефакты сохранены в MLflow.")


--- 5. Запуск обучения на устройстве: cpu ---
Эпоха [10/100], Validation Loss: 0.9451
Эпоха [20/100], Validation Loss: 0.9491
Эпоха [30/100], Validation Loss: 0.9462
Эпоха [40/100], Validation Loss: 0.9493
Эпоха [50/100], Validation Loss: 0.9471
Эпоха [60/100], Validation Loss: 0.9520
Эпоха [70/100], Validation Loss: 0.9449
Эпоха [80/100], Validation Loss: 0.9510
Эпоха [90/100], Validation Loss: 0.9524
Эпоха [100/100], Validation Loss: 0.9532

--- Финальная оценка модели ---

Classification Report:
              precision    recall  f1-score   support

        Base       0.84      0.75      0.79      2934
     Premium       0.57      0.49      0.53      1119
        Rare       0.07      0.49      0.12        82

    accuracy                           0.67      4135
   macro avg       0.49      0.57      0.48      4135
weighted avg       0.75      0.67      0.70      4135

Матрица ошибок сохранена и залогирована в MLflow как 'confusion_matrix.png'.





Обучение завершено. Модель и артефакты сохранены в MLflow.
