In [1]:
import joblib
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier
import catboost as cb
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve
from sklearn.metrics import roc_auc_score, log_loss, roc_curve

import os

Данные

In [67]:
train_mortgage = pd.read_csv('/Users/veronika/customers-propensity/Data/splited-data/train_Mortgage.csv')
val_mortgage = pd.read_csv('/Users/veronika/customers-propensity/Data/splited-data/val_Mortgage.csv')

train_pension = pd.read_csv('/Users/veronika/customers-propensity/Data/splited-data/train_Pension.csv')
val_pension = pd.read_csv('/Users/veronika/customers-propensity/Data/splited-data/val_Pension.csv')

train_savings = pd.read_csv('/Users/veronika/customers-propensity/Data/splited-data/train_Savings.csv')
val_savings = pd.read_csv('/Users/veronika/customers-propensity/Data/splited-data/val_Savings.csv')

train_targets = ['Mortgage', 'Pension', 'Savings']
train_data = [train_mortgage, train_pension, train_savings]
val_data = [val_mortgage, val_pension, val_savings]

In [4]:
def save_model_and_calibrator(model, calibrator, name):
    joblib.dump(model, f'{name}_model.pkl')
    joblib.dump(calibrator, f'{name}_platt.pkl')

def train_and_calibrate(X_train, y_train, X_calib, y_calib, model_type):
    if model_type == 'LightGBM':
        model = LGBMClassifier(
            n_estimators=1500, max_depth=10, learning_rate=0.005, reg_lambda=0.01
        )
    elif model_type == 'ExtraTrees':
        model = ExtraTreesClassifier(
            n_estimators=200, max_depth=10, min_samples_split=2, min_samples_leaf=5
        )
    elif model_type == 'CatBoost':
        model = cb.CatBoostClassifier(
            iterations=1000, depth=9, learning_rate=0.005, l2_leaf_reg=0.01, verbose=0
        )
    else:
        raise ValueError(f'Неизвестный тип модели: {model_type}')

    model.fit(X_train, y_train)

    calibrator = CalibratedClassifierCV(estimator=model, method='sigmoid', cv='prefit')
    calibrator.fit(X_calib, y_calib)

    return model, calibrator

def run_final_pipeline(train_data, val_data):
    targets = ['Mortgage', 'Pension', 'Savings']
    model_choices = {
        'Mortgage': 'LightGBM',
        'Pension': 'ExtraTrees',
        'Savings': 'ExtraTrees'
    }

    for train_df, val_df, target in zip(train_data, val_data, targets):
        print(f'====== {target} ======')

        X_train = train_df.drop(columns=[target])
        y_train = train_df[target]
        X_val = val_df.drop(columns=[target])
        y_val = val_df[target]

        X_calib, X_test, y_calib, y_test = train_test_split(
            X_val, y_val, test_size=0.5, random_state=42, stratify=y_val
        )

        model_type = model_choices[target]

        model, calibrator = train_and_calibrate(X_train, y_train, X_calib, y_calib, model_type)

        save_model_and_calibrator(model, calibrator, target.lower())

#### Для отчета по калибровке

In [68]:
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt

In [69]:
model_mortgage = joblib.load('mortgage_model.pkl')
calibrator_mortgage = joblib.load('mortgage_platt.pkl')

model_pension = joblib.load('pension_model.pkl')
calibrator_pension = joblib.load('pension_platt.pkl')

model_savings = joblib.load('savings_model.pkl')
calibrator_savings = joblib.load('savings_platt.pkl')

Калибровочные кливые до/после

In [19]:
def save_calibration_plots(models, calibrators, val_data, product_names, output_dir='calibration_plots'):
    os.makedirs(output_dir, exist_ok=True)
    
    for model, calibrator, val_df, product in zip(models, calibrators, val_data, product_names):
        X_val = val_df.drop(columns=[product])
        y_val = val_df[product]
        
        X_calib, X_test, y_calib, y_test = train_test_split(
            X_val, y_val, test_size=0.5, random_state=42, stratify=y_val
        )
        
        proba_uncalibrated = model.predict_proba(X_test)[:, 1]
        proba_calibrated = calibrator.predict_proba(X_test)[:, 1]
        
        frac_pos_uncal, mean_pred_uncal = calibration_curve(y_test, proba_uncalibrated, n_bins=10, strategy='quantile')
        frac_pos_cal, mean_pred_cal = calibration_curve(y_test, proba_calibrated, n_bins=10, strategy='quantile')
        
        auc_uncal = roc_auc_score(y_test, proba_uncalibrated)
        auc_cal = roc_auc_score(y_test, proba_calibrated)
        logloss_uncal = log_loss(y_test, proba_uncalibrated)
        logloss_cal = log_loss(y_test, proba_calibrated)
        
        plt.figure(figsize=(8, 6))
        plt.plot(mean_pred_uncal, frac_pos_uncal, 'b-s', 
                label=f'До калибровки (AUC={auc_uncal:.3f}, LogLoss={logloss_uncal:.3f})')
        plt.plot(mean_pred_cal, frac_pos_cal, 'r-o', 
                label=f'После калибровки (AUC={auc_cal:.3f}, LogLoss={logloss_cal:.3f})')
        plt.plot([0, 1], [0, 1], 'k--', label='Идеальная калибровка')
        
        plt.title(f'Калибровка для {product}')
        plt.xlabel('Среднее предсказание вероятности')
        plt.ylabel('Доля положительных классов')
        plt.legend(loc='upper left')
        plt.grid(True)
        
        plot_path = os.path.join(output_dir, f'calibration_{product.lower()}.png')
        plt.savefig(plot_path, dpi=300, bbox_inches='tight')
        plt.close()
        
        print(f'График сохранен: {plot_path}')


In [20]:
save_calibration_plots(
    models=[model_mortgage, model_pension, model_savings],
    calibrators=[calibrator_mortgage, calibrator_pension, calibrator_savings],
    val_data=val_data,
    product_names=['Mortgage', 'Pension', 'Savings'],
    output_dir='calibration_plots'
)

График сохранен: calibration_plots/calibration_mortgage.png
График сохранен: calibration_plots/calibration_pension.png
График сохранен: calibration_plots/calibration_savings.png


Метрики до/после

In [36]:
def save_metrics_plots(models, calibrators, val_data, product_names, output_dir='metrics_plots'):
    os.makedirs(output_dir, exist_ok=True)
    
    for model, calibrator, val_df, product in zip(models, calibrators, val_data, product_names):
        X_val = val_df.drop(columns=[product])
        y_val = val_df[product]
        
        X_calib, X_test, y_calib, y_test = train_test_split(
            X_val, y_val, test_size=0.5, random_state=42, stratify=y_val
        )
        
        proba_uncalibrated = model.predict_proba(X_test)[:, 1]
        proba_calibrated = calibrator.predict_proba(X_test)[:, 1]
        
        auc_uncal = roc_auc_score(y_test, proba_uncalibrated)
        auc_cal = roc_auc_score(y_test, proba_calibrated)
        logloss_uncal = log_loss(y_test, proba_uncalibrated)
        logloss_cal = log_loss(y_test, proba_calibrated)
        
        fpr_uncal, tpr_uncal, _ = roc_curve(y_test, proba_uncalibrated)
        fpr_cal, tpr_cal, _ = roc_curve(y_test, proba_calibrated)
        
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
        
        ax1.plot(fpr_uncal, tpr_uncal, 'b-', label=f'До калибровки (AUC={auc_uncal:.3f})', lw=2)
        ax1.plot(fpr_cal, tpr_cal, 'r-', label=f'После калибровки (AUC={auc_cal:.3f})')
        ax1.plot([0, 1], [0, 1], 'k--')
        ax1.set_xlabel('False Positive Rate')
        ax1.set_ylabel('True Positive Rate')
        ax1.set_title(f'ROC-кривая ({product})')
        ax1.legend(loc='lower right')
        ax1.grid(True)
        
        metrics = ['До калибровки', 'После калибровки']
        logloss_values = [logloss_uncal, logloss_cal]
        colors = ['blue', 'red']
        
        bars = ax2.bar(metrics, logloss_values, color=colors, zorder=2)
        ax2.set_title(f'LogLoss ({product})')
        ax2.set_ylabel('LogLoss')
        ax2.grid(True, zorder=1)
        
        for bar in bars:
            height = bar.get_height()
            ax2.text(bar.get_x() + bar.get_width()/2., height,
                    f'{height:.3f}',
                    ha='center', va='bottom')
        
        plt.suptitle(f'Сравнение метрик для {product}', y=1.02)
        plt.tight_layout()
        
        plot_path = os.path.join(output_dir, f'metrics_comparison_{product.lower()}.png')
        plt.savefig(plot_path, dpi=300, bbox_inches='tight')
        plt.close()
        
        print(f'График сохранен: {plot_path}')

In [37]:
save_metrics_plots(
    models=[model_mortgage, model_pension, model_savings],
    calibrators=[calibrator_mortgage, calibrator_pension, calibrator_savings],
    val_data=val_data,
    product_names=['Mortgage', 'Pension', 'Savings'],
    output_dir='metrics_comparison_plots'
)

График сохранен: metrics_comparison_plots/metrics_comparison_mortgage.png
График сохранен: metrics_comparison_plots/metrics_comparison_pension.png
График сохранен: metrics_comparison_plots/metrics_comparison_savings.png


Создаем pdf

In [2]:
import os
from fpdf import FPDF
from PIL import Image
import matplotlib.image as mpimg



In [27]:
class PDF(FPDF):
    def __init__(self):
        super().__init__()
        self.add_font("DejaVu", "", "/Users/veronika/Downloads/font/DejaVuSansCondensed.ttf")
        self.add_font("DejaVu", "B", "/Users/veronika/Downloads/font/DejaVuSansCondensed-Bold.ttf")
        self.set_font("DejaVu", "", 12)
        self.first_page = True  # Флаг для первой страницы
    
    def header(self):
        if self.first_page:
            self.set_font("DejaVu", "B", 16)
            self.cell(0, 10, "Результаты калибровки и выбор моделей", 0, 1, "C")
            self.ln(10)
            self.first_page = False  # Убираем заголовок на следующих страницах
    
    def add_section_title(self, title):
        self.set_font("DejaVu", "B", 14)
        self.cell(0, 10, title, 0, 1)
        self.ln(5)
    
    def add_table(self, data):
        col_widths = [30, 40, 50, 20, 20]  # Подобраны для хорошего отображения
        self.set_font("DejaVu", "B", 12)
        headers = ["Продукт", "Лучшая модель", "Метод калибровки", "AUC", "LogLoss"]
        
        for header, width in zip(headers, col_widths):
            self.cell(width, 10, header, border=1)
        self.ln()
        
        self.set_font("DejaVu", "", 10)
        for row in data:
            height = 10
            if len(row["Лучшая модель"]) > 15 or len(row["Метод калибровки"]) > 15:
                height = 15
            
            self.cell(col_widths[0], height, row["Продукт"], border=1)
            self.cell(col_widths[1], height, row["Лучшая модель"], border=1)
            self.cell(col_widths[2], height, row["Метод калибровки"], border=1)
            self.cell(col_widths[3], height, str(row["AUC"]), border=1)
            self.cell(col_widths[4], height, str(row["LogLoss"]), border=1)
            self.ln()
        
        self.ln(10)
    
    def add_recommendations(self):
        self.set_font("DejaVu", "B", 12)
        self.cell(0, 10, "Рекомендации по выбору моделей:", 0, 1)
        self.ln(5)
        
        self.set_font("DejaVu", "", 10)
        
        recommendations = [
            ("Mortgage", "Оптимальный выбор — LightGBM с методом калибровки Platt Scaling "
                        "\n- Эта модель показывает наивысший AUC (~0.948) и лучшее значение LogLoss (0.325)"),
            ("Pension", "Оптимальный выбор — ExtraTrees с методом Platt Scaling"
                        "\n- Эта модель обеспечивает наилучший результат по AUC (~0.796), а калибровка снижает LogLoss на 20% (0.544 после калибровки)"),
            ("Savings", "Оптимальный выбор — ExtraTrees с методом Platt Scaling"
                        "\n- Все модели показывают схожие результаты, но ExtraTrees дает наименьший LogLoss (0.628)")
        ]
        
        for i, (product, text) in enumerate(recommendations, 1):
            self.set_font("DejaVu", "B", 12)
            self.cell(0, 10, f"{i}. {product}", 0, 1)
            self.set_font("DejaVu", "", 12)
            self.multi_cell(0, 8, f"- {text}")
            self.ln(3)
        
        # Общие рекомендации
        self.ln(5)
        self.set_font("DejaVu", "", 12)
        self.multi_cell(0, 8, "Для всех продуктов использование Platt Scaling оказывает позитивное влияние на LogLoss и сохраняет значение ROC-AUC (следовательно, предсказания становятся более уверенными)")
        self.ln(10)


    def add_image_pair(self, calib_path, metrics_path, product_name):
        self.add_page()  # Добавляем новую страницу для каждого продукта
        self.set_font("DejaVu", "B", 12)
        self.cell(0, 10, f"Продукт: {product_name}", 0, 1)
        self.ln(5)
        
        if os.path.exists(calib_path):
            self.set_font("DejaVu", "", 10)
            self.cell(0, 10, "Калибровочные кривые:", 0, 1)
            self.image(calib_path, x=(self.w - self.w * 0.7) / 2, w=self.w * 0.7)  # Центрируем изображение и устанавливаем ширину 70% страницы
            self.ln(5)
        
        if os.path.exists(metrics_path):
            self.image(metrics_path, x=(self.w - self.w * 0.95) / 2, w=self.w * 0.95)  # Центрируем изображение и устанавливаем ширину 70% страницы
            self.ln(15)

def create_pdf_report():
    results = [
        {
            "Продукт": "Mortgage",
            "Лучшая модель": "LightGBM",
            "Метод калибровки": "Platt Scaling",
            "AUC": 0.948,
            "LogLoss": 0.325,
        },
        {
            "Продукт": "Pension",
            "Лучшая модель": "ExtraTrees",
            "Метод калибровки": "Platt Scaling",
            "AUC": 0.796,
            "LogLoss": 0.589,
        },
        {
            "Продукт": "Savings",
            "Лучшая модель": "ExtraTrees",
            "Метод калибровки": "Platt Scaling",
            "AUC": 0.699,
            "LogLoss": 0.628,
        }
    ]
    
    calib_plots = {
        "Mortgage": "calibration_plots/calibration_mortgage.png",
        "Pension": "calibration_plots/calibration_pension.png",
        "Savings": "calibration_plots/calibration_savings.png"
    }
    
    metrics_plots = {
        "Mortgage": "metrics_comparison_plots/metrics_comparison_mortgage.png",
        "Pension": "metrics_comparison_plots/metrics_comparison_pension.png",
        "Savings": "metrics_comparison_plots/metrics_comparison_savings.png"
    }
    
    pdf = PDF()
    pdf.add_page()
    
    # Добавляем таблицу
    pdf.add_table(results)
    
    # Добавляем рекомендации под таблицей
    pdf.add_recommendations()
    
    # Добавляем изображения
    for product in ["Mortgage", "Pension", "Savings"]:
        calib_path = calib_plots[product]
        metrics_path = metrics_plots[product]
        
        if not os.path.exists(calib_path):
            print(f"Предупреждение: файл {calib_path} не найден")
            calib_path = None
        if not os.path.exists(metrics_path):
            print(f"Предупреждение: файл {metrics_path} не найден")
            metrics_path = None
        
        if calib_path or metrics_path:
            pdf.add_image_pair(calib_path, metrics_path, product)
    
    output_path = "models_selection_report.pdf"
    pdf.output(output_path)
    print(f"Отчет успешно сохранен: {output_path}")

In [28]:
create_pdf_report()

  self.cell(0, 10, "Результаты калибровки и выбор моделей", 0, 1, "C")
  self.cell(0, 10, "Рекомендации по выбору моделей:", 0, 1)
  self.cell(0, 10, f"{i}. {product}", 0, 1)
  self.cell(0, 10, f"Продукт: {product_name}", 0, 1)
  self.cell(0, 10, "Калибровочные кривые:", 0, 1)


Отчет успешно сохранен: models_selection_report.pdf
