# Импорты

In [1]:
from collections.abc import Iterable
from typing import Any

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, KFold, ParameterGrid, train_test_split
from sklearn.pipeline import FunctionTransformer, Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from tqdm import tqdm

from src.mba import business, modeling, transforms, vis
from src.mba.const import DATA_PATH, TARGET

# Render inline figures at retina resolution and raise the default DPI for sharper plots.
%config InlineBackend.figure_format='retina'
plt.rcParams["figure.dpi"] = 150
# Enable IPython's autoreload extension in mode 2: modules are reloaded before each
# cell executes, so edits to imported project code are picked up without a restart.
%load_ext autoreload
%autoreload 2

# Датасет

### Загрузить в pandas

In [None]:
# Read the CSV located at DATA_PATH into a DataFrame.
loan_data = pd.read_csv(DATA_PATH)
# Bare expression: the notebook renders the frame as the cell output.
loan_data

### Наличие аномалий

In [None]:
# Visualize anomalies in the "сумма" column of the raw data using the project's vis helper.
vis.plot_anomalies(loan_data, ["сумма"])

### Заменить аномальные значения

In [None]:
# Add to the model pipeline

# Wrap transforms.clip_anomalies as a transformer; kw_args forwards cols=["сумма"]
# to the function on every call.
clipper = FunctionTransformer(transforms.clip_anomalies, kw_args={"cols": ["сумма"]})
# Single-step pipeline holding only the clipping transformer.
pipe = make_pipeline(clipper)

In [None]:
# Visualization: plot the same column after transforms.clip_anomalies has been applied.
cols = ["сумма"]
vis.plot_anomalies(transforms.clip_anomalies(loan_data, cols), cols)

### Убрать аномальные значения

In [None]:
# Cannot be used inside a pipeline
# NOTE(review): presumably because remove_anomalies drops rows, which a pipeline
# step cannot do without desynchronizing X and y — confirm against transforms.

# Visualization: plot the column after transforms.remove_anomalies has been applied.
cols = ["сумма"]
vis.plot_anomalies(transforms.remove_anomalies(loan_data, cols), cols)

### Построить гистограммы

In [None]:
# Draw a histogram for the "сумма" column using the project's vis helper.
vis.plot_hist(loan_data, ["сумма"])

### Построить корреляционную матрицу

In [None]:
# Plot the correlation matrix restricted to the "сумма" and "дефолт" columns.
vis.plot_corr(loan_data, ["сумма", "дефолт"])

### Нормализовать числовые значения

In [None]:
# Standardize the "сумма" column; double brackets keep the input two-dimensional.
scaler = StandardScaler()
# Fit and transform in one step; the resulting array is shown as the cell output.
scaler.fit_transform(loan_data[["сумма"]])

### Пропуски

In [None]:
# Visualize missing values across the dataset using the project's vis helper.
vis.plot_nan(loan_data)

### Заполнить пропуски

In [None]:
# Replace missing values in "сумма" with the column mean.
imputer = SimpleImputer(strategy="mean")
# Fit and transform in one step; the resulting array is shown as the cell output.
imputer.fit_transform(loan_data[["сумма"]])

### Уникальные значения в категориальных столбцах

In [None]:
# Plot the value distribution of the categorical column "кредитоспособность".
vis.plot_categorical_distributions(loan_data, ["кредитоспособность"])

In [None]:
# Print value counts for the categorical column "кредитоспособность".
vis.print_value_counts(loan_data, ["кредитоспособность"])

### Заполнить пропуски в категориальных столбцах

In [None]:
# Fill gaps in the listed categorical columns with each column's most frequent value.
cat_cols = ["кредитоспособность"]
cat_imputer = ColumnTransformer(
    [("imputer", SimpleImputer(strategy="most_frequent"), cat_cols)],
    # Columns not listed in cat_cols are passed through unchanged.
    remainder="passthrough",
)

### Закодировать категориальные признаки при помощи one hot encoding

In [None]:
# One-hot encode the "цель" column.
cat_cols = ["цель"]
cat_encoder = ColumnTransformer(
    [("encoder", OneHotEncoder(), cat_cols)],
    # Columns not listed in cat_cols are passed through unchanged.
    remainder="passthrough",
)

### Закодировать категориальные признаки при помощи label encoding

In [14]:
# Encode the "цель" column as integer codes with OrdinalEncoder.
# NOTE: this rebinds the notebook-global names cat_cols and cat_encoder.
cat_cols = ["цель"]
cat_encoder = ColumnTransformer(
    [("encoder", OrdinalEncoder(), cat_cols)],
    # Columns not listed in cat_cols are passed through unchanged.
    remainder="passthrough",
)

### Связь категориальных переменных и таргета

In [None]:
# Visualize how the categorical column "кредитоспособность" relates to the target.
vis.visualize_cat_and_target(loan_data, ["кредитоспособность"])

# Модель

### Обучить модель без разделения

In [None]:
def train_log_reg(data: pd.DataFrame) -> None:
    """Fit a logistic-regression baseline on all of ``data`` and print its metrics.

    No hold-out split is made: the pipeline is fitted and evaluated on the
    same rows, so the printed metrics describe the training fit only.

    Args:
        data: Frame that ``modeling.get_xy`` splits into features and target.
    """
    features, target = modeling.get_xy(data)
    # Every column whose dtype is not ``object`` is kept as a model input.
    non_object_cols = features.select_dtypes(exclude=object).columns

    steps = [
        ("feat_select", transforms.FeatSelector(non_object_cols)),
        # Missing values are replaced with the constant 0.
        ("impute", SimpleImputer(strategy="constant", fill_value=0)),
        ("model", LogisticRegression(random_state=42)),
    ]
    # ``Pipeline.fit`` returns the fitted pipeline itself.
    fitted = Pipeline(steps).fit(features, target)

    modeling.print_metrics(y_true=target, y_pred=fitted.predict(features))


# Run the no-split baseline on the full dataset.
train_log_reg(loan_data)

### Сделать трансформ на столбцах

In [17]:
# Apply StandardScaler to the "сумма" column only.
# NOTE: this rebinds the notebook-global name scaler.
scaler = ColumnTransformer(
    [
        ("scaler", StandardScaler(), ["сумма"]),
    ],
    # All other columns are passed through unchanged.
    remainder="passthrough",
)

### Разделить выборку на обучающее и тестовое множество

In [None]:
# modeling.get_train_test can be used (and modified) instead, for convenience
# Hold out 33% of the rows for testing; random_state fixes the shuffle for reproducibility.
df_train, df_test = train_test_split(loan_data, test_size=0.33, random_state=42)

### Сделать отчет о классификации

In [None]:
# Modify modeling.print_metrics

### Разделить выборку на обучающее и тестовое множество со стратификацией

In [19]:
# modeling.get_train_test can be used (and modified) instead, for convenience
# Same 33% hold-out, but stratified on the TARGET column so both parts keep
# the class proportions of the full dataset.
df_train, df_test = train_test_split(loan_data, test_size=0.33, random_state=42, stratify=loan_data[TARGET])

### Обучить модель с разделением

In [None]:
def train_model_with_test(data: pd.DataFrame) -> None:
    """Fit logistic regression on a train split and report train and test metrics.

    The split comes from ``modeling.get_train_test``; the pipeline is fitted on
    the train part only and then scored on both parts.

    Args:
        data: Frame that ``modeling.get_train_test`` splits into two parts.
    """
    train_part, test_part = modeling.get_train_test(data)

    # Insertion order (Train, then Test) drives the reporting order below.
    splits = {
        "Train": modeling.get_xy(train_part),
        "Test": modeling.get_xy(test_part),
    }
    X_fit, y_fit = splits["Train"]

    # Only non-object columns of the train features are fed to the model.
    non_object_cols = X_fit.select_dtypes(exclude=object).columns

    model = make_pipeline(
        transforms.FeatSelector(non_object_cols),
        SimpleImputer(strategy="constant", fill_value=0),
        LogisticRegression(random_state=42),
    )
    model.fit(X_fit, y_fit)

    for label, (X_part, y_part) in splits.items():
        print(label)
        modeling.print_metrics(y_pred=model.predict(X_part), y_true=y_part)
        print()


# Run the train/test experiment on the full dataset.
train_model_with_test(loan_data)

### Обучить модель с настройкой гиперпараметров

In [None]:
def grid_search(data: pd.DataFrame) -> None:
    """Exhaustively search random-forest hyperparameters and print the best F1.

    Each grid point is fitted on the train split and scored with ``f1_score``
    on the test split; the first grid point with the highest score wins.

    NOTE(review): the same test split is used both to pick the hyperparameters
    and to report the score, so the printed metric is optimistic.

    Args:
        data: Frame that ``modeling.get_train_test`` splits into two parts.
    """

    def get_model(cols: Iterable[str], **kwargs: Any) -> Pipeline:
        """Build a column selector followed by a seeded random forest."""
        forest = RandomForestClassifier(random_state=42, **kwargs)
        return make_pipeline(transforms.FeatSelector(cols), forest)

    train_part, test_part = modeling.get_train_test(data)
    X_train, y_train = modeling.get_xy(train_part)
    X_test, y_test = modeling.get_xy(test_part)
    non_object_cols = X_train.select_dtypes(exclude=object).columns

    def evaluate(candidate: dict[str, Any]) -> float:
        """Fit one grid point on the train split and return its F1 on the test split."""
        fitted = get_model(non_object_cols, **candidate)
        fitted.fit(X_train, y_train)
        return f1_score(y_true=y_test, y_pred=fitted.predict(X_test))

    # Only ``bootstrap`` is searched; the commented entries are optional extensions.
    params = {
        "bootstrap": [True, False],
        # "max_depth": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        # "max_features": ["auto", "sqrt"],
        # "min_samples_leaf": [1, 2, 4],
        # "min_samples_split": [2, 5, 10],
        # "n_estimators": [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
    }

    best_kwargs: dict[str, Any] | None = None
    best_metric = -np.inf

    for candidate in tqdm(ParameterGrid(params)):
        score = evaluate(candidate)
        # Strict comparison: on ties the earlier grid point is kept.
        if score > best_metric:
            best_metric, best_kwargs = score, candidate

    print(f"Best metric: {best_metric:.3f}")
    print(f"Best kwargs: {best_kwargs}")


# Run the manual grid search on the full dataset.
grid_search(loan_data)

### Обучить модель с настройкой гиперпараметров на кросс-валидации

In [None]:
def grid_search_cv(data: pd.DataFrame) -> None:
    """Tune the random forest with ``GridSearchCV`` and print the best result.

    ``GridSearchCV`` is left at its defaults: no explicit ``scoring`` or ``cv``
    argument is passed, so the estimator's own ``score`` method and the
    library's default fold scheme are used.

    Args:
        data: Frame that ``modeling.get_xy`` splits into features and target.
    """
    features, target = modeling.get_xy(data)
    non_object_cols = features.select_dtypes(exclude=object).columns

    estimator = make_pipeline(
        transforms.FeatSelector(non_object_cols),
        RandomForestClassifier(random_state=42),
    )

    # Keys use the ``<step>__<param>`` form; ``make_pipeline`` names the step
    # after the lower-cased class name.
    param_grid = {
        "randomforestclassifier__bootstrap": [True, False],
        # "randomforestclassifier__max_depth": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        # "randomforestclassifier__max_features": ["auto", "sqrt"],
        # "randomforestclassifier__min_samples_leaf": [1, 2, 4],
        # "randomforestclassifier__min_samples_split": [2, 5, 10],
        # "randomforestclassifier__n_estimators": [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
    }

    search = GridSearchCV(estimator, param_grid=param_grid)
    search.fit(features, target)

    print("Best params:", search.best_params_)
    print("Best score:", search.best_score_)


# Run the GridSearchCV-based tuning on the full dataset.
grid_search_cv(loan_data)

### Обучить модель с настройкой гиперпараметров на кросс-валидации (для кастомной метрики)

In [None]:
def grid_search_cv_custom(data: pd.DataFrame, n_splits: int = 3) -> None:
    """Grid-search random-forest hyperparameters with K-fold CV on a custom metric.

    For every grid point the model is refitted on each fold's train part and
    scored with ``business.profit`` on the fold's test part; the grid point
    with the highest mean score across folds is printed.

    Args:
        data: Frame that ``modeling.get_xy`` splits into features and target.
        n_splits: Number of folds passed to ``KFold``.
    """

    def get_model(cols: Iterable[str], **kwargs: Any) -> Pipeline:
        """Build a column selector followed by a seeded random forest."""
        return make_pipeline(transforms.FeatSelector(cols), RandomForestClassifier(random_state=42, **kwargs))

    X, y = modeling.get_xy(data)
    float_cols = X.select_dtypes(exclude=object).columns
    params = {
        "bootstrap": [True, False],
        # "max_depth": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        # "max_features": ["auto", "sqrt"],
        # "min_samples_leaf": [1, 2, 4],
        # "min_samples_split": [2, 5, 10],
        # "n_estimators": [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
    }

    best_metric = -np.inf
    best_kwargs: dict[str, Any] | None = None

    # Materialize the folds once so every grid point is scored on identical splits.
    folds = list(KFold(n_splits=n_splits).split(X))

    for kwargs in tqdm(ParameterGrid(params)):
        fold_metrics: list[float] = []

        for train_i, test_i in folds:
            # KFold.split yields positional indices, so rows must be taken with
            # .iloc; label-based .loc fails or selects the wrong rows whenever
            # the frame's index is not the default 0..n-1 range.
            X_train, y_train = X.iloc[train_i], y.iloc[train_i]
            X_test, y_test = X.iloc[test_i], y.iloc[test_i]

            cur_pipe = get_model(float_cols, **kwargs)
            cur_pipe.fit(X_train, y_train)
            pred = cur_pipe.predict(X_test)
            # The metric also receives the fold's feature rows via ``feats``.
            cur_metric = business.profit(y_true=y_test, y_pred=pred, feats=X_test)
            fold_metrics.append(cur_metric)

        metric = float(np.mean(fold_metrics))
        # Strict comparison: on ties the earlier grid point is kept.
        if metric > best_metric:
            best_metric = metric
            best_kwargs = kwargs

    print(f"Best metric: {best_metric:.3f}")
    print(f"Best kwargs: {best_kwargs}")


# Run the custom-metric cross-validated search with the default number of folds.
grid_search_cv_custom(loan_data)

### Разработка текстовых фич

In [None]:
# transforms.TextFeaturesExtractor


def train_text(data: pd.DataFrame) -> None:
    """Train logistic regression on non-object columns plus text-derived features.

    The "речь" column is selected alongside the non-object columns and handed
    to ``transforms.TextFeaturesExtractor``; the result is mean-imputed,
    standardized and fed to the classifier. Metrics are printed for both the
    train and the test split.

    Args:
        data: Frame that ``modeling.get_train_test`` splits into two parts.
    """
    train_part, test_part = modeling.get_train_test(data)

    X_train, y_train = modeling.get_xy(train_part)
    X_test, y_test = modeling.get_xy(test_part)

    # Non-object columns of the train features, followed by the raw text column.
    selected = [*X_train.select_dtypes(exclude=object).columns, "речь"]

    model = make_pipeline(
        transforms.FeatSelector(selected),
        transforms.TextFeaturesExtractor(text_feat="речь"),
        SimpleImputer(strategy="mean"),
        StandardScaler(),
        LogisticRegression(random_state=42),
    )
    model.fit(X_train, y_train)

    reports = (("Train", X_train, y_train), ("Test", X_test, y_test))
    for label, X_part, y_part in reports:
        print(label)
        modeling.print_metrics(y_pred=model.predict(X_part), y_true=y_part)
        print()


# Run the text-feature experiment on the full dataset.
train_text(loan_data)

# Бизнес

### Спроектировать бизнес-метрику для модели

In [None]:
# Our implementation: business.profit