In [1]:
from collections.abc import Iterable
from typing import Any

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, KFold, ParameterGrid, train_test_split
from sklearn.pipeline import FunctionTransformer, Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from tqdm import tqdm
import seaborn as sns
import dataframe_image as dfi
from src.features import numeric_features, categorical_features

from src import business, modeling, transforms, vis
from src.const import DATA_PATH, DATA_TEST_PATH, TARGET, DATA_TEST_TARGETS_PATH

# Для более качественных графиков
%config InlineBackend.figure_format='retina'
plt.rcParams["figure.dpi"] = 150
%load_ext autoreload
%autoreload 2

In [52]:
def impute_col_by_important(df, important_features, col, df_test):
    df = df.copy()
    feats = important_features + [col]
    imputer = KNNImputer(n_neighbors=2)
    res = imputer.fit_transform(df[feats])
    df[col] = res[:, -1]
    df_test[col] = imputer.fit_transform(df_test[feats])[:, -1]
    return df, df_test


loan_data = pd.read_csv(DATA_PATH)
loan_data_test = pd.read_csv(DATA_TEST_PATH)

In [53]:
loan_data_imp, loan_data_test_imp = impute_col_by_important(
    loan_data, 
    ["лимит_нарушен", "тип", "другие_кредиты", "сумма"], 
    "стоимость_имущества",
    loan_data_test
)
loan_data_imp, loan_data_test_imp = impute_col_by_important(
    loan_data_imp, 
    ["цель", "бизнес", "сумма"], 
    "срок",
    loan_data_test
)
loan_data_imp, loan_data_test_imp = impute_col_by_important(
    loan_data_imp, 
    ["лимит_нарушен", "тип", ], 
    "сумма",
    loan_data_test
)


In [55]:
to_encode = ["пол", "тип_залога", "возраст", "проживание"]
categorical_features_data = loan_data[categorical_features]
encoder = OrdinalEncoder(unknown_value=10, handle_unknown="use_encoded_value").fit(
    categorical_features_data[to_encode]
)

loan_data_imp_enc = loan_data_imp.copy()
loan_data_imp_enc[to_encode] = encoder.transform(categorical_features_data[to_encode])

loan_data_test_imp_enc = loan_data_test_imp.copy()
categorical_features_data = loan_data_test[to_encode]
loan_data_test_imp_enc[to_encode] = encoder.transform(categorical_features_data[to_encode])

In [56]:
loan_data_imp_enc["прямой_залог"] = loan_data_imp_enc["прямой_залог"].fillna(-1)
loan_data_imp_enc["другие_кредиты"] = loan_data_imp_enc["другие_кредиты"].fillna(-1)

loan_data_test_imp_enc["прямой_залог"] = loan_data_test_imp_enc["прямой_залог"].fillna(-1)
loan_data_test_imp_enc["другие_кредиты"] = loan_data_test_imp_enc["другие_кредиты"].fillna(-1)

In [57]:
s = loan_data_imp_enc.groupby(["сбор", "дефолт"]).size().unstack().fillna(0)
s = s.div(s.sum(axis=1), axis=0)
s.sort_values(by=1, ascending=False)
m = dict(zip(s.index, s[1]))

loan_data_imp_enc["сбор"] = loan_data_imp_enc["сбор"].map(m)
loan_data_test_imp_enc["сбор"] = loan_data_test_imp_enc["сбор"].map(m)

In [58]:
def combine_genders(s1, s2, s3, s4):
    combined = pd.Series(index=s1.index, dtype=int)  
    combined[s1] = 0
    combined[s2] = combined.where(combined == 0, 1)  
    combined[s3] = combined.where(combined.isin([0, 1]), 2)  
    combined[s4] = combined.where(combined.isin([0, 1, 2]), 3) 
    
    return combined

couples = loan_data_imp_enc["речь"].str.contains(r"(we|couple)")
male = loan_data_imp_enc["речь"].str.contains(r"(man|male|guy)")
female = loan_data_imp_enc["речь"].str.contains(r"(woman)")
non_binary = loan_data_imp_enc["речь"].str.contains(r"(non-binary)")
new_gender = combine_genders(female, male, couples, non_binary)
loan_data_imp_enc["пол"] = loan_data_imp_enc["пол"].fillna(new_gender)

couples = loan_data_test_imp_enc["речь"].str.contains(r"(we|couple)")
male = loan_data_test_imp_enc["речь"].str.contains(r"(man|male|guy)")
female = loan_data_test_imp_enc["речь"].str.contains(r"(woman)")
non_binary = loan_data_test_imp_enc["речь"].str.contains(r"(non-binary)")
new_gender = combine_genders(female, male, couples, non_binary)
loan_data_test_imp_enc["пол"] = loan_data_test_imp_enc["пол"].fillna(new_gender)

  couples = loan_data_imp_enc["речь"].str.contains(r"(we|couple)")
  male = loan_data_imp_enc["речь"].str.contains(r"(man|male|guy)")
  female = loan_data_imp_enc["речь"].str.contains(r"(woman)")
  non_binary = loan_data_imp_enc["речь"].str.contains(r"(non-binary)")
  couples = loan_data_test_imp_enc["речь"].str.contains(r"(we|couple)")
  male = loan_data_test_imp_enc["речь"].str.contains(r"(man|male|guy)")
  female = loan_data_test_imp_enc["речь"].str.contains(r"(woman)")
  non_binary = loan_data_test_imp_enc["речь"].str.contains(r"(non-binary)")


In [60]:
cols_to_use

Index(['лимит_нарушен', 'пол', 'тип', 'цель', 'кредитоспособность',
       'другие_кредиты', 'бизнес', 'сумма', 'сбор', 'срок', 'амортизация',
       'только_процент', 'один_платеж', 'стоимость_имущества', 'проживание',
       'тип_залога', 'тип_кредита', 'кредитный_рейтинг', 'возраст',
       'прямой_залог', 'дефолт'],
      dtype='object')

In [61]:
imputer2 = KNNImputer(n_neighbors=2)
cols_to_use = loan_data_imp_enc.drop(columns=["речь"]).columns
res = imputer2.fit_transform(loan_data_imp_enc[cols_to_use])

loan_data_imp_enc_imp2 = loan_data_imp_enc.copy()
loan_data_imp_enc_imp2[cols_to_use] = res
loan_data_imp_enc_imp2_no_speech = loan_data_imp_enc_imp2.drop(columns=["речь"])

loan_data_test_imp_enc_imp2 = loan_data_test_imp_enc.copy()

cols_to_use = ['лимит_нарушен', 'пол', 'тип', 'цель', 'кредитоспособность',
       'другие_кредиты', 'бизнес', 'сумма', 'сбор', 'срок', 'амортизация',
       'только_процент', 'один_платеж', 'стоимость_имущества', 'проживание',
       'тип_залога', 'тип_кредита', 'кредитный_рейтинг', 'возраст',
       'прямой_залог']
res = imputer2.fit_transform(loan_data_test_imp_enc_imp2[cols_to_use])
loan_data_test_imp_enc_imp2[cols_to_use] = res
loan_data_test_imp_enc_imp2_no_speech = loan_data_test_imp_enc_imp2.drop(columns=["речь"])

In [74]:
loan_data_test_imp_enc_imp2_before_scaling = loan_data_test_imp_enc_imp2.copy().drop(columns=["речь"])
loan_data_test_imp_enc_imp2_before_scaling

Unnamed: 0,ID,лимит_нарушен,пол,тип,цель,кредитоспособность,другие_кредиты,бизнес,сумма,сбор,...,амортизация,только_процент,один_платеж,стоимость_имущества,проживание,тип_залога,тип_кредита,кредитный_рейтинг,возраст,прямой_залог
0,d61968aa-7fb8-46b2-8573-3d101d3ebe5b,1.0,1.0,3.0,1.0,1.0,0.0,0.0,154765.264968,0.000000,...,0.0,0.0,0.0,2.244881e+05,1.0,0.0,2.0,783.508335,1.0,1.0
1,760116cf-25b4-47aa-9a20-e3baa5fc6d09,0.0,2.0,3.0,0.5,1.0,0.0,1.0,434765.264968,0.000000,...,0.0,0.0,0.0,1.099488e+06,2.0,0.0,3.0,568.508335,0.0,1.0
2,a0a11839-e745-4704-98df-bc3ca0d4de3d,0.0,1.0,1.0,0.0,1.0,0.0,0.0,474765.264968,0.000000,...,0.0,0.0,0.0,4.844881e+05,2.0,0.0,1.0,843.508335,3.0,-1.0
3,dce70289-411a-415b-a88b-4cfb708175fd,0.0,1.0,1.0,3.0,1.0,0.0,0.0,304765.264968,0.000000,...,0.0,0.0,0.0,4.744881e+05,2.0,0.0,2.0,905.508335,0.0,1.0
4,4ad07831-8a0a-4ef0-9bdd-fef5496e186a,0.0,1.0,3.0,1.0,1.0,0.0,0.0,294765.264968,0.000000,...,0.0,0.0,0.0,4.444881e+05,2.0,0.0,1.0,522.508335,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,dbafd3b1-50ff-4ea7-83f5-dc20d065699a,1.0,1.0,3.0,0.0,1.0,0.0,0.0,294765.264968,0.645833,...,0.0,0.0,0.0,4.944881e+05,2.0,0.0,2.0,546.508335,4.0,1.0
1996,70d18e9e-e435-4335-ad4b-2c1392f0ae83,0.0,0.0,3.0,0.0,1.0,0.0,0.0,344765.264968,0.000000,...,0.0,0.0,0.0,6.944881e+05,2.0,0.0,1.0,803.508335,3.0,1.0
1997,60d8039e-92aa-41d5-a1e7-2ff58cf50927,0.0,0.0,3.0,0.0,1.0,0.0,0.0,524765.264968,0.000000,...,0.0,0.0,0.0,6.744881e+05,2.0,0.0,1.0,543.508335,1.0,-1.0
1998,4135b443-2576-4869-9765-f41889939c6e,0.0,2.0,3.0,0.0,1.0,0.0,0.0,624765.264968,0.000000,...,0.0,0.0,0.0,8.644881e+05,2.0,0.0,2.0,772.508335,2.0,1.0


In [62]:
t = loan_data_imp_enc_imp2[TARGET]
df_train, df_test = train_test_split(
    loan_data_imp_enc_imp2_no_speech, 
    test_size=0.3, 
    random_state=42,
    stratify=t
)

In [63]:
df_train_before_scaling = df_train.copy()
df_test_before_scaling = df_test.copy()

In [64]:
to_scandard = ["сумма", "срок", "стоимость_имущества", "кредитный_рейтинг"]

scaler = StandardScaler()
df_train[to_scandard] = scaler.fit_transform(df_train[to_scandard])
df_test[to_scandard] = scaler.transform(df_test[to_scandard])


loan_data_test_imp_enc_imp2_no_speech[to_scandard] = scaler.transform(loan_data_test_imp_enc_imp2_no_speech[to_scandard])


In [66]:
def business_exp_profit(df, default_prob):
    return (
        df["сумма"]*0.1/365*df["срок"]*(1-default_prob) +
        (df["стоимость_имущества"]*df["прямой_залог"] - df["сумма"]) * default_prob
    ).sum()

def business_el(df, default_prob):
    return ((df["стоимость_имущества"]*df["прямой_залог"] - df["сумма"]) * default_prob).sum()

In [67]:
X_train, y_train = df_train.drop(columns=[TARGET]), df_train[TARGET]
X_val, y_val = df_test.drop(columns=[TARGET]), df_test[TARGET]

In [68]:
from sklearn.neighbors import KNeighborsClassifier

classifier_knn = KNeighborsClassifier(n_neighbors=2)
classifier_knn.fit(X_train, y_train)
y_proba_val = classifier_knn.predict_proba(X_val)
print(
    business_el(df_test_before_scaling, y_proba_val[:, 1]),
    business_exp_profit(df_test_before_scaling, y_proba_val[:, 1]),
    sep="\n"
)

-20201682.441113845
43543750.59935545


In [81]:
X_test = loan_data_test_imp_enc_imp2_no_speech.drop(columns=["ID"])
y_proba_test = classifier_knn.predict_proba(X_test)
y_pred_test = classifier_knn.predict(X_test)

In [77]:
y_true = pd.read_csv(DATA_TEST_TARGETS_PATH)
y_true

Unnamed: 0,ID,дефолт,Usage
0,d61968aa-7fb8-46b2-8573-3d101d3ebe5b,0,Public
1,760116cf-25b4-47aa-9a20-e3baa5fc6d09,0,Public
2,a0a11839-e745-4704-98df-bc3ca0d4de3d,0,Public
3,dce70289-411a-415b-a88b-4cfb708175fd,0,Public
4,4ad07831-8a0a-4ef0-9bdd-fef5496e186a,0,Public
...,...,...,...
1995,dbafd3b1-50ff-4ea7-83f5-dc20d065699a,0,Private
1996,70d18e9e-e435-4335-ad4b-2c1392f0ae83,0,Private
1997,60d8039e-92aa-41d5-a1e7-2ff58cf50927,0,Private
1998,4135b443-2576-4869-9765-f41889939c6e,0,Private


In [82]:
y_pred_test

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
from src.business import profit


profit(
    y_true=y_true["дефолт"],
    y_pred=y_pred_test,
    feats=X_test
)

154491828.73157933

In [87]:
from src.business import profit

loan_data_test_reloaded = pd.read_csv(DATA_TEST_PATH)
profit(
    y_true=y_true["дефолт"],
    y_pred=y_pred_test,
    feats=loan_data_test_reloaded
)

25550101.478335764

In [88]:
f1_score(y_true["дефолт"], y_pred_test)

0.2517482517482518

In [90]:
from sklearn.metrics import accuracy_score


accuracy_score(y_true["дефолт"], y_pred_test)

0.786