In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from collections.abc import Iterable
from typing import Any

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, KFold, ParameterGrid, train_test_split
from sklearn.pipeline import FunctionTransformer, Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from tqdm import tqdm
from sklearn.impute import SimpleImputer

from src import business, modeling, transforms, vis
from src.const import DATA_PATH, TARGET

# Для более качественных графиков
%config InlineBackend.figure_format='retina'
plt.rcParams["figure.dpi"] = 150
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
age_mapping = {
    "45-54": 3,     
    "35-44": 2,    
    "55-64": 4,     
    "65-74": 5,  
    "25-34": 1,    
    ">74": 6,       
    "<25": 0, 
}

gender_mapping = {
    "м": 1,     
    "ж": 2,     
}


In [None]:
import os
import openai
from getpass import getpass


def _getpass(env_var: str):
    if not os.environ.get(env_var):
        os.environ[env_var] = getpass(f"{env_var}=")

_getpass("OPENAI_API_KEY") 

In [None]:
import json
def send_message(user_input):
    prompt = """
    You're a loan officer's assistant. I'll give you a customer's message, evaluate how reliable a customer he is. Evaluate its reliability on a scale from 1 to 10, and also describe the reasons.
    Much score is better (A more reliable borrower)
    In response, give json in this form {"reason": #text reasons#, "score": #Number of score#}
    
    User message:\n
    """
    messages = []
    messages.append({"role": "user", "content": prompt+user_input})

    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages
    )

    assistant_reply = completion.choices[0].message.content
    messages.append({"role": "assistant", "content": assistant_reply})
    try:
        return int(json.loads(assistant_reply)["score"])
    except Exception as e:
        print(e)
        return -1

In [6]:
loan_data = pd.read_csv(DATA_PATH)
# loan_data["llm_score"] = loan_data["речь"].map(send_message)

df_train, df_test = train_test_split(
    loan_data,
    test_size=0.2,       
    random_state=42,     
)
df_train_original = df_train.copy()
df_test_original = df_test.copy()

df_train["возраст"] = df_train["возраст"].map(age_mapping).fillna(7) 
df_test["возраст"] = df_test["возраст"].map(age_mapping).fillna(7) 

df_train["пол"] = df_train["пол"].map(gender_mapping).fillna(0) 
df_test["пол"] = df_test["пол"].map(gender_mapping).fillna(0) 

category_counts = df_train["сбор"].value_counts()
rare_categories = category_counts[category_counts < 5].index

df_train["сбор"] = df_train["сбор"].replace(rare_categories, 'RARE')
df_test["сбор"] = df_test["сбор"].replace(rare_categories, 'RARE')

sbor_mean_encoding = df_train.groupby('сбор')['дефолт'].mean()
df_train['сбор_mean_encoded'] = df_train.pop('сбор').map(sbor_mean_encoding)
df_test['сбор_mean_encoded'] = df_test.pop('сбор').map(sbor_mean_encoding)

liv_mean_encoding = df_train.groupby('проживание')['дефолт'].mean()
df_train['проживание_mean_encoded'] = df_train.pop('проживание').map(liv_mean_encoding)
df_test['проживание_mean_encoded'] = df_test.pop('проживание').map(liv_mean_encoding)

df_train.drop(columns=["тип_залога"], inplace=True)
df_test.drop(columns=["тип_залога"], inplace=True)

df_train["символов_в_речи"] = df_train.pop("речь").str.len()
df_test["символов_в_речи"] = df_test.pop("речь").str.len()

imputer = SimpleImputer(strategy='median')
imputer = imputer.fit(df_train)

df_train = pd.DataFrame(imputer.transform(df_train), columns=df_train.columns)
df_test = pd.DataFrame(imputer.transform(df_test), columns=df_test.columns)


In [10]:
def our_profit(loan_amount, y_pred, y_true):
    return np.where(
        (y_true == 1) & (y_pred == 0),
        -loan_amount * 0.5,
        np.where(
            (y_true == 0) & (y_pred == 0),
            loan_amount * 0.1,
            0
        )
    ).sum()

In [None]:
X_train = df_train.drop(columns=[TARGET])
y_train = df_train[TARGET].astype(int)

# X_test = df_test.drop(columns=[TARGET])
# y_test = df_test[TARGET].astype(int)

In [None]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

y_pred_train = clf.predict_proba(X_train)
y_pred_test = clf.predict_proba(X_test)

pts = []

for t in np.linspace(0, 1, 100):
    y_pred_train_t = (y_pred_train[:, 1] > t).astype(int)
    y_pred_test_t = (y_pred_test[:, 1] > t).astype(int)

    profit_train_t = our_profit(df_train["сумма"], y_pred_train_t, y_train)
    profit_test_t = our_profit(df_test["сумма"], y_pred_test_t, y_test)

    pts.append((t, profit_train_t, profit_test_t))
stats_t = pd.DataFrame(pts, columns=["t", "profit@train", "profit@test"])

# SUB

In [None]:
from pandas.core.frame import DataFrame


loan_data: DataFrame = pd.read_csv(DATA_PATH)
# loan_data["llm_score"] = loan_data["речь"].map(send_message)

df_train = loan_data

df_train["возраст"] = df_train["возраст"].map(age_mapping).fillna(7) 

df_train["пол"] = df_train["пол"].map(gender_mapping).fillna(0) 

category_counts = df_train["сбор"].value_counts()
rare_categories = category_counts[category_counts < 5].index

df_train["сбор"] = df_train["сбор"].replace(rare_categories, 'RARE')

sbor_mean_encoding = df_train.groupby('сбор')['дефолт'].mean()
df_train['сбор_mean_encoded'] = df_train.pop('сбор').map(sbor_mean_encoding)

liv_mean_encoding = df_train.groupby('проживание')['дефолт'].mean()
df_train['проживание_mean_encoded'] = df_train.pop('проживание').map(liv_mean_encoding)

df_train.drop(columns=["тип_залога"], inplace=True)

df_train["символов_в_речи"] = df_train.pop("речь").str.len()

imputer = SimpleImputer(strategy='median')
imputer = imputer.fit(df_train)

df_train = pd.DataFrame(imputer.transform(df_train), columns=df_train.columns)

In [5]:
X_train = df_train.drop(columns=[TARGET])
y_train = df_train[TARGET].astype(int)

In [7]:
def our_profit(loan_amount, y_pred, y_true):
    return np.where(
        (y_true == 1) & (y_pred == 0),
        -loan_amount * 0.5,
        np.where(
            (y_true == 0) & (y_pred == 0),
            loan_amount * 0.1,
            0
        )
    ).mean()

In [None]:
clf = RandomForestClassifier()
grid_search = GridSearchCV(
    estimator=clf,
    param_grid={"n_estimators": np.arange(3, 101), "max_depth": np.arange(2, 11)},
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=3
)
grid_search.fit(X_train, y_train)

clf = RandomForestClassifier(**grid_search.best_params_)
clf.fit(X_train, y_train)

y_pred_train = clf.predict(X_train)

Fitting 3 folds for each of 882 candidates, totalling 2646 fits


NameError: name 'our_profit' is not defined

In [12]:
grid_search.best_params_

{'max_depth': 9, 'n_estimators': 37}

In [8]:
print("Бизнес-метрика на трейне", our_profit(df_train["сумма"], y_pred_train, y_train))
print("Точность на трейне:", clf.score(X_train, y_train))

Бизнес-метрика на трейне 16487.526173207447
Точность на трейне: 0.9295


In [13]:
test_loan_data = pd.read_csv("data/test_data.csv")
# test_loan_data["llm_score"] = test_loan_data["речь"].map(send_message)

test_loan_data_original = test_loan_data.copy()
test_loan_data["возраст"] = test_loan_data["возраст"].map(age_mapping).fillna(7) 
test_loan_data["пол"] = test_loan_data["пол"].map(gender_mapping).fillna(0) 
test_loan_data["сбор"] = test_loan_data["сбор"].replace(rare_categories, 'RARE')
test_loan_data['сбор_mean_encoded'] = test_loan_data.pop('сбор').map(sbor_mean_encoding)
test_loan_data['проживание_mean_encoded'] = test_loan_data.pop('проживание').map(liv_mean_encoding)
test_loan_data.drop(columns=["тип_залога"], inplace=True)
test_loan_data["символов_в_речи"] = test_loan_data.pop("речь").str.len()
test_loan_data = pd.DataFrame(imputer.transform(test_loan_data), columns=test_loan_data.columns)


In [14]:
X_test = test_loan_data.drop(columns=[TARGET])
y_test = test_loan_data[TARGET].astype(int)

In [23]:
# y_test_pred = (clf.predict_proba(X_test)[:, 1] > 0.3).astype(int)
y_test_pred = (clf.predict(X_test)).astype(int)

In [24]:
from src.business import profit

profit(y_test, y_test_pred, pd.read_csv("data/test_data.csv"))

55151145.13203423

In [20]:
(test_loan_data_original == pd.read_csv("data/test_data.csv")).all()

лимит_нарушен          False
пол                    False
тип                    False
цель                   False
кредитоспособность     False
другие_кредиты         False
бизнес                 False
сумма                  False
сбор                    True
срок                   False
амортизация            False
только_процент         False
один_платеж            False
стоимость_имущества    False
проживание              True
тип_залога              True
тип_кредита            False
кредитный_рейтинг      False
возраст                False
прямой_залог           False
речь                    True
дефолт                  True
dtype: bool

In [19]:
(y_test == y_test_pred).mean()

0.8715