In [21]:
import json
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import yaml
from sklearn.impute import SimpleImputer

In [4]:
# Location of the YAML parameters file, given relative to the directory the
# notebook is run from.
config_path = "../config/params.yml"

In [5]:
# Read the project parameters. The file is opened in a `with` block so the
# handle is closed as soon as parsing finishes.
# NOTE(review): FullLoader accepts more than plain YAML scalars/lists/dicts;
# if params.yml only holds plain values, yaml.safe_load is the safer choice.
with open(config_path) as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

In [7]:
# Preprocessing section of the config: it supplies the path of the dataset to
# load and the keyword arguments passed on to pipeline_preprocess.
preproc = config["preprocessing"]

# Import

In [35]:
# Load the dataset used for the prediction demo from the parquet file named by
# the "check_data_path" entry of the preprocessing config.
data_test = pd.read_parquet(preproc["check_data_path"])
# Bare last expression: render the loaded frame as the cell output.
data_test

Unnamed: 0_level_0,WEEK_NUM,days30_165L,maritalst_385M,pmtscount_423L,requesttype_4525192L,annuity_780A,annuitynextmonth_57A,avginstallast24m_3658937A,avgoutstandbalancel6m_4187114A,cntincpaycont9m_3716944L,...,purposeofcred_active,residualamount_closed,subjectrole_active,subjectrole_closed,empladdr,num_total_inst,num_total_paid_inst,cred_closure_date_days_ago,actualdpd,sellerplace_cnt
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1020556.0,89,1.0,0.0,,,2079.8000,0.0,,,,...,a55475b1,,a55475b1,a55475b1,a55475b1,0.0,,-1891.0,,0
1323846.0,12,0.0,2.0,17.0,,2417.6000,1115.4,1115.4000,17096.5530,3.0,...,a55475b1,,a55475b1,a55475b1,a55475b1,26.0,8.0,,0.0,0
791835.0,35,0.0,2.0,0.0,DEDUCTION_6,3195.4001,0.0,,,,...,a55475b1,,a55475b1,a55475b1,P8_43_166,0.0,,,,0
1942234.0,91,0.0,2.0,,,5585.2000,12626.2,9651.0000,92430.6500,26.0,...,a55475b1,,a55475b1,a55475b1,a55475b1,74.0,74.0,-3.0,0.0,4
1719341.0,51,0.0,0.0,,DEDUCTION_6,2019.0000,0.0,2987.6000,,0.0,...,a55475b1,0.0,a55475b1,a55475b1,a55475b1,27.0,30.0,-61.0,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1860994.0,75,1.0,2.0,,,8695.4000,0.0,13655.4000,-4085.4001,3.0,...,60c73645,0.0,a55475b1,a55475b1,a55475b1,16.0,16.0,-214.0,0.0,3
1325574.0,12,2.0,3.0,9.0,,5418.2000,4995.2,3502.4001,11823.1430,7.0,...,96a8fdfe,,a55475b1,a55475b1,a55475b1,25.0,22.0,,0.0,1
2541088.0,5,0.0,0.0,0.0,,3833.0000,0.0,3191.4001,,8.0,...,a55475b1,,a55475b1,a55475b1,a55475b1,56.0,59.0,,0.0,5
2633267.0,41,3.0,0.0,,DEDUCTION_6,5126.4000,3199.0,3199.0000,100732.2000,9.0,...,60c73645,0.0,a55475b1,a55475b1,a55475b1,48.0,46.0,-302.0,0.0,6


# Preprocessing

In [24]:
def pipeline_preprocess(
    data: pd.DataFrame, is_predict: bool = False, **kwargs
) -> pd.DataFrame:
    """
    Preprocess dataset. Drop columns, set index, fill None for categorical
    features and transform the configured numeric columns.

    The input frame is left untouched: all changes are made on a copy, so
    calling the function again on the same input gives the same result.

    Parameters
    ----------
    data: pd.DataFrame
        Dataset

    is_predict: bool, default = False
        True when the data is prepared for prediction. In that mode the
        group column is dropped (instead of setting the index) and, if both
        reference JSON files exist, the feature set is validated with
        check_columns.

    **kwargs
        Preprocessing settings. Keys read here: "group_col" (predict mode),
        "index_col" (non-predict mode), "drop_columns", "transform_cols",
        "uniq_cat_values_path" and "min_max_num_values_path" (predict mode).
        All of them are forwarded to check_columns.

    Returns
    -------
    data: pd.DataFrame
        preprocessed dataset (a new object, not the input frame)
    """
    # Work on a copy so the caller's frame is never modified.
    data = data.copy()

    if is_predict:
        data = data.drop(columns=kwargs["group_col"], errors="ignore")
    else:
        data = data.set_index(kwargs["index_col"])
    data = data.drop(columns=kwargs["drop_columns"], errors="ignore")
    cat_features = data.select_dtypes(exclude=np.number).columns.tolist()

    if (
        is_predict
        and Path(kwargs["uniq_cat_values_path"]).exists()
        and Path(kwargs["min_max_num_values_path"]).exists()
    ):
        check_columns(data=data, **kwargs)

    # Skip the imputer entirely when there are no categorical columns, so an
    # empty column selection is never handed to it.
    if cat_features:
        imputer_cat = SimpleImputer(
            missing_values=None, strategy="constant", fill_value="None"
        )
        data[cat_features] = imputer_cat.fit_transform(data[cat_features])
        data[cat_features] = data[cat_features].astype("category")

    # Positive values become 0, non-positive values are negated; NaN stays
    # NaN. Negate-then-clip is the vectorised form of that rule.
    for col in kwargs["transform_cols"]:
        data[col] = (-data[col]).clip(lower=0)

    return data

In [25]:
def check_columns(data: pd.DataFrame, **kwargs):
    """
    Check that the dataset has the same feature set as the trained model.

    The expected categorical and numeric feature names are the keys of the
    two JSON files referenced in kwargs. The target and group columns are
    not treated as features and are ignored whatever their dtype is.

    Parameters
    ----------
    data: pd.DataFrame
        Dataset

    **kwargs
        Keys read here: "target_col", "group_col", "uniq_cat_values_path"
        and "min_max_num_values_path".

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If the categorical or numeric feature names differ from the keys
        stored in the JSON files.
    """
    # Filter by name instead of list.remove(): remove() raises ValueError
    # when the column exists in the frame but is not numeric.
    service_cols = (kwargs["target_col"], kwargs["group_col"])
    cat_features = [
        col
        for col in data.select_dtypes(exclude=np.number).columns
        if col not in service_cols
    ]
    num_features = [
        col
        for col in data.select_dtypes(include=np.number).columns
        if col not in service_cols
    ]

    with open(kwargs["uniq_cat_values_path"]) as json_file:
        uniq_cat_values = json.load(json_file)
    with open(kwargs["min_max_num_values_path"]) as json_file:
        min_max_num_values = json.load(json_file)

    cat_diff = set(uniq_cat_values.keys()) ^ set(cat_features)
    num_diff = set(min_max_num_values.keys()) ^ set(num_features)
    if cat_diff or num_diff:
        # Raised explicitly (not via `assert`) so the check also runs under
        # `python -O`; the exception type callers see is unchanged.
        mismatched = sorted(cat_diff | num_diff, key=str)
        raise AssertionError(f"Different features set: {mismatched}")

In [36]:
# Preprocess in prediction mode. A copy of the loaded frame is passed in so
# that data_test keeps its raw values and this cell can be re-run safely:
# applying the sign transform of transform_cols a second time to values that
# were already transformed would turn them into zeros.
data_proc = pipeline_preprocess(data_test.copy(), is_predict=True, **preproc)
data_proc

Unnamed: 0_level_0,days30_165L,maritalst_385M,pmtscount_423L,requesttype_4525192L,annuity_780A,annuitynextmonth_57A,avginstallast24m_3658937A,avgoutstandbalancel6m_4187114A,cntincpaycont9m_3716944L,cntpmts24_3658933L,...,purposeofcred_active,residualamount_closed,subjectrole_active,subjectrole_closed,empladdr,num_total_inst,num_total_paid_inst,cred_closure_date_days_ago,actualdpd,sellerplace_cnt
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1020556.0,1.0,0.0,,,2079.8000,0.0,,,,,...,a55475b1,,a55475b1,a55475b1,a55475b1,0.0,,-1891.0,,0
1323846.0,0.0,2.0,17.0,,2417.6000,1115.4,1115.4000,17096.5530,3.0,5.0,...,a55475b1,,a55475b1,a55475b1,a55475b1,26.0,8.0,,0.0,0
791835.0,0.0,2.0,0.0,DEDUCTION_6,3195.4001,0.0,,,,,...,a55475b1,,a55475b1,a55475b1,P8_43_166,0.0,,,,0
1942234.0,0.0,2.0,,,5585.2000,12626.2,9651.0000,92430.6500,26.0,24.0,...,a55475b1,,a55475b1,a55475b1,a55475b1,74.0,74.0,-3.0,0.0,4
1719341.0,0.0,0.0,,DEDUCTION_6,2019.0000,0.0,2987.6000,,0.0,9.0,...,a55475b1,0.0,a55475b1,a55475b1,a55475b1,27.0,30.0,-61.0,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1860994.0,1.0,2.0,,,8695.4000,0.0,13655.4000,-4085.4001,3.0,14.0,...,60c73645,0.0,a55475b1,a55475b1,a55475b1,16.0,16.0,-214.0,0.0,3
1325574.0,2.0,3.0,9.0,,5418.2000,4995.2,3502.4001,11823.1430,7.0,17.0,...,96a8fdfe,,a55475b1,a55475b1,a55475b1,25.0,22.0,,0.0,1
2541088.0,0.0,0.0,0.0,,3833.0000,0.0,3191.4001,,8.0,9.0,...,a55475b1,,a55475b1,a55475b1,a55475b1,56.0,59.0,,0.0,5
2633267.0,3.0,0.0,,DEDUCTION_6,5126.4000,3199.0,3199.0000,100732.2000,9.0,19.0,...,60c73645,0.0,a55475b1,a55475b1,a55475b1,48.0,46.0,-302.0,0.0,6


In [29]:
# Summarise the preprocessed frame: index, number of columns, dtypes and
# memory usage.
data_proc.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 1000 entries, 1020556.0 to 1298412.0
Columns: 188 entries, days30_165L to sellerplace_cnt
dtypes: category(48), float64(130), int32(9), int8(1)
memory usage: 1.1 MB


# Predict

In [37]:
# NOTE(review): joblib.load unpickles the file, so only load model files from
# a trusted location.
model = joblib.load(config["train"]["tuned_model_path"])

# Feed the model the feature columns only. "target"/"score" are excluded so
# that (a) predict_proba never receives the freshly predicted label as an
# extra input column and (b) the cell can be re-run after data_proc already
# holds the prediction columns.
feature_cols = [col for col in data_proc.columns if col not in ("target", "score")]
features = data_proc[feature_cols]

# Predicted class and probability of class 1, placed in front of the features.
data_proc = features.assign(
    target=model.predict(features),
    score=model.predict_proba(features)[:, 1],
)[["target", "score", *feature_cols]]

In [38]:
# Display the result: the prediction cell puts the target and score columns
# in front of the feature columns.
data_proc

Unnamed: 0_level_0,target,score,days30_165L,maritalst_385M,pmtscount_423L,requesttype_4525192L,annuity_780A,annuitynextmonth_57A,avginstallast24m_3658937A,avgoutstandbalancel6m_4187114A,...,purposeofcred_active,residualamount_closed,subjectrole_active,subjectrole_closed,empladdr,num_total_inst,num_total_paid_inst,cred_closure_date_days_ago,actualdpd,sellerplace_cnt
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1020556.0,0,0.045198,1.0,0.0,,,2079.8000,0.0,,,...,a55475b1,,a55475b1,a55475b1,a55475b1,0.0,,-1891.0,,0
1323846.0,1,0.713520,0.0,2.0,17.0,,2417.6000,1115.4,1115.4000,17096.5530,...,a55475b1,,a55475b1,a55475b1,a55475b1,26.0,8.0,,0.0,0
791835.0,1,0.516494,0.0,2.0,0.0,DEDUCTION_6,3195.4001,0.0,,,...,a55475b1,,a55475b1,a55475b1,P8_43_166,0.0,,,,0
1942234.0,0,0.102280,0.0,2.0,,,5585.2000,12626.2,9651.0000,92430.6500,...,a55475b1,,a55475b1,a55475b1,a55475b1,74.0,74.0,-3.0,0.0,4
1719341.0,0,0.046583,0.0,0.0,,DEDUCTION_6,2019.0000,0.0,2987.6000,,...,a55475b1,0.0,a55475b1,a55475b1,a55475b1,27.0,30.0,-61.0,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1860994.0,1,0.556371,1.0,2.0,,,8695.4000,0.0,13655.4000,-4085.4001,...,60c73645,0.0,a55475b1,a55475b1,a55475b1,16.0,16.0,-214.0,0.0,3
1325574.0,0,0.122775,2.0,3.0,9.0,,5418.2000,4995.2,3502.4001,11823.1430,...,96a8fdfe,,a55475b1,a55475b1,a55475b1,25.0,22.0,,0.0,1
2541088.0,0,0.064286,0.0,0.0,0.0,,3833.0000,0.0,3191.4001,,...,a55475b1,,a55475b1,a55475b1,a55475b1,56.0,59.0,,0.0,5
2633267.0,1,0.620273,3.0,0.0,,DEDUCTION_6,5126.4000,3199.0,3199.0000,100732.2000,...,60c73645,0.0,a55475b1,a55475b1,a55475b1,48.0,46.0,-302.0,0.0,6


# Вывод

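В ноутбуке показана процедура получения предсказаний для датасета, загруженного из файла: предобработка данных, загрузка обученной модели и расчёт предсказанного класса (`target`) и вероятности положительного класса (`score`).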