In [234]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
import warnings

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_selector
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from fancyimpute import KNN
from sklearn.impute import KNNImputer
from fancyimpute import IterativeImputer


from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn import metrics


In [235]:
data = pd.read_csv('../raw_data/dataset.csv',sep=';')
data = data.copy()
data.head()

Unnamed: 0,uuid,default,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,account_incoming_debt_vs_paid_0_24m,account_status,account_worst_status_0_3m,account_worst_status_12_24m,...,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,status_max_archived_0_12_months,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours,worst_status_active_inv
0,63f69b2c-8b1c-4740-b78d-52ed9a4515ac,0.0,0,0.0,0.0,0.0,0.0,1.0,1.0,,...,1,1,1,1,0,0,0,178839,9.653333,1.0
1,0e961183-8c15-4470-9a5e-07a1bd207661,0.0,0,0.0,0.0,0.0,,1.0,1.0,1.0,...,1,1,2,2,0,0,0,49014,13.181389,
2,d8edaae6-4368-44e0-941e-8328f203e64e,0.0,0,0.0,0.0,0.0,,,,,...,1,1,2,2,0,0,0,124839,11.561944,1.0
3,0095dfb6-a886-4e2a-b056-15ef45fdb0ef,0.0,0,,,,,,,,...,1,1,1,1,0,0,0,324676,15.751111,1.0
4,c8f8b835-5647-4506-bf15-49105d8af30b,0.0,0,0.0,0.0,0.0,,,,,...,0,1,1,1,0,0,0,7100,12.698611,


In [236]:
data.shape

(99976, 43)

## 1. Preprocessing workflow 

 ### 1.2. Missing values 

In [237]:
# Missing data percentage
# round((data.isnull().sum()/len(data)).sort_values(ascending=False),2)

#### we do have a few features having a high percetange of missing values 
#### Let's investigate the meaning of these features and the besty way to deal with mising values

In [238]:
# unique values in this feature
data['worst_status_active_inv'].unique()

array([ 1., nan,  2.,  3.])

In [239]:
# account_worst_status_12_24m
data['account_worst_status_12_24m'].unique()

array([nan,  1.,  2.,  3.,  4.])

In [240]:
data['account_worst_status_12_24m'].isna().sum() / len(data['account_worst_status_12_24m'])

0.6677702648635673

In [241]:
# let's drop this worst_status_active_inv feature it has a too high missing values 
# del data['worst_status_active_inv']


##  Handling Missing Data with Imputation

## Preprocessing
### Pipeline 

### a) Ordinal Encoding 
#### Explanation - Ordinal Encoding or One-Hot-Encoding: <https://stackoverflow.com/questions/69052776/ordinal-encoding-or-one-hot-encoding>

In [242]:
 # converting these columns to "object" type
list_float_to_obj = ["worst_status_active_inv", "account_status","account_worst_status_0_3m",
                     "account_worst_status_12_24m", "account_worst_status_3_6m", "account_worst_status_6_12m",
                     "status_last_archived_0_24m", "status_2nd_last_archived_0_24m","status_3rd_last_archived_0_24m",
                     "status_max_archived_0_6_months","status_max_archived_0_12_months","status_max_archived_0_24_months",
                     "has_paid"]

_ = [data.__setitem__(feature, data[feature].astype("object")) for feature in list_float_to_obj]


In [248]:
data['has_paid'].unique()

array([True, False], dtype=object)

In [243]:
data.select_dtypes('object').columns    

Index(['uuid', 'account_status', 'account_worst_status_0_3m',
       'account_worst_status_12_24m', 'account_worst_status_3_6m',
       'account_worst_status_6_12m', 'merchant_category', 'merchant_group',
       'has_paid', 'name_in_email', 'status_last_archived_0_24m',
       'status_2nd_last_archived_0_24m', 'status_3rd_last_archived_0_24m',
       'status_max_archived_0_6_months', 'status_max_archived_0_12_months',
       'status_max_archived_0_24_months', 'worst_status_active_inv'],
      dtype='object')

In [244]:
feat_ordinal_dict = {
    # considers "missing" as "neutral"
    "account_status": ['missing', 1.0, 2.0, 3.0],
    "account_worst_status_0_3m": ['missing', 1.0, 2.0, 3.0, 4.0],
    "account_worst_status_12_24m": ['missing', 1.0, 2.0, 3.0, 4.0],
    "account_worst_status_3_6m": ['missing', 1.0, 2.0, 3.0, 4.0],
    "account_worst_status_6_12m": ['missing', 1.0, 2.0, 3.0, 4.0],
    "has_paid": ['missing', 'True'],
    "status_last_archived_0_24m": ['missing', 1, 2, 3, 0],
    "status_2nd_last_archived_0_24m": ['missing', 'Fa', 'TA', 'Gd', 'Ex'],
    "status_3rd_last_archived_0_24m": ['missing', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],
    "status_max_archived_0_6_months": ['missing', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    "status_max_archived_0_12_months": ['missing', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
    "status_max_archived_0_24_months": ['missing', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    "worst_status_active_inv": ['missing', 'Unf', 'RFn', 'Fin']
}

feat_ordinal = sorted(feat_ordinal_dict.keys()) # sort alphabetically
feat_ordinal_values_sorted = [feat_ordinal_dict[i] for i in feat_ordinal]

encoder_ordinal = OrdinalEncoder(
    categories=feat_ordinal_values_sorted,
    dtype= np.int64,
    handle_unknown="use_encoded_value",
    unknown_value=-1 # Considers unknown values as worse than "missing"
)

preproc_ordinal = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    encoder_ordinal,
    MinMaxScaler()
)

preproc_ordinal

NameError: name 'OrdinalEncoder' is not defined