In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
import warnings

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_selector
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from fancyimpute import KNN
from sklearn.impute import KNNImputer
from fancyimpute import IterativeImputer


from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn import metrics

from tempfile import mkdtemp
from shutil import rmtree


In [2]:
from scipy import stats

# sklearn preproc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import make_scorer

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold

In [3]:
# Read the semicolon-separated raw dataset into a fresh frame and preview it.
# NOTE(review): the CSV's leading unnamed column is loaded as "Unnamed: 0".
data = pd.read_csv('../raw_data/dataset.csv', sep=';').copy()
data.head()

Unnamed: 0,uuid,default,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,account_incoming_debt_vs_paid_0_24m,account_status,account_worst_status_0_3m,account_worst_status_12_24m,...,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,status_max_archived_0_12_months,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours,worst_status_active_inv
0,63f69b2c-8b1c-4740-b78d-52ed9a4515ac,0.0,0,0.0,0.0,0.0,0.0,1.0,1.0,,...,1,1,1,1,0,0,0,178839,9.653333,1.0
1,0e961183-8c15-4470-9a5e-07a1bd207661,0.0,0,0.0,0.0,0.0,,1.0,1.0,1.0,...,1,1,2,2,0,0,0,49014,13.181389,
2,d8edaae6-4368-44e0-941e-8328f203e64e,0.0,0,0.0,0.0,0.0,,,,,...,1,1,2,2,0,0,0,124839,11.561944,1.0
3,0095dfb6-a886-4e2a-b056-15ef45fdb0ef,0.0,0,,,,,,,,...,1,1,1,1,0,0,0,324676,15.751111,1.0
4,c8f8b835-5647-4506-bf15-49105d8af30b,0.0,0,0.0,0.0,0.0,,,,,...,0,1,1,1,0,0,0,7100,12.698611,


In [4]:
# Dimensions of the raw frame as (number of rows, number of columns).
data.shape

(99976, 43)

## 1. Preprocessing workflow 

 ### 1.2. Missing values 

In [5]:
# Missing data percentage
# round((data.isnull().sum()/len(data)).sort_values(ascending=False),2)

##  Handling Missing Data with Imputation

## Preprocessing
### Pipeline 

### a) Ordinal Encoding 
#### Explanation - [Ordinal Encoding or One-Hot-Encoding](https://stackoverflow.com/questions/69052776/ordinal-encoding-or-one-hot-encoding)

In [6]:
# Split the raw frame into identifiers, target and features.
# Only rows with a known "default" label are kept, so that the identifiers,
# the target and the feature matrix stay aligned row by row (dropping NaN
# from the target alone would leave it shorter than the features).
labelled_rows = data['default'].notna()

# "uuid" identifies a row; it is not a feature
data_id = data.loc[labelled_rows, 'uuid']
# label to predict
target = data.loc[labelled_rows, 'default']
# features: everything except the identifier, the label and the CSV row
# counter ("Unnamed: 0"), which carries no information about the customer
data = data.loc[labelled_rows].drop(
    columns=['uuid', 'default', 'Unnamed: 0'], errors='ignore'
)
# isna and isnull do not recognise the literal string "nan": map it to a real
# missing value (np.nan; the np.NaN alias no longer exists in NumPy 2.0)
data = data.replace('nan', np.nan)


In [7]:
 # converting these columns to "object" type
# Status-like columns hold category codes rather than quantities. Casting them
# to "object" keeps them out of the int64/float64 numerical selection.
list_float_to_obj = ["worst_status_active_inv", "account_status","account_worst_status_0_3m",
                     "account_worst_status_12_24m", "account_worst_status_3_6m", "account_worst_status_6_12m",
                     "status_last_archived_0_24m", "status_2nd_last_archived_0_24m","status_3rd_last_archived_0_24m",
                     "status_max_archived_0_6_months","status_max_archived_0_12_months","status_max_archived_0_24_months",
                     "has_paid"]

# One cast for all listed columns instead of a comprehension run for its side effects
data = data.astype({feature: "object" for feature in list_float_to_obj})

# pandas parses True/False tokens as booleans, and a boolean never equals the
# string levels 'True'/'False' declared for has_paid in feat_ordinal_dict.
# Normalise the non-missing values to strings so the levels match; missing
# values are left untouched for the imputer.
data["has_paid"] = data["has_paid"].map(
    lambda value: value if pd.isna(value) else str(value)
)


In [8]:
# Ordered levels of every ordinal feature: the position of a level in its list
# is the integer it is encoded to.
feat_ordinal_dict = {
    # "missing" comes first, i.e. it is considered "neutral"
    **{
        name: ['missing', 1.0, 2.0, 3.0, 4.0]
        for name in (
            "account_status",
            "account_worst_status_0_3m",
            "account_worst_status_12_24m",
            "account_worst_status_3_6m",
            "account_worst_status_6_12m",
        )
    },
    # NOTE(review): these levels are strings; confirm has_paid holds strings
    # (not booleans) at this point, otherwise every value is encoded as unknown.
    "has_paid": ['True', 'False'],
    **{
        name: [1, 0, 2, 3, 5]
        for name in (
            "status_last_archived_0_24m",
            "status_2nd_last_archived_0_24m",
            "status_3rd_last_archived_0_24m",
        )
    },
    "status_max_archived_0_6_months": [1, 0, 2, 3],
    **{
        name: [1, 2, 0, 3, 5]
        for name in (
            "status_max_archived_0_12_months",
            "status_max_archived_0_24_months",
        )
    },
    "worst_status_active_inv": ['missing', 1.0, 2.0, 3.0],
}

# Feature names in alphabetical order, with their level lists in the same order
feat_ordinal = sorted(feat_ordinal_dict)
feat_ordinal_values_sorted = list(map(feat_ordinal_dict.get, feat_ordinal))

# Levels outside the declared lists are encoded as -1, i.e. below "missing"
encoder_ordinal = OrdinalEncoder(
    categories=feat_ordinal_values_sorted,
    dtype=np.int64,
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

# Fill gaps with the "missing" level, encode to integer ranks, then rescale.
# The scaling step may be revisited later.
preproc_ordinal = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    encoder_ordinal,
    MinMaxScaler(),
)

preproc_ordinal

In [9]:
# Numerical features: every int64/float64 column, listed alphabetically
feat_numerical = sorted(
    data.select_dtypes(include=["int64", "float64"]).columns.tolist()
)

# Impute missing values from the nearest neighbours, then rescale
preproc_numerical = make_pipeline(KNNImputer(), MinMaxScaler())

In [10]:
# Nominal features are whatever is left once the numerical and the ordinal
# columns are taken out; they are the ones that get one-hot encoded.
feat_nominal = sorted(set(data.columns).difference(feat_numerical, feat_ordinal))

# Fill gaps with the most frequent level, then one-hot encode
# (levels unseen during fit are ignored at transform time)
preproc_nominal = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

In [11]:
# Pair each sub-pipeline with the columns it handles; any column that is not
# listed in one of the three groups is dropped.
preproc = make_column_transformer(
    *zip(
        (preproc_numerical, preproc_ordinal, preproc_nominal),
        (feat_numerical, feat_ordinal, feat_nominal),
    ),
    remainder="drop",
)

preproc

In [12]:
# Fit and transform in a single pass: fit() followed by transform() would run
# every step (KNN imputation included) twice over the same rows.
data_transformed = preproc.fit_transform(data)
# The one-hot branch can make the transformer return a scipy sparse matrix,
# which pd.DataFrame cannot lay out column by column: densify it first.
if hasattr(data_transformed, "toarray"):
    data_transformed = data_transformed.toarray()
# Get column names after preprocessing
column_names = preproc.get_feature_names_out()
# Create new DataFrame with transformed data and column names
data_preproc = pd.DataFrame(data_transformed, columns=column_names, dtype=np.float16)

KeyboardInterrupt: 

### b) Statistical Feature Selection

#### Removing feature with low variance 
#### VarianceThreshold is a simple baseline approach to feature selection. It removes all features whose variance doesn’t meet some threshold. By default, it removes all zero-variance features, i.e. features that have the same value in all samples. 
### Here it would be interesting to have a cutoff value tailored to each feature. E.g. "has_paid" is a boolean feature taking "True" or "False" values, so it can be modelled as Bernoulli with $\mathrm{Var}[X] = p(1-p)$. The other categorical features could be modelled as Binomial. For parsimony we are not differentiating by feature. 
#### This feature selection algorithm looks only at the features (X), not the desired outputs (y), and can thus be used for unsupervised learning. 
#### There are other feature selection methods - [Feature Selection](https://scikit-learn.org/stable/modules/feature_selection.html#variance-threshold)

In [None]:
def _preprocess_features(X):
    """Fit the column transformer on ``X`` and return the transformed feature matrix."""
    preproc_transformer = make_column_transformer(
        (preproc_numerical, feat_numerical),
        (preproc_ordinal, feat_ordinal),
        (preproc_nominal, feat_nominal),
        remainder="drop")
    return preproc_transformer.fit_transform(X)


def number_feature_remaining(cutoff=0, X_transformed=None):
    """Return how many preprocessed features VarianceThreshold(cutoff) keeps.

    Parameters
    ----------
    cutoff : float, default=0
        Variance threshold handed to ``VarianceThreshold``.
    X_transformed : array-like or sparse matrix, optional
        Feature matrix that has already gone through the column transformer.
        When omitted, ``data`` is preprocessed on the fly.

    Returns
    -------
    int
        Number of columns left after the variance filter.
    """
    if X_transformed is None:
        X_transformed = _preprocess_features(data)
    return VarianceThreshold(cutoff).fit_transform(X_transformed).shape[1]


cutoff_values = np.arange(0, 0.2, 0.01)
# The preprocessing does not depend on the cutoff: run it once and only repeat
# the cheap variance filter for each candidate value.
X_preprocessed = _preprocess_features(data)
remaining_counts = [number_feature_remaining(t, X_preprocessed) for t in cutoff_values]

plt.plot(cutoff_values, remaining_counts, marker='x')
plt.xlabel("chosen feature variance cutoff values")
plt.ylabel("number of features kept")
plt.title("Number of Feature Remaining");

In [14]:
# Final preprocessing pipeline: column-wise preprocessing followed by a
# variance-based feature filter.

# Numerical branch: nearest-neighbour imputation, then rescaling
preproc_numerical = make_pipeline(KNNImputer(), MinMaxScaler())

# Ordinal branch: levels outside the declared lists are encoded as -1,
# i.e. below "missing"
encoder_ordinal = OrdinalEncoder(
    categories=feat_ordinal_values_sorted,
    dtype=np.int64,
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)
preproc_ordinal = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    encoder_ordinal,
    MinMaxScaler(),
)

# Numerical columns are picked by dtype, the other two groups by name;
# anything not covered is dropped.
preproc_transformer = make_column_transformer(
    (preproc_numerical, make_column_selector(dtype_include=["int64", "float64"])),
    (preproc_ordinal, feat_ordinal),
    (preproc_nominal, feat_nominal),
    remainder="drop",
)

# A threshold of 0 filters on zero variance only
preproc_selector = VarianceThreshold(threshold=0)

preproc = make_pipeline(preproc_transformer, preproc_selector)
preproc

In [None]:
# Check shape
# fit_transform preprocesses the rows once; fit() followed by transform()
# would run every step (KNN imputation included) twice over the same data.
data_selected = preproc.fit_transform(data, target)
# fit_transform fits the pipeline in place, so this is the same object that
# fit() would have returned
preproc_fitted = preproc
data_selected.shape


In [None]:

# Temporary directory intended for caching the preprocessing step of the pipeline
cachedir = mkdtemp()

# Switch for the grid-search cells further down: set it to True to run them
allow_grid_searching = False

In [None]:
# Define numerical features once and for all
# I have to define X according to my dataset (done)
# 1. remove the target (done)
# 2. check the "int64" and "float64" dtypes (done)
# 3. get a DataFrame with the category names (done)
    ## add:
    # # Get column names after preprocessing
    # column_names = preproc.fit(data).get_feature_names_out()
    # # Create new DataFrame with transformed data and column names
    # data_preproc = pd.DataFrame(preproc_baseline.transform(data), columns=column_names, dtype=np.float16)
    # naming updated.
# 4. remove everything else and keep only the final pipeline
# 5. review the cutoff; a larger value may be interesting
# 6. look further into y_log and check whether it follows a normal distribution (done)



In [None]:
# next step: look into rmse and rmse_neg, and check whether they are valid for my case, since this is a classification prediction
# goal: go through all the steps, apply them, look at the results and LEARN!