In [1]:
import atexit
import warnings
from shutil import rmtree
from tempfile import mkdtemp

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

In [26]:
from scipy import stats
from scipy.stats import uniform

# sklearn preproc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import make_scorer

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold




In [3]:
# Load the raw dataset. The file is semicolon-delimited, so the separator has
# to be given explicitly. read_csv already returns a fresh DataFrame, so no
# extra defensive copy is needed.
data = pd.read_csv('../raw_data/dataset.csv', sep=';')
data.head()

Unnamed: 0,uuid,default,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,account_incoming_debt_vs_paid_0_24m,account_status,account_worst_status_0_3m,account_worst_status_12_24m,...,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,status_max_archived_0_12_months,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours,worst_status_active_inv
0,63f69b2c-8b1c-4740-b78d-52ed9a4515ac,0.0,0,0.0,0.0,0.0,0.0,1.0,1.0,,...,1,1,1,1,0,0,0,178839,9.653333,1.0
1,0e961183-8c15-4470-9a5e-07a1bd207661,0.0,0,0.0,0.0,0.0,,1.0,1.0,1.0,...,1,1,2,2,0,0,0,49014,13.181389,
2,d8edaae6-4368-44e0-941e-8328f203e64e,0.0,0,0.0,0.0,0.0,,,,,...,1,1,2,2,0,0,0,124839,11.561944,1.0
3,0095dfb6-a886-4e2a-b056-15ef45fdb0ef,0.0,0,,,,,,,,...,1,1,1,1,0,0,0,324676,15.751111,1.0
4,c8f8b835-5647-4506-bf15-49105d8af30b,0.0,0,0.0,0.0,0.0,,,,,...,0,1,1,1,0,0,0,7100,12.698611,


In [4]:
data.shape

(99976, 43)

## 1. Preprocessing workflow 

 ### 1.2. Missing values 

In [None]:
# Missing data percentage
# round((data.isnull().sum()/len(data)).sort_values(ascending=False),2)

##  Handling Missing Data with Imputation

## Preprocessing
### Pipeline 

### a) Ordinal Encoding 
#### Explanation - [Ordinal Encoding or One-Hot-Encoding](https://stackoverflow.com/questions/69052776/ordinal-encoding-or-one-hot-encoding)

In [5]:
# Split the raw frame into identifiers, target and feature matrix.
# NOTE: this cell rebinds `data`, so the loading cell has to be re-run before
# this cell can be executed a second time.

# Identifiers of every row of the raw frame (rows with a missing target included).
data_id = data['uuid']

# Target: only the rows where "default" is known.
target = data['default'].dropna()

# Features: keep the rows that have a target, drop the identifier and the
# target columns, and turn the literal string 'nan' into a real missing value
# (isna/isnull do not recognise the string).
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data = (
    data
    .dropna(subset=['default'], axis=0)
    .drop(columns=['uuid', 'default'])
    .replace('nan', np.nan)
)


In [6]:
 # converting these columns to "object" type
# Columns that hold status codes / flags rather than quantities. Casting them
# to "object" keeps them out of the numerical features, which are selected by
# int64/float64 dtype further down.
list_float_to_obj = ["worst_status_active_inv", "account_status","account_worst_status_0_3m",
                     "account_worst_status_12_24m", "account_worst_status_3_6m", "account_worst_status_6_12m",
                     "status_last_archived_0_24m", "status_2nd_last_archived_0_24m","status_3rd_last_archived_0_24m",
                     "status_max_archived_0_6_months","status_max_archived_0_12_months","status_max_archived_0_24_months",
                     "has_paid"]

# One astype call with a column -> dtype mapping instead of a list
# comprehension run for its side effects (which also overwrote `_`).
data = data.astype({feature: "object" for feature in list_float_to_obj})


In [7]:
# Ordered category lists for every ordinal feature. The position of a value in
# its list is the integer code the ordinal encoder assigns to it.
# For the account-status style features "missing" comes first, i.e. a missing
# value is considered "neutral".
# NOTE(review): the has_paid categories are the strings 'True'/'False'. If the
# column actually holds booleans they would not match and every value would be
# treated as an unknown category - confirm against the data.
feat_ordinal_dict = dict(
    account_status=['missing', 1.0, 2.0, 3.0, 4.0],
    account_worst_status_0_3m=['missing', 1.0, 2.0, 3.0, 4.0],
    account_worst_status_12_24m=['missing', 1.0, 2.0, 3.0, 4.0],
    account_worst_status_3_6m=['missing', 1.0, 2.0, 3.0, 4.0],
    account_worst_status_6_12m=['missing', 1.0, 2.0, 3.0, 4.0],
    has_paid=['True', 'False'],
    status_last_archived_0_24m=[1, 0, 2, 3, 5],
    status_2nd_last_archived_0_24m=[1, 0, 2, 3, 5],
    status_3rd_last_archived_0_24m=[1, 0, 2, 3, 5],
    status_max_archived_0_6_months=[1, 0, 2, 3],
    status_max_archived_0_12_months=[1, 2, 0, 3, 5],
    status_max_archived_0_24_months=[1, 2, 0, 3, 5],
    worst_status_active_inv=['missing', 1.0, 2.0, 3.0],
)

# Feature names in alphabetical order ...
feat_ordinal = list(feat_ordinal_dict)
feat_ordinal.sort()
# ... and their category lists in that same order.
feat_ordinal_values_sorted = [feat_ordinal_dict[name] for name in feat_ordinal]


### b) Statistical Feature Selection

#### Removing feature with low variance 
#### VarianceThreshold is a simple baseline approach to feature selection. It removes all features whose variance doesn't meet some threshold. By default, it removes all zero-variance features, i.e. features that have the same value in all samples.
### Here it would be interesting to have a cutoff value tailored to each feature. E.g. "has_paid" is a boolean feature taking the values "True" or "False", so it can be modelled as Bernoulli with $\mathrm{Var}[X] = p(1-p)$. The other categorical features could be modelled as Binomial. For parsimony we're not differentiating by feature.
#### This feature selection algorithm looks only at the features (X), not the desired outputs (y), and can thus be used for unsupervised learning. 
#### There are other feature selection methods - see [Feature Selection](https://scikit-learn.org/stable/modules/feature_selection.html#variance-threshold)

In [None]:
# def number_feature_remaining(cutoff=0):
#     preproc_transformer = make_column_transformer(
#         (preproc_numerical, feat_numerical),
#         (preproc_ordinal, feat_ordinal),
#         (preproc_nominal, feat_nominal),
#         remainder="drop")

#     preproc_selector = VarianceThreshold(cutoff)

#     preproc = make_pipeline(
#         preproc_transformer,
#         preproc_selector)

#     return preproc.fit_transform(X_train).shape[1]

# cutoff_values = np.arange(0, 0.2, 0.01)
# plt.plot(cutoff_values, [number_feature_remaining(t) for t in cutoff_values], marker='x')
# plt.xlabel("chosen feature variance cutoff values")
# plt.title("Number of Feature Remaining");

In [8]:
# Numerical features: every column whose dtype is int64 or float64,
# listed in alphabetical order.
feat_numerical = data.select_dtypes(include=["int64", "float64"]).columns.tolist()
feat_numerical.sort()

In [9]:
# Nominal features are whatever is left once the numerical and the ordinal
# features have been taken out; they get one-hot encoded.
feat_nominal = sorted(set(data.columns).difference(feat_numerical, feat_ordinal))

# Fill missing values with the most frequent category, then one-hot encode.
# Categories not seen during fit are ignored at transform time.
nominal_imputer = SimpleImputer(strategy="most_frequent")
nominal_encoder = OneHotEncoder(handle_unknown="ignore")
preproc_nominal = make_pipeline(nominal_imputer, nominal_encoder)

In [10]:
from sklearn.feature_selection import SelectPercentile, mutual_info_regression

In [11]:
# Final version of the preprocessing pipeline.

# Ordinal features: missing values become the explicit category "missing",
# categories are mapped to integer codes following the order given in
# feat_ordinal_values_sorted, and the codes are rescaled with MinMaxScaler.
encoder_ordinal = OrdinalEncoder(
    categories=feat_ordinal_values_sorted,
    dtype=np.int64,
    handle_unknown="use_encoded_value",
    unknown_value=-1,  # considers unknown values as worse than "missing"
)

preproc_ordinal = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    encoder_ordinal,
    MinMaxScaler(),
)

# Numerical features: KNN imputation of missing values, then min-max scaling.
preproc_numerical = make_pipeline(
    KNNImputer(),
    MinMaxScaler(),
)

# Route each group of columns to its own sub-pipeline; any column that is not
# numerical, ordinal or nominal is dropped.
preproc_transformer = make_column_transformer(
    (preproc_numerical, make_column_selector(dtype_include=["int64", "float64"])),
    (preproc_ordinal, feat_ordinal),
    (preproc_nominal, feat_nominal),
    remainder="drop",
)

# Univariate feature selection based on mutual information with the target.
# The target ("default") is a class label - every model in this notebook is a
# classifier - so the classification scorer is used, not the regression one.
preproc_selector = SelectPercentile(
    mutual_info_classif,
    percentile=50,  # keep only the best-scoring 50% of the features
)

# preproc_selector = VarianceThreshold(0)

preproc = make_pipeline(
    preproc_transformer,
    preproc_selector,
)
preproc

In [None]:
# # pd.DataFrame(preproc.fit_transform(data,target)).head()
# # Get column names after preprocessing
# column_names = preproc.fit(data).get_feature_names_out()
# # Create new DataFrame with transformed data and column names
# data_preproc = pd.DataFrame(preproc.transform(data), columns=column_names, dtype=np.float16)

In [12]:
# Hold out a test set. random_state is fixed at 42 so that every model is
# evaluated on the same split and their performances can be compared.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    random_state=42,
)
# An alternative worth trying (30% test set):
# X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.30, random_state=42)

In [13]:
# Sanity check: fit the preprocessing pipeline on the training set and look at
# the shape of the transformed training data.
# Recorded running time of this cell: 16m38s
preproc_fitted = preproc.fit(X_train, y_train)
X_train_preproc = preproc_fitted.transform(X_train)
X_train_preproc.shape


(67482, 57)

In [14]:
# Cache directory for the preprocessing step of the pipelines below.
# Passing memory=cachedir to a pipeline lets it store the fitted preprocessing
# on disk, which avoids recomputing the same preprocessing step repeatedly.
# mkdtemp (from the tempfile module) creates the directory and returns its path.
cachedir = mkdtemp()

# mkdtemp never deletes the directory by itself, so register rmtree (from the
# shutil module) to remove it when the kernel / interpreter shuts down.
# ignore_errors=True keeps the exit quiet if the directory is already gone.
atexit.register(rmtree, cachedir, ignore_errors=True)

allow_grid_searching = False  # use True to activate grid search in the notebook cells below


In [None]:
# Next step: look at rmse and rmse_neg. Are they valid in my case, given that I have a classification prediction?
# Goal: go through all the steps, apply them, look at the results and LEARN!

### 2.2 Model Iteration ♻

#### a) Model KNN

In [22]:
# The "roc_auc" (area under the ROC curve) scoring metric was chosen over the others because it is a general metric
# that shows how well the model is able to distinguish between the two classes across all thresholds.

In [None]:
# model_knn = KNeighborsClassifier()
# pipe_knn = make_pipeline(preproc, model_knn, memory=cachedir)
# scores_knn = cross_val_score(pipe_knn, X_train, y_train, cv=5, scoring='precision', n_jobs=-1)
# scores_knn.mean()
# 0.3564935064935065

In [None]:
# K-nearest-neighbours classifier with default hyper-parameters, evaluated
# with 5-fold cross-validation on the training set (ROC AUC, all CPU cores).
###### model #####
model_knn2 = KNeighborsClassifier()
pipe_knn2 = make_pipeline(preproc, model_knn2, memory=cachedir)
scores_knn2 = cross_val_score(
    pipe_knn2,
    X_train,
    y_train,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)
scores_knn2.mean()

#### b) Model Trees

In [None]:
# max_depth: sets the maximum depth of the decision tree.
# A deeper tree can learn more complex relationships between features, but can also lead to overfitting.
# min_samples_leaf: sets the minimum number of samples required at a leaf node.
# A higher value can help prevent overfitting by ensuring that each leaf contains a minimum number of samples.
##### Note #####
# In decision trees, a leaf node is a node that has no child nodes.
# It represents the decision or prediction the tree makes for a particular subset of the data.
# When a decision tree is trained, it recursively splits the data into subsets based on the values of the features
# until it reaches a leaf node. At each non-leaf node the tree makes a decision based on the values of one feature
# and the data is split according to that decision. This continues until a stopping criterion is met,
# such as a maximum depth or a minimum number of samples per leaf.
# In our case max_depth=50 and min_samples_leaf=20, respectively.
# random_state is fixed so that the scores are reproducible from run to run.
###### model #####
model_trees = DecisionTreeClassifier(max_depth=50, min_samples_leaf=20, random_state=42)
pipe_trees = make_pipeline(preproc, model_trees, memory=cachedir)
score_trees = cross_val_score(pipe_trees, X_train, y_train, cv=5, scoring="roc_auc")
score_trees.mean()


#### c) Model Random Forest

In [None]:
# Random forest: another supervised learning algorithm, used here for classification.
# It is an ensemble method that combines multiple decision trees into a stronger predictive model.
# The trees use the same max_depth / min_samples_leaf settings as the single decision tree.
# random_state is fixed so that the scores are reproducible from run to run.
###### model #####
model_randforest = RandomForestClassifier(max_depth=50, min_samples_leaf=20, random_state=42)
pipe_randforest = make_pipeline(preproc, model_randforest, memory=cachedir)
score_randforest = cross_val_score(pipe_randforest, X_train, y_train, cv=5, scoring="roc_auc")
print(score_randforest.std())
score_randforest.mean()

#### d) Model Boosted Trees

In [None]:
# AdaBoostClassifier is an ensemble learning algorithm that combines multiple weak classifiers into a strong one.
# It trains a series of base estimators, typically simple models with low predictive power,
# and iteratively adjusts the weights of misclassified samples to focus on the samples that are hardest to classify.
# DecisionTreeClassifier is used as the base estimator, so each weak classifier in the ensemble is a decision tree.
# max_depth=None means there is no limit on the depth of the trees,
# so they can grow as deep as necessary to fit the training data.
# Attention: this may lead to a very long running time. If it does, cap the depth (e.g. max_depth=30).
# NOTE(review): unlimited-depth trees can fit the training data perfectly, which would leave
# boosting little to correct - consider a shallow base tree.
# The base estimator is passed positionally because its keyword was renamed from `base_estimator`
# to `estimator` in scikit-learn 1.2 (the old name was removed in 1.4).
# random_state is fixed so that the scores are reproducible from run to run.
###### model #####
model_ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=None), random_state=42)
pipe_ada = make_pipeline(preproc, model_ada, memory=cachedir)
score_ada = cross_val_score(pipe_ada, X_train, y_train, cv=5, scoring="roc_auc")
print(score_ada.std())
score_ada.mean()

#### e) Model Gradient Boost Classifier 

In [None]:
# Gradient boosting: a supervised learning algorithm commonly used for classification tasks.
# loss: the loss function can have a significant impact on the performance of the model.
# The default is loss="log_loss". Hyper-parameters will be tuned later to find the best model for our problem.
# learning_rate: the default is 0.1. A lower learning_rate can help prevent overfitting
# by slowing down the learning process and keeping the model from becoming too complex,
# but it can also lead to slower convergence.
# n_estimators: the number of decision trees (boosting stages) created in the ensemble.
# random_state is fixed so that the scores are reproducible from run to run.
###### model #####
model_gb = GradientBoostingClassifier(n_estimators=100, verbose=0, random_state=42)
pipe_gb = make_pipeline(preproc, model_gb, memory=cachedir)
score_gb = cross_val_score(pipe_gb, X_train, y_train, cv=5, scoring="roc_auc")
print(score_gb.std())
score_gb.mean()

#### f) Model SVC

In [None]:
# Support vector classifier with default hyper-parameters, placed behind the
# shared (cached) preprocessing pipeline.
###### model #####
model_SVC = SVC()
pipe_svc = make_pipeline(
    preproc,
    model_SVC,
    memory=cachedir,
)

In [None]:
# Important to RandomizedSearchCV - param_distributions
# pipe_svc.get_params()

In [None]:
# Randomised hyper-parameter search for the SVC pipeline.
# pipe_svc.get_params() / preproc.get_params() list the parameter names that
# can be used as keys of param_distributions.
model_SVC = SVC()
pipe_svc = make_pipeline(preproc, model_SVC, memory=cachedir)

# Random search
search = RandomizedSearchCV(
    pipe_svc,
    param_distributions={
        'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        # scipy's uniform(loc, scale) samples from [loc, loc + scale]
        'svc__C': uniform(0.1, 10),
    },
    cv=5,
    n_iter=1,  # only a single candidate is sampled
    scoring="roc_auc",
    random_state=42,  # makes the sampled candidate reproducible between runs
)

search.fit(X_train, y_train)

pipe_svc_tuned = search.best_estimator_
pipe_svc_tuned
# For this kind of model it is also worth taking a look at SGDClassifier:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn.linear_model.SGDClassifier

In [None]:
# Cross-validated ROC AUC (5 folds) of the tuned SVC pipeline on the training set.
scores_svc = cross_val_score(
    pipe_svc_tuned,
    X_train,
    y_train,
    scoring="roc_auc",
    cv=5,
)
scores_svc.mean()

#### g) Model Gradient Boost Classifier - fine tuning - boosting!

#### h) Model Gradient Boost Classifier - stacking - boosting!

#### i) XGBoost

In [None]:
# Define numerical feature once-for-all 
# I have to define X according to my dataset okok
# 1. I have to remove the target okok
# 2. check the "int64" and "float64" dtypes okok
# 3. I want to get a dataframe with the category names. okok - too much time running 
    ## add:
    # # Get column names after preprocessing
    # column_names = preproc.fit(data).get_feature_names_out()
    # # Create new DataFrame with transformed data and column names
    # data_preproc = pd.DataFrame(preproc_baseline.transform(data), columns=column_names, dtype=np.float16)
    # naming updated. 
# 4. remove everything else and keep only the final pipeline okok 
# 5. look at the cutoff, it may be interesting to use a larger value okok - too much time running
# 6. look for more information about y_log and check whether we have a normal distribution okok 
# 7. I have to find out which scoring metric is the best okok 
    ## 1. scoring="roc_auc"
    ## 2. scoring="precision"
    ## 3. scoring="accuracy"
    ## recall, f1, ....
    ## it might be good to write a short text explaining each one

# seg - 20/03/23
# 8. models explanation 
    ## AdaBoost to be finished
    ## SVC model
# 9. features selection
    ## huge running times 
    ## considering to delete - "worst_account...inv" feature with 70% of missing values. 
# 10 choose the model with the best performance 
    ## if not possible due to running time, start the connection with gcp - virtual machine
    ## it will take the whole week at least. shitty!


In [None]:
# 1. take a closer look at the models 
# 2. features selection