In [1]:
import os, copy, time
from pathlib import Path

import pandas as pd
import numpy as np

#https://scikit-learn.org/stable/modules/preprocessing.html
from sklearn import preprocessing
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.impute import SimpleImputer

# We will use multiprocessing for simple additive processes. Further, we will expend for other cases.
# Sources: 
#https://medium.com/@vasista/parallel-processing-with-pandas-c76f88963005
#https://towardsdatascience.com/speeding-up-and-perfecting-your-work-using-parallel-computing-8bc2f0c073f8
import multiprocessing as mp
from multiprocessing import Pool
from functools import partial

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

# home-credit-default-risk tables
if not Path("./application_test.csv").is_file():
  ! wget https://www.dropbox.com/s/j9xwcj9ixki5t2l/home-credit-default-risk.zip?dl=0 -O data.zip
  ! unzip -q data.zip

# default-of-credit-card-clients-dataset
if not Path("./default_ucr.csv").is_file():
  ! wget https://www.dropbox.com/s/lj0d7qez18ea7dx/UCI_Credit_Card.csv?dl=0 -O default_ucr.csv

In [2]:
from fcleaning import (EmptyElim,
                       OutlDetect,                       
)

from fencoding import (Dtimetodata,
                       FEncoding    
)

from preprocessing import (FCleaning, 
                           #FEncoding,
                           FImputation
)

In [3]:
# Read in the source datasets
dict = {
    'datasets':[
                # home-credit-default-risk tables
                pd.read_csv('./application_train.csv'),
                pd.read_csv('./application_test.csv'),
                pd.read_csv('./bureau.csv'),
                #pd.read_csv('./bureau_balance.csv'),
                #pd.read_csv('./POS_CASH_balance.csv'),
                #pd.read_csv('./credit_card_balance.csv'),
                # pd.read_csv('./previous_application.csv'),
                # pd.read_csv('./installments_payments.csv'),

                # default-of-credit-card-clients-datasets
                # pd.read_csv('./default_ucr.csv'),
    ],

    'name_dropped_columns':
                [
                 # home-credit-default-risk tables
                 ['SK_ID_CURR', 'TARGET'],
                 ['SK_ID_CURR'],
                 ['SK_ID_CURR',	'SK_ID_BUREAU'],
                 ['SK_ID_BUREAU'],
                 ['SK_ID_PREV',	'SK_ID_CURR'],
                 ['SK_ID_PREV',	'SK_ID_CURR'],
                 ['SK_ID_PREV',	'SK_ID_CURR'],
                 ['SK_ID_PREV',	'SK_ID_CURR'],

                 # default-of-credit-card-clients-datasets
                 ['ID', 'default.payment.next.month']              
    ],   
}

# Keep ID and target columns separately
dict['dropped_columns'] = [dict['datasets'][i][dict['name_dropped_columns'][i]] for i in range(len(dict['datasets']))]

# Drop ID and target columns from the tables
dict['datasets'] = [dict['datasets'][i].drop(dict['name_dropped_columns'][i], axis=1) for i in range(len(dict['datasets']))]

[np.unique([str(dict['datasets'][i][column].dtype) for column in dict['datasets'][i].columns]) for i in range(len(dict['datasets']))]

[array(['float64', 'int64', 'object'], dtype='<U7'),
 array(['float64', 'int64', 'object'], dtype='<U7'),
 array(['float64', 'int64', 'object'], dtype='<U7')]

In [None]:
X=pd.DataFrame([[1,2, np.nan], [3,4, np.nan], [np.nan, np.nan, 2], [10000, 2, np.nan], [0.1, 3,2], [1,1,2]])
X

Unnamed: 0,0,1,2
0,1.0,2.0,
1,3.0,4.0,
2,,,2.0
3,10000.0,2.0,
4,0.1,3.0,2.0
5,1.0,1.0,2.0


In [None]:
emptyelim = EmptyElim(n_jobs = -1,
                      chunks = None,
                      )
emptyelim.fit(X)


 col_names: {2: [nan, 2.0]}


In [None]:
emptyelim.transform(X)

Unnamed: 0,0,1
0,1.0,2.0
1,3.0,4.0
2,,
3,10000.0,2.0
4,0.1,3.0
5,1.0,1.0


In [None]:
emptyelim.fit_transform(X)


 col_names: {2: [nan, 2.0]}


Unnamed: 0,0,1
0,1.0,2.0
1,3.0,4.0
2,,
3,10000.0,2.0
4,0.1,3.0
5,1.0,1.0


In [None]:
outldetect = OutlDetect(outliers_detection_technique='iqr_proximity_rule', 
                        n_jobs=-1)

In [None]:
outldetect.fit(X)


 col_outl_info: {0: (-2.0, 6.0), 1: (0.5, 4.5), 2: (2.0, 2.0)}


In [None]:
outldetect.transform(X)

Unnamed: 0,0,1,2
0,1.0,2.0,
1,3.0,4.0,
2,,,2.0
3,6.0,2.0,
4,0.1,3.0,2.0
5,1.0,1.0,2.0


In [None]:
outldetect.fit_transform(X)


 col_outl_info: {0: (-2.0, 6.0), 1: (0.5, 4.5), 2: (2.0, 2.0)}


Unnamed: 0,0,1,2
0,1.0,2.0,
1,3.0,4.0,
2,,,2.0
3,6.0,2.0,
4,0.1,3.0,2.0
5,1.0,1.0,2.0


In [None]:
X = dict['datasets'][2][['DAYS_CREDIT', 'DAYS_CREDIT_ENDDATE', 'DAYS_ENDDATE_FACT', 'DAYS_CREDIT_UPDATE']]
X

Unnamed: 0,DAYS_CREDIT,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,DAYS_CREDIT_UPDATE
0,-497,-153.0,-153.0,-131
1,-208,1075.0,,-20
2,-203,528.0,,-16
3,-203,,,-16
4,-629,1197.0,,-21
...,...,...,...,...
1716423,-44,-30.0,,-19
1716424,-2648,-2433.0,-2493.0,-2493
1716425,-1809,-1628.0,-970.0,-967
1716426,-1878,-1513.0,-1513.0,-1508


In [None]:
dtimetodata = Dtimetodata(n_jobs=-1, dtime_col_names = {
                                                        'ddays' : ['DAYS_CREDIT', 'DAYS_CREDIT_ENDDATE', 'DAYS_ENDDATE_FACT', 'DAYS_CREDIT_UPDATE'],
                                                        'dmonths' : [],
                                                        'dyears' : []
                                                      },               
                              time_encode = True, 
                              drop_current = True
                              )

In [None]:
dtimetodata.transform(X)

Unnamed: 0,DAYS_CREDIT_date_year,DAYS_CREDIT_date_month,DAYS_CREDIT_date_day,DAYS_CREDIT_ENDDATE_date_year,DAYS_CREDIT_ENDDATE_date_month,DAYS_CREDIT_ENDDATE_date_day,DAYS_ENDDATE_FACT_date_year,DAYS_ENDDATE_FACT_date_month,DAYS_ENDDATE_FACT_date_day,DAYS_CREDIT_UPDATE_date_year,DAYS_CREDIT_UPDATE_date_month,DAYS_CREDIT_UPDATE_date_day
0,2018,8,22,2019.0,8.0,1.0,2019.0,8.0,1.0,2019,8,23
1,2019,6,7,2022.0,12.0,11.0,,,,2019,12,12
2,2019,6,12,2021.0,6.0,12.0,,,,2019,12,16
3,2019,6,12,,,,,,,2019,12,16
4,2018,4,12,2023.0,4.0,12.0,,,,2019,12,11
...,...,...,...,...,...,...,...,...,...,...,...,...
1716423,2019,11,18,2019.0,12.0,2.0,,,,2019,12,13
1716424,2012,10,1,2013.0,5.0,4.0,2013.0,3.0,5.0,2013,3,5
1716425,2015,1,18,2015.0,7.0,18.0,2017.0,5.0,6.0,2017,5,9
1716426,2014,11,10,2015.0,11.0,10.0,2015.0,11.0,10.0,2015,11,15


In [4]:
X = dict['datasets'][2][['CREDIT_ACTIVE',	'CREDIT_CURRENCY',	'DAYS_CREDIT']]
X

Unnamed: 0,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT
0,Closed,currency 1,-497
1,Active,currency 1,-208
2,Active,currency 1,-203
3,Active,currency 1,-203
4,Active,currency 1,-629
...,...,...,...
1716423,Active,currency 1,-44
1716424,Closed,currency 1,-2648
1716425,Closed,currency 1,-1809
1716426,Closed,currency 1,-1878


In [5]:
fencoding = FEncoding(n_jobs=-1)

In [6]:
fencoding.initialize_types(X)


 DAYS_CREDIT has type int64 and number of unique values: 2923, will be considered as a numerical 



{'categor_columns': ['CREDIT_ACTIVE', 'CREDIT_CURRENCY'],
 'numer_columns': ['DAYS_CREDIT'],
 'time_columns': []}

In [7]:
fencoding.encode_categor(X, method = 'OneHotEncoder')


 DAYS_CREDIT has type int64 and number of unique values: 2923, will be considered as a numerical 



Unnamed: 0,DAYS_CREDIT,CREDIT_ACTIVE_Bad debt,CREDIT_ACTIVE_Closed,CREDIT_ACTIVE_Sold,CREDIT_ACTIVE_nan,CREDIT_CURRENCY_currency 2,CREDIT_CURRENCY_currency 3,CREDIT_CURRENCY_currency 4,CREDIT_CURRENCY_nan
0,-497,0,1,0,0,0,0,0,0
1,-208,0,0,0,0,0,0,0,0
2,-203,0,0,0,0,0,0,0,0
3,-203,0,0,0,0,0,0,0,0
4,-629,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
1716423,-44,0,0,0,0,0,0,0,0
1716424,-2648,0,1,0,0,0,0,0,0
1716425,-1809,0,1,0,0,0,0,0,0
1716426,-1878,0,1,0,0,0,0,0,0


In [8]:
fencoding.encode_categor(X, method = 'OrdinalEncoder')


 DAYS_CREDIT has type int64 and number of unique values: 2923, will be considered as a numerical 



Unnamed: 0,DAYS_CREDIT,CREDIT_ACTIVE,CREDIT_CURRENCY
0,-497,2.0,0.0
1,-208,0.0,0.0
2,-203,0.0,0.0
3,-203,0.0,0.0
4,-629,0.0,0.0
...,...,...,...
1716423,-44,0.0,0.0
1716424,-2648,2.0,0.0
1716425,-1809,2.0,0.0
1716426,-1878,2.0,0.0


In [9]:
fencoding.bucket_numerical(X, n_bins=5, columns_to_buck = 'all_numerical', 
                           drop_current = True)


 DAYS_CREDIT has type int64 and number of unique values: 2923, will be considered as a numerical 


 DAYS_CREDIT bucketing ...


Unnamed: 0,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT_bucketed
0,Closed,currency 1,3
1,Active,currency 1,4
2,Active,currency 1,4
3,Active,currency 1,4
4,Active,currency 1,3
...,...,...,...
1716423,Active,currency 1,4
1716424,Closed,currency 1,1
1716425,Closed,currency 1,1
1716426,Closed,currency 1,1


In [13]:
X = dict['datasets'][2]
X = fencoding.encode_categor(X, method = 'OrdinalEncoder')


 DAYS_CREDIT has type int64 and number of unique values: 2923, will be considered as a numerical 


 CREDIT_DAY_OVERDUE has type int64 and number of unique values: 942, will be considered as a numerical 


 DAYS_CREDIT_UPDATE has type int64 and number of unique values: 2982, will be considered as a numerical 



In [14]:
X

Unnamed: 0,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT.1,CREDIT_DAY_OVERDUE.1,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,DAYS_CREDIT_ENDDATE.1,DAYS_ENDDATE_FACT.1,AMT_CREDIT_MAX_OVERDUE,AMT_CREDIT_MAX_OVERDUE.1,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM.1,AMT_CREDIT_SUM_DEBT.1,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,AMT_CREDIT_SUM_LIMIT.1,AMT_CREDIT_SUM_OVERDUE.1,DAYS_CREDIT_UPDATE,DAYS_CREDIT_UPDATE.1,AMT_ANNUITY,CREDIT_ACTIVE,CREDIT_CURRENCY,CREDIT_ACTIVE.1,CREDIT_CURRENCY.1,CNT_CREDIT_PROLONG,CNT_CREDIT_PROLONG.1,CREDIT_TYPE,CREDIT_TYPE.1
0,-497,0,-497,0,-153.0,-153.0,-153.0,-153.0,,,91323.00,0.0,91323.00,0.0,,0.0,,0.0,-131,-131,,2.0,0.0,2.0,0.0,0.0,0.0,3.0,3.0
1,-208,0,-208,0,1075.0,,1075.0,,,,225000.00,171342.0,225000.00,171342.0,,0.0,,0.0,-20,-20,,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0
2,-203,0,-203,0,528.0,,528.0,,,,464323.50,,464323.50,,,0.0,,0.0,-16,-16,,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0
3,-203,0,-203,0,,,,,,,90000.00,,90000.00,,,0.0,,0.0,-16,-16,,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0
4,-629,0,-629,0,1197.0,,1197.0,,77674.5,77674.5,2700000.00,,2700000.00,,,0.0,,0.0,-21,-21,,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1716423,-44,0,-44,0,-30.0,,-30.0,,0.0,0.0,11250.00,11250.0,11250.00,11250.0,0.0,0.0,0.0,0.0,-19,-19,,0.0,0.0,0.0,0.0,0.0,0.0,10.0,10.0
1716424,-2648,0,-2648,0,-2433.0,-2493.0,-2433.0,-2493.0,5476.5,5476.5,38130.84,0.0,38130.84,0.0,0.0,0.0,0.0,0.0,-2493,-2493,,2.0,0.0,2.0,0.0,0.0,0.0,3.0,3.0
1716425,-1809,0,-1809,0,-1628.0,-970.0,-1628.0,-970.0,,,15570.00,,15570.00,,,0.0,,0.0,-967,-967,,2.0,0.0,2.0,0.0,0.0,0.0,3.0,3.0
1716426,-1878,0,-1878,0,-1513.0,-1513.0,-1513.0,-1513.0,,,36000.00,0.0,36000.00,0.0,0.0,0.0,0.0,0.0,-1508,-1508,,2.0,0.0,2.0,0.0,0.0,0.0,3.0,3.0


In [15]:
fimputation = FImputation('tree-based', 
            fill_with_value = 'extreme_values', 
                    n_jobs = 1,
)
s_time = time.time()
fimputation.impute(X)
print('\n n_jobs = 1, time:', time.time() - s_time)

fimputation = FImputation('tree-based', 
            fill_with_value = 'extreme_values', 
                    n_jobs = 2,
)
s_time = time.time()
fimputation.impute(X)
print('\n n_jobs = 2, time:', time.time() - s_time)

ValueError: ignored