In [4]:
import os, copy, time
from pathlib import Path

import pandas as pd
import numpy as np

#https://scikit-learn.org/stable/modules/preprocessing.html
from sklearn import preprocessing
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.impute import SimpleImputer

# We use multiprocessing for simple additive processes; later we will extend it to other cases.
# Sources: 
#https://medium.com/@vasista/parallel-processing-with-pandas-c76f88963005
#https://towardsdatascience.com/speeding-up-and-perfecting-your-work-using-parallel-computing-8bc2f0c073f8
import multiprocessing as mp
from multiprocessing import Pool
from functools import partial

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

# home-credit-default-risk tables
if not Path("./application_test.csv").is_file():
  ! wget https://www.dropbox.com/s/j9xwcj9ixki5t2l/home-credit-default-risk.zip?dl=0 -O data.zip
  ! unzip -q data.zip

# default-of-credit-card-clients-dataset
if not Path("./default_ucr.csv").is_file():
  ! wget https://www.dropbox.com/s/lj0d7qez18ea7dx/UCI_Credit_Card.csv?dl=0 -O default_ucr.csv

In [6]:
from preprocessing import (FCleaning, 
                           FEncoding,
                           FImputation
)

In [5]:
from fcleaning import (EmptyElim,
                       
)

In [None]:
# Read in the source datasets.
# Files are read relative to the working directory, which is where the
# download cell places them.
DATA_DIR = Path('.')

# Each table is declared together with the ID/target columns to split off, so
# commenting a table in or out can never shift the two lists out of alignment.
source_tables = [
    # home-credit-default-risk tables
    ('application_train.csv', ['SK_ID_CURR', 'TARGET']),
    ('application_test.csv', ['SK_ID_CURR']),
    ('bureau.csv', ['SK_ID_CURR', 'SK_ID_BUREAU']),
    ('bureau_balance.csv', ['SK_ID_BUREAU']),
    ('POS_CASH_balance.csv', ['SK_ID_PREV', 'SK_ID_CURR']),
    ('credit_card_balance.csv', ['SK_ID_PREV', 'SK_ID_CURR']),
    # ('previous_application.csv', ['SK_ID_PREV', 'SK_ID_CURR']),
    # ('installments_payments.csv', ['SK_ID_PREV', 'SK_ID_CURR']),

    # default-of-credit-card-clients-datasets
    # ('default_ucr.csv', ['ID', 'default.payment.next.month']),
]

# NOTE: `dict` shadows the builtin; the name is kept because later cells
# index into it (dict['datasets'][i]).
dict = {
    'datasets': [
        pd.read_csv(DATA_DIR / file_name) for file_name, id_columns in source_tables
    ],
    'name_dropped_columns': [
        id_columns for file_name, id_columns in source_tables
    ],
}

# Keep ID and target columns separately
dict['dropped_columns'] = [
    table[id_columns]
    for table, id_columns in zip(dict['datasets'], dict['name_dropped_columns'])
]

# Drop ID and target columns from the tables
dict['datasets'] = [
    table.drop(id_columns, axis=1)
    for table, id_columns in zip(dict['datasets'], dict['name_dropped_columns'])
]

# Distinct column dtypes present in each remaining table
[np.unique([str(dtype) for dtype in table.dtypes]) for table in dict['datasets']]

[array(['float64', 'int64', 'object'], dtype='<U7'),
 array(['float64', 'int64', 'object'], dtype='<U7'),
 array(['float64', 'int64', 'object'], dtype='<U7'),
 array(['int64', 'object'], dtype='<U6'),
 array(['float64', 'int64', 'object'], dtype='<U7'),
 array(['float64', 'int64', 'object'], dtype='<U7')]

In [None]:
# Time the FCleaning steps on the first loaded table, once with a single
# worker and once with two.
X = dict['datasets'][0]

for workers in (1, 2):
    if workers == 2:
        # Report the available core count right before the two-worker run.
        print('\n CPUs avail:{}'.format(mp.cpu_count()))

    fcleaning = FCleaning('iqr_proximity_rule', n_jobs=workers, chunks=None)

    s_time = time.time()
    fcleaning.emptyness_elimination(X)
    fcleaning.outliers_elimination(X)
    print('\n n_jobs = {}, time:'.format(workers), time.time() - s_time)


 n_jobs = 1, time: 6.621775150299072

 CPUs avail:2 


 n_jobs = 2, time: 6.631769895553589


In [None]:
# Time the FEncoding pipeline on the third loaded table with one and two workers.
# The elapsed-time label is derived from the loop variable, so the second run
# is reported as n_jobs = 2 (it was previously printed as n_jobs = 1).
# NOTE(review): both runs receive the same X; if these methods modify X in
# place, the second run works on already-encoded data and the timings are not
# comparable -- confirm against FEncoding.
X = dict['datasets'][2]

for n_jobs in (1, 2):
    fencoding = FEncoding(n_jobs=n_jobs,
                          chunks=None,
                          path=None)
    s_time = time.time()

    # A fresh mapping is built for every run; only the 'ddays' group is populated.
    fencoding.dtime_to_data(
        X,
        dtime_col_names={
            'ddays': ['DAYS_CREDIT', 'DAYS_CREDIT_ENDDATE', 'DAYS_ENDDATE_FACT', 'DAYS_CREDIT_UPDATE'],
            'dmonths': [],
            'dyears': [],
        },
        time_encode=True,
        drop_current=True,
    )

    f_dict = fencoding.pick_categor(X)
    print('\n f_dict:', f_dict)

    fencoding.bucket_numerical(X,
                               n_bins=5,
                               columns_to_buck='all_numerical',
                               drop_current=True)

    fencoding.encode_categor(X, method='OrdinalEncoder')
    print('\n n_jobs = {}, time:'.format(n_jobs), time.time() - s_time)


 DAYS_CREDIT processed ...

 DAYS_CREDIT_ENDDATE processed ...

 DAYS_ENDDATE_FACT processed ...

 DAYS_CREDIT_UPDATE processed ...

 DAYS_CREDIT_date was encoded

 DAYS_CREDIT_ENDDATE_date was encoded

 DAYS_ENDDATE_FACT_date was encoded

 DAYS_CREDIT_UPDATE_date was encoded

 f_dict: {'categor_columns': ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'DAYS_CREDIT', 'CREDIT_DAY_OVERDUE', 'CNT_CREDIT_PROLONG', 'CREDIT_TYPE', 'DAYS_CREDIT_UPDATE'], 'numer_columns': ['DAYS_CREDIT_ENDDATE', 'DAYS_ENDDATE_FACT', 'AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT', 'AMT_CREDIT_SUM_OVERDUE', 'AMT_ANNUITY'], 'time_columns': []}

 DAYS_CREDIT_ENDDATE bucketing ...

 DAYS_ENDDATE_FACT bucketing ...

 AMT_CREDIT_MAX_OVERDUE bucketing ...

 AMT_CREDIT_SUM bucketing ...

 AMT_CREDIT_SUM_DEBT bucketing ...

 AMT_CREDIT_SUM_LIMIT bucketing ...

 AMT_CREDIT_SUM_OVERDUE bucketing ...

 AMT_ANNUITY bucketing ...

 n_jobs = 1, time: 18.792246341705322

 DAYS_CREDIT processed 

In [None]:
# Encode the third loaded table with the existing `fencoding` instance, then
# time FImputation.impute on the result with one and two workers.
X = dict['datasets'][2]
X = fencoding.encode_categor(X, method='OrdinalEncoder')

for workers in (1, 2):
    fimputation = FImputation('tree-based',
                              fill_with_value='extreme_values',
                              n_jobs=workers)
    s_time = time.time()
    fimputation.impute(X)
    print('\n n_jobs = {}, time:'.format(workers), time.time() - s_time)


 n_jobs = 1, time: 1.8499367237091064

 n_jobs = 2, time: 1.375610113143921


In [7]:
# Small 3x3 example frame: the first two columns are missing only in the last
# row, the third column is missing in the first two rows.
X = pd.DataFrame(
    data=[
        [1, 2, np.nan],
        [3, 4, np.nan],
        [np.nan, np.nan, 2],
    ]
)
X

Unnamed: 0,0,1,2
0,1.0,2.0,
1,3.0,4.0,
2,,,2.0


In [9]:
# Create an EmptyElim instance (two workers, chunks=None) and fit it on X.
emptynesselimination = EmptyElim(n_jobs=2, chunks=None)
emptynesselimination.fit(X)


 col_names: {2: [nan, 2.0]}


In [10]:
# Apply the previously fitted EmptyElim instance to X and display the result.
emptynesselimination.transform(X)

Unnamed: 0,0,1
0,1.0,2.0
1,3.0,4.0
2,,


In [11]:
# Fit and transform in a single call on the same X and display the result.
emptynesselimination.fit_transform(X)


 col_names: {2: [nan, 2.0]}


Unnamed: 0,0,1
0,1.0,2.0
1,3.0,4.0
2,,
