In [1]:
import os, copy, time
from pathlib import Path

import pandas as pd
import numpy as np

#https://scikit-learn.org/stable/modules/preprocessing.html
from sklearn import preprocessing
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.impute import SimpleImputer

# We will use multiprocessing for simple additive processes. Later, we will extend it to other cases.
# Sources: 
#https://medium.com/@vasista/parallel-processing-with-pandas-c76f88963005
#https://towardsdatascience.com/speeding-up-and-perfecting-your-work-using-parallel-computing-8bc2f0c073f8
import multiprocessing as mp
from multiprocessing import Pool
from functools import partial

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

# home-credit-default-risk tables
if not Path("/content/application_test.csv").is_file():
  ! wget https://www.dropbox.com/s/j9xwcj9ixki5t2l/home-credit-default-risk.zip?dl=0 -O data.zip
  ! unzip -q data.zip

# default-of-credit-card-clients-dataset
if not Path("/content/default_ucr.csv").is_file():
  ! wget https://www.dropbox.com/s/lj0d7qez18ea7dx/UCI_Credit_Card.csv?dl=0 -O default_ucr.csv

from preprocessing import (FCleaning, 
                           FEncoding,
)

In [2]:
# Load the source tables and, position by position, the ID/target columns that
# are split off from each of them before feature processing.
# NOTE(review): the name `dict` shadows the Python builtin; it is kept because
# later cells read `dict['datasets']`.
dict = {
    'datasets': [
        # home-credit-default-risk tables
        pd.read_csv('/content/application_train.csv'),
        pd.read_csv('/content/application_test.csv'),
        pd.read_csv('/content/bureau.csv'),
        pd.read_csv('/content/bureau_balance.csv'),
        pd.read_csv('/content/POS_CASH_balance.csv'),
        pd.read_csv('/content/credit_card_balance.csv'),
        # pd.read_csv('/content/previous_application.csv'),
        # pd.read_csv('/content/installments_payments.csv'),

        # default-of-credit-card-clients-datasets
        # pd.read_csv('/content/default_ucr.csv'),
    ],

    # Entry k lists the columns removed from dataset k. The list is longer
    # than 'datasets'; the surplus entries are simply never reached.
    'name_dropped_columns': [
        # home-credit-default-risk tables
        ['SK_ID_CURR', 'TARGET'],
        ['SK_ID_CURR'],
        ['SK_ID_CURR', 'SK_ID_BUREAU'],
        ['SK_ID_BUREAU'],
        ['SK_ID_PREV', 'SK_ID_CURR'],
        ['SK_ID_PREV', 'SK_ID_CURR'],
        ['SK_ID_PREV', 'SK_ID_CURR'],
        ['SK_ID_PREV', 'SK_ID_CURR'],

        # default-of-credit-card-clients-datasets
        ['ID', 'default.payment.next.month'],
    ],
}

# Keep ID and target columns separately
dict['dropped_columns'] = [
    table[names]
    for table, names in zip(dict['datasets'], dict['name_dropped_columns'])
]

# Drop ID and target columns from the tables
dict['datasets'] = [
    table.drop(columns=names)
    for table, names in zip(dict['datasets'], dict['name_dropped_columns'])
]

# Cell output: the distinct column dtypes that remain in every table.
[
    np.unique([str(table[column].dtype) for column in table.columns])
    for table in dict['datasets']
]

[array(['float64', 'int64', 'object'], dtype='<U7'),
 array(['float64', 'int64', 'object'], dtype='<U7'),
 array(['float64', 'int64', 'object'], dtype='<U7'),
 array(['int64', 'object'], dtype='<U6'),
 array(['float64', 'int64', 'object'], dtype='<U7'),
 array(['float64', 'int64', 'object'], dtype='<U7')]

In [3]:
X = dict['datasets'][0]

# Time the two cleaning steps on the first table, once with a single worker
# and once with two, reporting the available CPU count before the second run.
for n_workers in (1, 2):
    if n_workers == 2:
        print('\n CPUs avail:{} \n'.format(mp.cpu_count()))

    fcleaning = FCleaning('iqr_proximity_rule', n_jobs=n_workers, chunks=None)

    s_time = time.time()
    fcleaning.emptyness_elimination(X)
    fcleaning.outliers_elimination(X)
    print('\n n_jobs = {}, time:'.format(n_workers), time.time() - s_time)


 n_jobs = 1, time: 6.739346981048584

 CPUs avail:2 


 n_jobs = 2, time: 6.865135669708252


In [4]:
# One two-worker cleaner is reused for every loaded table.
fcleaning = FCleaning('iqr_proximity_rule', n_jobs=2, chunks=None)

# Run both cleaning steps on each table and report the time each step took.
for i, X in enumerate(dict['datasets']):
    print('\n dataset N {} | size: {} | emptyness_elimination ...'.format(i, X.shape))
    s_time = time.time()
    fcleaning.emptyness_elimination(X)
    print('\n dataset N {}, emptyness_elimination completed successfully, time: {}'.format(
        i, time.time() - s_time))

    s_time = time.time()
    print('\n dataset N {}  | size: {} | outliers_elimination ...'.format(i, X.shape))
    fcleaning.outliers_elimination(X)
    print('\n dataset N {}, outliers_elimination completed successfully, time: {}'.format(
        i, time.time() - s_time))


 dataset N 0 | size: (307511, 120) | emptyness_elimination ...

 dataset N 0, emptyness_elimination completed successfully, time: 2.7245032787323

 dataset N 0  | size: (307511, 120) | outliers_elimination ...

 dataset N 0, outliers_elimination completed successfully, time: 4.1842169761657715

 dataset N 1 | size: (48744, 120) | emptyness_elimination ...

 FLAG_DOCUMENT_2, unique values: [0]

 FLAG_DOCUMENT_10, unique values: [0]

 FLAG_DOCUMENT_12, unique values: [0]

 FLAG_DOCUMENT_13, unique values: [0]

 FLAG_DOCUMENT_14, unique values: [0]

 FLAG_DOCUMENT_15, unique values: [0]

 FLAG_DOCUMENT_16, unique values: [0]

 FLAG_DOCUMENT_17, unique values: [0]

 FLAG_DOCUMENT_19, unique values: [0]

 FLAG_DOCUMENT_20, unique values: [0]

 FLAG_DOCUMENT_21, unique values: [0]

 dataset N 1, emptyness_elimination completed successfully, time: 0.6694204807281494

 dataset N 1  | size: (48744, 120) | outliers_elimination ...

 dataset N 1, outliers_elimination completed successfully, time

In [5]:
X = dict['datasets'][2]

# Time the full encoding pipeline on the third table for one and for two
# workers. The timing label is derived from the loop variable, so it always
# names the worker count that was actually used (the second run was previously
# reported as "n_jobs = 1").
# NOTE(review): both passes receive the same X; if the FEncoding methods modify
# X in place, the second pass works on already-encoded data - confirm.
for n_workers in (1, 2):
    fencoding = FEncoding(n_jobs=n_workers,
                          chunks=None,
                          path=None)
    s_time = time.time()

    # Columns holding day offsets; a fresh mapping is built for every pass.
    fencoding.dtime_to_data(X,
                            dtime_col_names={
                                'ddays': ['DAYS_CREDIT', 'DAYS_CREDIT_ENDDATE', 'DAYS_ENDDATE_FACT', 'DAYS_CREDIT_UPDATE'],
                                'dmonths': [],
                                'dyears': [],
                            },
                            time_encode=True,
                            drop_current=True)

    f_dict = fencoding.pick_categor(X)
    print('\n f_dict:', f_dict)

    fencoding.bucket_numerical(X,
                               n_bins=5,
                               columns_to_buck='all_numerical',
                               drop_current=True)

    fencoding.encode_categor(X, method='OrdinalEncoder')
    print('\n n_jobs = {}, time:'.format(n_workers), time.time() - s_time)


 DAYS_CREDIT processed ...

 DAYS_CREDIT_ENDDATE processed ...

 DAYS_ENDDATE_FACT processed ...

 DAYS_CREDIT_UPDATE processed ...

 DAYS_CREDIT_date was encoded

 DAYS_CREDIT_ENDDATE_date was encoded

 DAYS_ENDDATE_FACT_date was encoded

 DAYS_CREDIT_UPDATE_date was encoded

 f_dict: {'categor_columns': ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'DAYS_CREDIT', 'CREDIT_DAY_OVERDUE', 'CNT_CREDIT_PROLONG', 'CREDIT_TYPE', 'DAYS_CREDIT_UPDATE'], 'numer_columns': ['DAYS_CREDIT_ENDDATE', 'DAYS_ENDDATE_FACT', 'AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT', 'AMT_CREDIT_SUM_OVERDUE', 'AMT_ANNUITY'], 'time_columns': []}

 DAYS_CREDIT_ENDDATE bucketing ...

 DAYS_ENDDATE_FACT bucketing ...

 AMT_CREDIT_MAX_OVERDUE bucketing ...

 AMT_CREDIT_SUM bucketing ...

 AMT_CREDIT_SUM_DEBT bucketing ...

 AMT_CREDIT_SUM_LIMIT bucketing ...

 AMT_CREDIT_SUM_OVERDUE bucketing ...

 AMT_ANNUITY bucketing ...

 n_jobs = 1, time: 19.419448852539062

 DAYS_CREDIT processed 

In [None]:
class FImputation(object):
    '''
      Dealing with missing values:
      We will use simple techniques with regards to the model that we use.
      For tree-based models, NaNs will be filled in with extreme values (or zeros).
      For regression, with means and medians for numerical and categorical types
      respectively (not implemented yet).

      Parameters
      ----------
      model_type : str
          'tree-based' (supported) or 'regression-based' (not implemented yet).
      fill_with_value : str
          For 'tree-based': 'zeros' fills every NaN with 0; 'extreme_values'
          fills the NaNs of each numeric column with that column's own value
          of largest absolute magnitude.
      n_jobs : int
          Number of worker processes; 1 runs in the current process.
      chunks : int or None
          Number of columns handed to a worker at once. None derives the
          width from the column count and n_jobs on every call.
      path : str or None
          When given, the imputed frame is also written to this CSV path.
    '''

    def __init__(self, model_type, fill_with_value = None, 
                    n_jobs = 1, chunks = None, 
                    path = None,
                    ):
        
        self.model_type = model_type

        self.fill_with_value = fill_with_value
        
        self.n_jobs = n_jobs
        self.chunks = chunks
        self.path = path


    @staticmethod
    def _fill_with_extremes(X):
        '''Return a copy of X whose numeric columns have NaNs replaced by the
        column's own value of largest absolute magnitude.'''
        filled = X.copy()
        for column in filled.columns:
            series = filled[column]
            # abs() is undefined for non-numeric data; such columns are left as they are.
            if not pd.api.types.is_numeric_dtype(series):
                continue
            observed = series.dropna()
            # An all-NaN column offers no value to fill with.
            if observed.empty:
                continue
            # Signed value with the largest magnitude; the first one wins on ties.
            extreme = observed.iloc[observed.abs().to_numpy().argmax()]
            filled[column] = series.fillna(extreme)
        return filled


    def impute_(self, X):
        '''
          Impute one block of columns according to the configured strategy.

          Returns a new DataFrame; the frame passed in is not modified.

          Raises
          ------
          ValueError
              If model_type is unknown, or fill_with_value is not one of
              'zeros' / 'extreme_values' for a tree-based model.
          NotImplementedError
              For model_type == 'regression-based'.
        '''
        if self.model_type == 'tree-based':
            if self.fill_with_value == 'zeros':
                return X.fillna(0)
            if self.fill_with_value == 'extreme_values':
                return self._fill_with_extremes(X)
            raise ValueError('Identify fill_with_value parameter')

        if self.model_type == 'regression-based':
            # TODO: mean / median strategies depending on the column type.
            raise NotImplementedError('regression-based imputation is not implemented yet')

        raise ValueError('Unknown model_type: {!r}'.format(self.model_type))

    def impute(self, X):
        '''
          Impute all columns of X, splitting them into column blocks that are
          processed by up to n_jobs worker processes.

          Returns the imputed DataFrame (columns in their original order) and,
          when self.path is set, also writes it to that CSV file.

          Raises
          ------
          ValueError
              If n_jobs or chunks is smaller than 1, or the configuration is
              rejected by impute_.
        '''
        n_jobs = int(self.n_jobs)
        if n_jobs < 1:
            raise ValueError('n_jobs must be a positive integer')

        n_columns = len(X.columns)
        if n_columns == 0:
            # Nothing to impute; avoids a zero step in range() below.
            imputed = X.copy()
        else:
            if self.chunks is None:
                # Ceiling division gives at most n_jobs blocks and never a zero
                # width. The width stays local so the instance can be reused on
                # frames with a different number of columns.
                width = max(1, -(-n_columns // n_jobs))
            else:
                width = int(self.chunks)
                if width < 1:
                    raise ValueError('chunks must be a positive integer')

            blocks = [X.iloc[:, start: start + width]
                      for start in range(0, n_columns, width)]

            if n_jobs == 1 or len(blocks) == 1:
                # A process pool would only add overhead here.
                parts = [self.impute_(block) for block in blocks]
            else:
                # The context manager releases the workers once map() has returned.
                with Pool(processes = n_jobs) as pool:
                    parts = pool.map(self.impute_, blocks)

            imputed = pd.concat(parts, axis=1)

        if self.path is not None:
            imputed.to_csv(self.path)
        return imputed

In [None]:
# Time zero-filling with a single worker. The frame returned by impute() is
# not kept; the cell's value is the elapsed time in seconds.
fimputation = FImputation(model_type='tree-based', fill_with_value='zeros', n_jobs=1)
s_time = time.time()
fimputation.impute(X)
time.time() - s_time

4.768680095672607

In [None]:
# Time zero-filling with two workers. The frame returned by impute() is
# not kept; the cell's value is the elapsed time in seconds.
fimputation = FImputation(model_type='tree-based', fill_with_value='zeros', n_jobs=2)
s_time = time.time()
fimputation.impute(X)
time.time() - s_time

4.519124746322632

In [None]:
class FScaling(object):
  '''
    Placeholder for feature scaling.

    Parameters
    ----------
    scaler :
        Stored on the instance as `self.scaler`; it is not used yet.
  '''

  def __init__(self, scaler):
    self.scaler = scaler

  def scale(self, X):
    '''Return X unchanged (scaling is not implemented yet).'''
    # `self` was missing from the signature, so calling scale() on an
    # instance raised TypeError.
    # TODO: apply self.scaler to X.
    return X
