In [None]:
import os, copy, time
from pathlib import Path

import pandas as pd
import numpy as np

#https://scikit-learn.org/stable/modules/preprocessing.html
from sklearn import preprocessing
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.impute import SimpleImputer

# We will use multiprocessing for simple additive processes. Later, we will extend it to other cases.
# Sources: 
#https://medium.com/@vasista/parallel-processing-with-pandas-c76f88963005
#https://towardsdatascience.com/speeding-up-and-perfecting-your-work-using-parallel-computing-8bc2f0c073f8
import multiprocessing as mp
from multiprocessing import Pool
from functools import partial

import warnings
warnings.filterwarnings('ignore')

# home-credit-default-risk tables
# Download and unpack the archive only when application_test.csv is not
# already present under /content, so re-running the cell does not re-download.
if not Path("/content/application_test.csv").is_file():
  ! wget https://www.dropbox.com/s/j9xwcj9ixki5t2l/home-credit-default-risk.zip?dl=0 -O data.zip
  ! unzip -q data.zip

# default-of-credit-card-clients-dataset
# NOTE(review): the guard checks /content/default_ucr.csv while wget writes
# default_ucr.csv into the current working directory; this presumably assumes
# the notebook runs with /content as its working directory - confirm.
if not Path("/content/default_ucr.csv").is_file():
  ! wget https://www.dropbox.com/s/lj0d7qez18ea7dx/UCI_Credit_Card.csv?dl=0 -O default_ucr.csv

from preprocessing import (FCleaning,   
                          )

In [None]:
# Load the source tables and list, per table, the ID/target columns that are
# kept apart from the features.
# NOTE(review): the name `dict` shadows the Python builtin; it is kept because
# the following cells refer to it.
dict = {
    # Tables are matched to 'name_dropped_columns' by position.
    'datasets': [
        # home-credit-default-risk tables
        pd.read_csv('/content/application_train.csv'),
        pd.read_csv('/content/application_test.csv'),
        pd.read_csv('/content/bureau.csv'),
        pd.read_csv('/content/bureau_balance.csv'),
        pd.read_csv('/content/POS_CASH_balance.csv'),
        pd.read_csv('/content/credit_card_balance.csv'),
        # Currently disabled tables:
        # pd.read_csv('/content/previous_application.csv'),
        # pd.read_csv('/content/installments_payments.csv'),

        # default-of-credit-card-clients-datasets (currently disabled)
        # pd.read_csv('/content/default_ucr.csv'),
    ],

    # One entry per table, in the same order as 'datasets'. The list is longer
    # than 'datasets' because it also has entries for the disabled tables;
    # only the leading entries that have a loaded table are used.
    'name_dropped_columns': [
        # home-credit-default-risk tables
        ['SK_ID_CURR', 'TARGET'],
        ['SK_ID_CURR'],
        ['SK_ID_CURR', 'SK_ID_BUREAU'],
        ['SK_ID_BUREAU'],
        ['SK_ID_PREV', 'SK_ID_CURR'],
        ['SK_ID_PREV', 'SK_ID_CURR'],
        ['SK_ID_PREV', 'SK_ID_CURR'],
        ['SK_ID_PREV', 'SK_ID_CURR'],

        # default-of-credit-card-clients-datasets
        ['ID', 'default.payment.next.month'],
    ],
}

# Keep the ID and target columns of every loaded table separately
dict['dropped_columns'] = [
    frame[names]
    for frame, names in zip(dict['datasets'], dict['name_dropped_columns'])
]

# Replace every loaded table by its feature-only version (ID/target removed)
dict['datasets'] = [
    frame.drop(columns=names)
    for frame, names in zip(dict['datasets'], dict['name_dropped_columns'])
]

In [None]:
# Distinct column dtypes (as strings) found in each loaded table
[np.unique([str(dtype) for dtype in frame.dtypes]) for frame in dict['datasets']]

[array(['float64', 'int64', 'object'], dtype='<U7'),
 array(['float64', 'int64', 'object'], dtype='<U7'),
 array(['float64', 'int64', 'object'], dtype='<U7'),
 array(['int64', 'object'], dtype='<U6'),
 array(['float64', 'int64', 'object'], dtype='<U7'),
 array(['float64', 'int64', 'object'], dtype='<U7')]

In [None]:
# Use the first table (read from application_train.csv, ID/target columns
# already removed) as the working frame for the cells below.
X = dict['datasets'][0]

In [None]:
# Time both cleaning steps on X with n_jobs = 1 as a baseline.
# NOTE(review): FCleaning comes from the project-local `preprocessing` module;
# n_jobs is presumably the number of worker processes - confirm.
fcleaning = FCleaning('iqr_proximity_rule', 
                      n_jobs = 1,
                      chunks = None,
                      #path = './trial1.csv', #None,
                      )

s_time = time.time()
fcleaning.emptyness_elimination(X)
fcleaning.outliers_elimination(X)
# Elapsed wall-clock seconds for both steps; shown as the cell output.
time.time() - s_time

13.082415342330933

In [None]:
# Repeat the timing of both cleaning steps with n_jobs = 2 and print the
# number of CPUs reported by multiprocessing for reference.
# NOTE(review): this runs on the same X as the previous timing cell; if
# FCleaning modifies X in place the two timings are not directly comparable -
# confirm.
print('\n CPUs avail:{} \n'.format(mp.cpu_count()))
fcleaning = FCleaning('iqr_proximity_rule', 
                      n_jobs = 2,
                      chunks = None,
                      #path = './trial1.csv', #None,
                      )

s_time = time.time()
fcleaning.emptyness_elimination(X)
fcleaning.outliers_elimination(X)
# Elapsed wall-clock seconds for both steps; shown as the cell output.
time.time() - s_time


 CPUs avail:2 



10.717444658279419

In [None]:
# Run both cleaning steps over every loaded table and report the wall-clock
# time of each step. Uses the `fcleaning` instance created in an earlier cell.
for i, X in enumerate(dict['datasets']):
    # Step 1: emptyness elimination
    print('\n dataset N {} | size: {} | emptyness_elimination ...'.format(i, X.shape))
    s_time = time.time()
    fcleaning.emptyness_elimination(X)
    print('\n dataset N {}, emptyness_elimination completed successfully, time: {}'.format(i, time.time() - s_time))

    # Step 2: outliers elimination (the timer is started before the progress message)
    s_time = time.time()
    print('\n dataset N {}  | size: {} | outliers_elimination ...'.format(i, X.shape))
    fcleaning.outliers_elimination(X)
    print('\n dataset N {}, outliers_elimination completed successfully, time: {}'.format(i, time.time() - s_time))


 dataset N 0 | size: (307511, 120) | emptyness_elimination ...

 dataset N 0, emptyness_elimination completed successfully, time: 4.282975196838379

 dataset N 0  | size: (307511, 120) | outliers_elimination ...

 dataset N 0, outliers_elimination completed successfully, time: 6.613495349884033

 dataset N 1 | size: (48744, 120) | emptyness_elimination ...

 FLAG_DOCUMENT_2, unique values: [0]

 FLAG_DOCUMENT_10, unique values: [0]

 FLAG_DOCUMENT_12, unique values: [0]

 FLAG_DOCUMENT_13, unique values: [0]

 FLAG_DOCUMENT_14, unique values: [0]

 FLAG_DOCUMENT_15, unique values: [0]

 FLAG_DOCUMENT_16, unique values: [0]

 FLAG_DOCUMENT_17, unique values: [0]

 FLAG_DOCUMENT_19, unique values: [0]

 FLAG_DOCUMENT_20, unique values: [0]

 FLAG_DOCUMENT_21, unique values: [0]

 dataset N 1, emptyness_elimination completed successfully, time: 1.0747668743133545

 dataset N 1  | size: (48744, 120) | outliers_elimination ...

 dataset N 1, outliers_elimination completed successfully, tim

In [None]:
class FImputation(object):
    '''
      Dealing with missing values:
      We will use simple techniques with regards to the model that we use.
      For tree-based models, NaNs will be filled in with extreme values (or zeros).
      For regression with means and medians for numerical and categorical types
      respectively (not implemented yet).

      Parameters
      ----------
      model_type : str
          'tree-based' (implemented) or 'regression-based' (not implemented).
      fill_with_value : str
          For 'tree-based': 'zeros' fills missing values with 0,
          'extreme_values' fills each numeric column with its own value of
          largest absolute magnitude.
      n_jobs : int
          Number of worker processes; 1 runs in the current process.
      chunks : int or None
          Number of columns per work unit. None derives it from n_jobs.
      path : str or None
          If given, the imputed frame is also written there as CSV.
    '''

    def __init__(self, model_type, fill_with_value = None, 
                    n_jobs = 1, chunks = None, 
                    path = None,
                    ):
        
        self.model_type = model_type

        self.fill_with_value = fill_with_value
        
        self.n_jobs = n_jobs
        self.chunks = chunks
        self.path = path


    @staticmethod
    def _fill_with_extremes(X):
        '''Return a copy of X whose numeric columns have NaNs replaced by the
        column's own value of largest absolute magnitude.'''
        filled = X.copy()
        # Positional access keeps this correct even with duplicated column labels.
        for position in range(filled.shape[1]):
            series = filled.iloc[:, position]
            if not series.isna().any():
                continue
            if not pd.api.types.is_numeric_dtype(series):
                # An "extreme value" is undefined for non-numeric data; leave as is.
                continue
            observed = series.dropna()
            if observed.empty:
                # Nothing to derive a fill value from.
                continue
            # First occurrence wins on ties.
            extreme = observed.iloc[int(observed.abs().to_numpy().argmax())]
            filled.iloc[:, position] = series.fillna(extreme)
        return filled


    def impute_(self, X):
        '''
          Impute one block of columns and return the result as a new frame
          (the input is not modified).

          Raises
          ------
          ValueError
              Unknown model_type, or unknown fill_with_value for 'tree-based'.
          NotImplementedError
              model_type == 'regression-based'.
        '''
        if self.model_type == 'tree-based':
            if self.fill_with_value == 'zeros':
                return X.fillna(0)

            if self.fill_with_value == 'extreme_values':
                return self._fill_with_extremes(X)

            raise ValueError('Identify fill_with_value parameter')

        if self.model_type == 'regression-based':
            # TODO: mean for numerical and median for categorical columns.
            raise NotImplementedError('regression-based imputation is not implemented yet')

        raise ValueError('Unknown model_type: {!r}'.format(self.model_type))


    def impute(self, X):
        '''
          Impute all columns of X, splitting the work column-wise.

          The columns are cut into consecutive blocks of `chunks` columns
          (or, when `chunks` is None, into about `n_jobs` equal blocks), each
          block is handled by `impute_`, and the results are concatenated back
          in the original column order. With n_jobs == 1 (or a single block)
          no worker processes are started.

          Returns the imputed frame; X itself is left untouched. If `path` is
          set, the result is also written there as CSV.
        '''
        n_columns = len(X.columns)
        if n_columns == 0:
            imputed = X.copy()
        else:
            n_jobs = max(1, int(self.n_jobs))
            if self.chunks is None:
                # Ceil division, kept local so the instance can be reused on
                # frames of a different width.
                chunk_size = max(1, -(-n_columns // n_jobs))
            else:
                chunk_size = int(self.chunks)
                if chunk_size < 1:
                    raise ValueError('chunks must be a positive integer')

            pieces = [X.iloc[:, start: start + chunk_size]
                      for start in range(0, n_columns, chunk_size)]

            if n_jobs == 1 or len(pieces) == 1:
                results = [self.impute_(piece) for piece in pieces]
            else:
                # The context manager releases the worker processes.
                with Pool(processes = n_jobs) as pool:
                    results = pool.map(self.impute_, pieces)

            imputed = pd.concat(results, axis=1)

        if self.path is not None:
            imputed.to_csv(self.path)
        return imputed

In [None]:
# Time zero-filling of X with a single job as a baseline.
fimputation = FImputation('tree-based',
                          fill_with_value='zeros',
                          n_jobs=1
                         )
s_time = time.time()
# The frame returned by impute() is not assigned; this cell only measures time.
fimputation.impute(X)
# Elapsed wall-clock seconds; shown as the cell output.
time.time() - s_time

4.768680095672607

In [None]:
# Repeat the zero-filling timing with n_jobs=2 for comparison.
fimputation = FImputation('tree-based',
                          fill_with_value='zeros',
                          n_jobs=2
                         )
s_time = time.time()
# The frame returned by impute() is not assigned; this cell only measures time.
fimputation.impute(X)
# Elapsed wall-clock seconds; shown as the cell output.
time.time() - s_time

4.519124746322632

In [None]:
class FEncoding(object):
    '''
      Feature encoding helpers: turning relative time offsets into dates,
      splitting columns into categorical / numerical, and bucketing numerical
      columns.

      The methods with a trailing underscore do the actual work on a frame
      (modifying it in place and returning it where they return a frame).
      The methods without the underscore are placeholders that currently
      return X unchanged.

      Attributes
      ----------
      categor_types, numer_types : list of str
          dtype names treated as categorical / numerical by `pick_categor_`.
      categor_columns, numer_columns : list
          Column names collected by `pick_categor_`.
    '''

    def __init__(self, 
                 n_jobs = 1, chunks = None, 
                 path = None,
                 ):
      
        self.n_jobs = n_jobs
        self.chunks = chunks
        self.path = path

        self.categor_types = ['object', 'bool', 'int32', 'int64']
        self.numer_types = ['float', 'float32', 'float64']

        self.categor_columns = []
        self.numer_columns = []
    
    def dtime_to_data_(self, X, 
                       dtime_col_names = None,
                       start_date = '2020-01-01', 
                       time_encode = False, 
                       drop_current = False):
        '''
          Convert relative time offsets into calendar dates.

          Parameters
          ----------
          X : DataFrame, modified in place.
          dtime_col_names : dict or None
              Maps 'ddays' / 'dmonths' / 'dyears' to lists of columns holding
              offsets in days / months / years. Missing keys mean "no columns".
              A month counts as 30 days and a year as 365 days.
          start_date : anything accepted by pd.Timestamp
              Date the offsets are relative to.
          time_encode : bool
              If True, every datetime column of X is additionally split into
              '<col>_year', '<col>_month' and '<col>_day'.
          drop_current : bool
              If True, the offset columns are dropped after conversion and the
              datetime columns are dropped after being split.

          Returns X.
        '''
        if dtime_col_names is None:
            dtime_col_names = {}

        # Time index
        start_date = pd.Timestamp(start_date)

        # Explicit mapping so the multiplier never depends on dict key order.
        days_per_unit = {'ddays': 1, 'dmonths': 30, 'dyears': 365}

        for unit_key, k in days_per_unit.items():
            for column in dtime_col_names.get(unit_key, []):
                X[column + '_date'] = start_date + pd.to_timedelta(X[column] * k, unit='D')
                if drop_current:
                    X.drop(columns=[column], inplace=True)

        if time_encode:
            # Snapshot the columns: the loop adds and may drop columns.
            for column in list(X.columns):
                if pd.api.types.is_datetime64_any_dtype(X[column]):
                    X[column + '_year'] = X[column].dt.year
                    X[column + '_month'] = X[column].dt.month
                    X[column + '_day'] = X[column].dt.day
                    if drop_current:
                        X.drop(columns=[column], inplace=True)
        return X    
    
    def pick_categor_(self, X, n_unique = 50):
        '''
          Sort the columns of X into `self.categor_columns` and
          `self.numer_columns`.

          Sometimes a categorical feature is stored with a float type, so a
          column whose dtype is in `self.numer_types` but which has fewer than
          `n_unique` distinct values is treated as categorical (and reported).
          Columns whose dtype is in `self.categor_types` are categorical;
          everything else is numerical.

          Calling this again does not create duplicate entries; a column is
          moved if its classification changed.
        '''
        for column in X.columns:
            c_type = str(X[column].dtype) 
            unique_values = X[column].unique()

            if c_type in self.numer_types and len(unique_values) < n_unique:
                print('\n {} has type {} and unique values: {} -> {}, will be considered as categorical \n'.format(
                    column, c_type, len(unique_values), unique_values))
                target, other = self.categor_columns, self.numer_columns

            elif c_type in self.categor_types:
                target, other = self.categor_columns, self.numer_columns

            else:
                target, other = self.numer_columns, self.categor_columns

            if column in other:
                other.remove(column)
            if column not in target:
                target.append(column)
    
    def bucket_numerical_(self, X, n_bins=5, drop_current = False):
        '''
          Add a '<col>_bucketed' column (ordinal K-bins discretization with
          `n_bins` bins) for every column of X listed in `self.numer_columns`.

          X is modified in place and returned. With drop_current=True the
          original of each bucketed column is dropped; other columns are kept.

          NOTE(review): presumably requires the bucketed columns to be free of
          missing values (run imputation first) - confirm against
          KBinsDiscretizer's input validation.
        '''
        # TODO: specify or introduce a criterion which columns to bucket
        # K-bins discretization
        discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal') 
        # Snapshot the columns: the loop adds and may drop columns.
        for column in list(X.columns):
            if column not in self.numer_columns:
                continue
            values = X[[column]].to_numpy()
            X[column + '_bucketed'] = discretizer.fit_transform(values).ravel()
            if drop_current:
                X.drop(columns=[column], inplace=True)

        return X

    def encode_categor_(self, X):
        '''Placeholder for categorical encoding; currently returns X unchanged.'''
        # TODO: implement
        return X


    def dtime_to_data(self, X, dtime_col_names, start_day, time_encode = False):
        '''Placeholder; currently returns X unchanged.'''
        # TODO: implement
        return X

    def pick_categor(self, X):
        '''Placeholder; currently returns X unchanged.'''
        # TODO: implement
        return X
    
    def bucket_numerical(self, X):
        '''Placeholder; currently returns X unchanged.'''
        # TODO: implement
        return X

    def encode_categor(self, X):
        '''Placeholder; currently returns X unchanged.'''
        # TODO: implement
        return X


In [None]:
# Quantile-bucket one column into (at most) 3 bins.
# With duplicates='drop', repeated quantile edges are merged, so fewer than 3
# bins may remain; a fixed list of 3 labels then no longer matches the bins
# and qcut raises ValueError. labels=False returns integer bin codes instead,
# which works for any number of remaining bins.
pd.qcut(X['AMT_REQ_CREDIT_BUREAU_HOUR'].to_numpy(), 3, labels=False, duplicates='drop')

ValueError: ignored

In [None]:
class FScaling(object):
  '''
    Wrapper around a feature scaler.

    `scaler` is stored on the instance; `scale` is still a placeholder and
    does not apply it yet.
  '''
  def __init__(self, scaler):
    self.scaler = scaler

  def scale(self, X):
    '''Placeholder for scaling; currently returns X unchanged.'''
    # TODO: apply self.scaler to X
    return X
