In [1]:
!nvidia-smi

Fri Nov 20 17:57:45 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 455.32.00    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:1A:00.0 Off |                    0 |
| N/A   37C    P0    69W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  Off  | 00000000:1C:00.0 Off |                    0 |
| N/A   33C    P0    70W / 300W |      0MiB / 16160MiB |      0%      Default |
|       

In [2]:
# Standard Libraries
import os
import glob
import shutil
import nvidia_smi

# External Dependencies
import cupy as cp
import cudf
import dask_cudf
from dask_cuda import LocalCUDACluster
import dask
from dask.distributed import Client
from dask.utils import parse_bytes
from dask.delayed import delayed
import dask.dataframe as dd
import rmm

from pathlib import Path
import pandas as pd
import numpy as np

# NVTabular
import nvtabular as nvt
import nvtabular.ops as ops
from nvtabular.io import Shuffle
from nvtabular.utils import device_mem_size

import warnings
warnings.filterwarnings('ignore')

import logging

In [3]:
from fencoding_GPUs import set_cluster_client
client = set_cluster_client(n_gpus=-1, device_spill_frac=0.8)

Dask Version: 2.30.0
Dask cuDF Version: 0.16.0


 n_gpus_avail: 2


0,1
Client  Scheduler: tcp://127.0.0.1:34809  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 2  Memory: 404.32 GB



 Dashboard avail: http://localhost:8888/proxy/8787/status


In [4]:
# home-credit-default-risk tables
if not Path("data/application_test.csv").is_file():
    %cd data
    ! wget https://www.dropbox.com/s/j9xwcj9ixki5t2l/home-credit-default-risk.zip?dl=0 -O data.zip
    ! unzip -q data.zip
    ! rm data.zip
# default-of-credit-card-clients-dataset
if not Path("data/default_ucr.csv").is_file():
    %cd data
    ! wget https://www.dropbox.com/s/lj0d7qez18ea7dx/UCI_Credit_Card.csv?dl=0 -O default_ucr.csv
    %cd ..

In [5]:
# Read in the source datasets
dict_ = {
    'datasets':[
                # default-of-credit-card-clients-datasets
                pd.read_csv('./data/default_ucr.csv'),
    ],

    'name_dropped_columns':
                [
                 # default-of-credit-card-clients-datasets
                 ['ID', 'default.payment.next.month']
                 #['default.payment.next.month'] # 'ID' is needed for shuffling
    ],   
}

In [6]:
N_D = 0
X= dict_['datasets'][N_D]
X

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000.0,1,3,1,39,0,0,0,0,...,88004.0,31237.0,15980.0,8500.0,20000.0,5003.0,3047.0,5000.0,1000.0,0
29996,29997,150000.0,1,3,2,43,-1,-1,-1,-1,...,8979.0,5190.0,0.0,1837.0,3526.0,8998.0,129.0,0.0,0.0,0
29997,29998,30000.0,1,2,2,37,4,3,2,-1,...,20878.0,20582.0,19357.0,0.0,0.0,22000.0,4200.0,2000.0,3100.0,1
29998,29999,80000.0,1,3,1,41,1,-1,0,0,...,52774.0,11855.0,48944.0,85900.0,3409.0,1178.0,1926.0,52964.0,1804.0,1


In [32]:
class FEncoding_advanced(object):   
    def __init__(self, client, rest_col_names=[], y_names=[], filename=None):        
        self.filename = filename
        self.rest_col_names = rest_col_names
        self.y_names = y_names
        self.n_gpus = len(client.nthreads())
        self.client = client
        self.output_path="./parquet_data_tmp"
        self.categor_types = ['category', 'object', 'bool', 'int32', 'int64', 'int8']
        self.numer_types = ['float', 'float32', 'float64']
        self.time_types = ['datetime64[ns]', 'datetime64[ns, tz]'] 
        # What else? https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
        # TODO: check if there are any other time types
        
    def elim_empty_columns(self, X, save_to_csv = False):
        # GPU version
        ddf = dd.from_pandas(X, npartitions=self.n_gpus)
        cols_to_drop = []
        for column in ddf.columns:
            if len(ddf[column].unique().compute().values) < 2:
                cols_to_drop.append(column)
        print('\n dropped columns:', cols_to_drop)
        X_final = ddf.drop(cols_to_drop, axis=1).compute()  
        if filename is not None:
            if save_to_csv:
                X_final.to_csv('./data/' + self.filename, index=False)
        return X_final
    
    def initialize_types(self, X, return_dtype=False, save_to_pkl = False, dict_name = 'out_dict.pkl'):
        # GPU version
        X = dd.from_pandas(X, npartitions=self.n_gpus)       
        self.categor_columns, self.numer_columns, self.time_columns = [], [], []
        # Sometimes categorical feature can be presented with a float type. Let's check for that
        f_columns_names =[x for x in list(X.columns)  if x not in self.rest_col_names + self.y_names]
        for column in f_columns_names:
            c_type = str(X[column].dtype) 
            if any(c_type == t for t in self.numer_types):    
                uvs = cp.array(X[column].unique().compute())
                unique_values = list(uvs[~cp.isnan(uvs)])
                if cp.array([el.item().is_integer() for el in unique_values]).sum() == len(unique_values):
                    #print('\n {} has type {} and number of unique values: {}, will be considered as a categorical \n'.format(column, c_type, len(unique_values)))
                    #logging.info(f"{column} has type {c_type} and number of unique values: {len(unique_values)}, will be considered as a categorical")
                    self.categor_columns.append(column)
                else:
                    self.numer_columns.append(column)
            if any(c_type == t for t in self.categor_types):
                self.categor_columns.append(column)
            if any(c_type == t for t in self.time_types):
                self.time_columns.append(column)                             
        out_dict =  {'categor_columns': self.categor_columns,
                'numer_columns': self.numer_columns,
                'time_columns': self.time_columns,                    
         }
        if return_dtype:
            out_dict.update(
                {'categor_columns_dtypes': [str(X[self.categor_columns].dtypes.values[i]) for i in range(len(self.categor_columns))],
                 'numer_columns_dtypes': [str(X[self.numer_columns].dtypes.values[i]) for i in range(len(self.numer_columns))],
                 'time_columns_dtypes': [str(X[self.time_columns].dtypes.values[i]) for i in range(len(self.time_columns))],                    
             })            
        if save_to_pkl:
            output = open('./data/' + dict_name, 'wb')
            pickle.dump(out_dict, output)
            output.close()
        return out_dict
    
    def date_replace(self, X, save_to_csv = False):
        # GPU version
        return X
    
    def outldetect(self, outliers_detection_technique, X_num):
        # GPU version
        X_num = dd.from_pandas(X_num, npartitions=self.n_gpus)
        if outliers_detection_technique == 'iqr_proximity_rule':             
            IQR = (X_num.quantile(0.75).sub(X_num.quantile(0.25)))
            lower = X_num.quantile(0.25).sub(IQR*1.5)
            upper = X_num.quantile(0.75).sub(IQR*1.5)            
        if outliers_detection_technique == 'gaussian_approximation':
            lower = X_num.mean().sub(3 * X_num.std())
            upper = X_num.mean().add(3 * X_num.std())        
        if outliers_detection_technique == 'quantiles':
            lower = X_num.quantile(0.10)
            upper = X_num.quantile(0.90)        
        return list(lower.compute()), list(upper.compute())          
        
    def processing(self, X, 
                   outliers_detection_technique = 'iqr_proximity_rule', #'gaussian_approximation','quantiles'
                   fill_with_value = 'zeros', #'extreme_values', #'mean-median'
                   encoding_method = 'OrdinalEncoder', #'OneHotEncoder'
                   save_to_csv = False,
                  ):
        
        self.initialize_types(X)
        
        workflow = nvt.Workflow(cat_names=self.categor_columns, 
                        cont_names=self.numer_columns,
                        label_name=self.y_names,
                        client=self.client
                       )
        
        # Operators: https://nvidia.github.io/NVTabular/main/api/ops/index.html      
        # Categorify https://nvidia.github.io/NVTabular/main/api/ops/categorify.html
        if len(self.categor_columns) != 0:
            workflow.add_preprocess(
                ops.Categorify(columns = self.categor_columns, out_path='./data/')
            )
        # OutlDetect https://nvidia.github.io/NVTabular/main/api/ops/clip.html
        lower, upper = self.outldetect(outliers_detection_technique, X[self.numer_columns])
        for i in range(len(self.numer_columns)):
            workflow.add_preprocess(
                ops.Clip(min_value=lower[i], max_value=upper[i], columns=[self.numer_columns[i]])
            )
        
        # FillMissing https://nvidia.github.io/NVTabular/main/api/ops/fillmissing.html
        if fill_with_value = 'zeros':
            workflow.add_preprocess(
                ops.FillMissing(fill_val=0, columns=self.categor_columns + self.numer_columns)
            )
        if fill_with_value = 'extreme_values':
            f_columns = self.categor_columns + self.numer_columns
            

            
            for i in range(len(f_columns)):
                workflow.add_preprocess(
                    ops.FillMissing(fill_val=extrim_values[i], columns=[f_columns[i]])
                )
                
            
        
        
        
        

        #workflow.add_preprocess(
             
            #ops.Clip(0, 10, columns=)#min_value=None, max_value=None, columns=f_dict['numer_columns'], replace=True)

            #TODO: change in encode_categor
            #ops.TargetEncoding(cat_groups=f_dict['categor_columns'],
             #                  cont_target=None),

            #TODO: chenge in tree-based models, nana will be filled in with max values (or zeros)
            #ops.FillMissing(fill_val=0, columns=f_dict['categor_columns'] + f_dict['numer_columns'], replace=True),
        #)

        #workflow.add_preprocess(
        #    ops.Categorify(10)
        #)

        #workflow.add_preprocess(
        #    ops.FillMedian()#columns=f_dict['categor_columns'], preprocessing=True, replace=True)

        #)     
        
        #######################################################        
        workflow.finalize()
        dataset = nvt.Dataset(X)
        tmp_output_path="./parquet_data_tmp"
        workflow.apply(
             dataset,
             output_format="parquet",
             output_path=tmp_output_path,
             shuffle=Shuffle.PER_WORKER,  # Shuffle algorithm
             out_files_per_proc=8, # Number of output files per worker
             )
        files = glob.glob(tmp_output_path + "/*.parquet")
        X_final = cudf.read_parquet(files[0])
        for i in range(1, len(files)):    
            X_final = X_final.append(cudf.read_parquet(files[i]))      
        
        # Delete temporary files
        shutil.rmtree(tmp_output_path, ignore_errors=True)
        shutil.rmtree('dask-worker-space', ignore_errors=True)
        
        if save_to_csv:
            try:
                X_final.to_csv('./data/' + self.filename, index=False)
            except TypeError:
                print('Initialize filename!')
        return X_final 

In [33]:
X = pd.DataFrame(np.random.random((10,10)))
X.columns = [str(i) for i in range(10)]

In [34]:
X.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], dtype='object')

In [35]:
fencoding = FEncoding_advanced(client)
fencoding.rest_col_names = ['0']# ['ID']
fencoding.y_names =['3']# ['default.payment.next.month']

# Ready, TODO: check for correctness 
#fencoding.elim_empty_columns(X)
#fencoding.initialize_types(X, return_dtype=True)

In [36]:
fencoding.processing(X, save_to_csv = False)

Unnamed: 0,1,2,4,5,6,7,8,9,3
0,0.606339,0.452833,0.745663,0.191319,-0.035709,-0.118724,0.072983,0.422062,0.676319
1,0.606339,0.452833,0.740056,0.409532,-0.035709,-0.118724,0.072983,0.422062,0.701615
0,0.606339,0.401691,0.740056,0.409532,-0.035709,-0.118724,0.072983,0.422062,0.682129
0,0.516157,0.417332,0.745663,0.409532,-0.035709,-0.118724,0.072983,0.212397,0.892864
1,0.606339,0.452833,0.740056,0.409532,-0.035709,-0.118724,0.072983,0.212397,0.467002
0,0.606339,0.452833,0.740056,0.409532,-0.035709,-0.118724,0.072983,0.422062,0.093468
0,0.606339,0.331425,0.745663,0.409532,-0.035709,-0.118724,0.072983,0.422062,0.408643
1,0.516157,0.452833,0.740056,0.210842,-0.035709,-0.118724,0.072983,0.422062,0.900333
0,0.606339,0.39648,0.745663,0.409532,-0.035709,-0.118724,0.072983,0.422062,0.141202
1,0.516157,0.331425,0.745663,0.191319,-0.035709,-0.118724,0.072983,0.422062,0.564674


In [131]:
X = pd.DataFrame(
    [
        [1,2.2,5.4, 0, 'a','234'],
        [0.2, 5,8, 9.2, 'a', '4s'],
        [-3,4,6,2,'q', 'wd'],
        [-3,4,5,6,'w', 'se']
    
    ]
)
X

Unnamed: 0,0,1,2,3,4,5
0,1.0,2.2,5.4,0.0,a,234
1,0.2,5.0,8.0,9.2,a,4s
2,-3.0,4.0,6.0,2.0,q,wd
3,-3.0,4.0,5.0,6.0,w,se


In [132]:
dataset = dd.from_pandas(X[[0,1]], npartitions=2) 
dataset

Unnamed: 0_level_0,0,1
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1
0,float64,float64
2,...,...
3,...,...


In [130]:
a = cp.array(dataset.where(dataset.abs() == dataset.abs().max()).values.compute()).T
cp.unique(a[0,:])[0].item()

-3.0

In [133]:
dataset = dd.from_pandas(X[[4,5]], npartitions=2) 
dataset

Unnamed: 0_level_0,4,5
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1
0,object,object
2,...,...
3,...,...


Dask Series Structure:
npartitions=2
0    object
2       ...
3       ...
Name: 4, dtype: object
Dask Name: getitem, 4 tasks