In [1]:
!nvidia-smi

Fri Nov 20 13:39:37 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 455.32.00    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:1D:00.0 Off |                    0 |
| N/A   31C    P0    54W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  Off  | 00000000:1E:00.0 Off |                    0 |
| N/A   33C    P0    52W / 300W |      0MiB / 16160MiB |      0%      Default |
|       

In [2]:
# Standard Libraries
import os
import glob
import shutil
import nvidia_smi

# External Dependencies
import cupy as cp
import cudf
import dask_cudf
from dask_cuda import LocalCUDACluster
import dask
from dask.distributed import Client
from dask.utils import parse_bytes
from dask.delayed import delayed
import dask.dataframe as dd
import rmm

from pathlib import Path
import pandas as pd
import numpy as np

# NVTabular
import nvtabular as nvt
import nvtabular.ops as ops
from nvtabular.io import Shuffle
from nvtabular.utils import device_mem_size

import warnings
warnings.filterwarnings('ignore')

import logging

In [3]:
from fencoding_GPUs import set_cluster_client
client = set_cluster_client(n_gpus=-1, device_spill_frac=0.8)

Dask Version: 2.30.0
Dask cuDF Version: 0.16.0


 n_gpus_avail: 2


0,1
Client  Scheduler: tcp://127.0.0.1:43809  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 2  Memory: 404.32 GB



 Dashboard avail: http://localhost:8888/proxy/8787/status


In [4]:
# home-credit-default-risk tables
if not Path("data/application_test.csv").is_file():
    %cd data
    ! wget https://www.dropbox.com/s/j9xwcj9ixki5t2l/home-credit-default-risk.zip?dl=0 -O data.zip
    ! unzip -q data.zip
    ! rm data.zip
# default-of-credit-card-clients-dataset
if not Path("data/default_ucr.csv").is_file():
    %cd data
    ! wget https://www.dropbox.com/s/lj0d7qez18ea7dx/UCI_Credit_Card.csv?dl=0 -O default_ucr.csv
    %cd ..

In [5]:
# Read in the source datasets
dict_ = {
    'datasets':[
                # default-of-credit-card-clients-datasets
                pd.read_csv('./data/default_ucr.csv'),
    ],

    'name_dropped_columns':
                [
                 # default-of-credit-card-clients-datasets
                 ['ID', 'default.payment.next.month']
                 #['default.payment.next.month'] # 'ID' is needed for shuffling
    ],   
}

In [6]:
N_D = 0
X= dict_['datasets'][N_D]

In [7]:
class FEncoding_advanced(object):   
    def __init__(self, client, rest_col_names=[], y_names=[], filename=None):        
        self.filename = filename
        self.rest_col_names = rest_col_names
        self.y_names = y_names
        self.n_gpus = len(client.nthreads())
        self.client = client
        self.output_path="./parquet_data_tmp"
        self.categor_types = ['category', 'object', 'bool', 'int32', 'int64', 'int8']
        self.numer_types = ['float', 'float32', 'float64']
        self.time_types = ['datetime64[ns]', 'datetime64[ns, tz]'] 
        # What else? https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
        # TODO: check if there are any other time types
        
    def elim_empty_columns(self, X, save_to_csv = False):
        # GPU version
        ddf = dd.from_pandas(X, npartitions=self.n_gpus)
        cols_to_drop = []
        for column in ddf.columns:
            if len(ddf[column].unique().compute().values) < 2:
                cols_to_drop.append(column)
        print('\n dropped columns:', cols_to_drop)
        X_final = ddf.drop(cols_to_drop, axis=1).compute()  
        if filename is not None:
            if save_to_csv:
                X_final.to_csv('./data/' + self.filename, index=False)
        return X_final
    
    def initialize_types(self, X, return_dtype=False, save_to_pkl = False, dict_name = 'out_dict.pkl'):
        # GPU version
        X = dd.from_pandas(X, npartitions=2)       
        self.categor_columns, self.numer_columns, self.time_columns = [], [], []
        # Sometimes categorical feature can be presented with a float type. Let's check for that
        f_columns_names =[x for x in list(X.columns)  if x not in self.rest_col_names + self.y_names]
        for column in f_columns_names:
            c_type = str(X[column].dtype) 
            if any(c_type == t for t in self.numer_types):    
                uvs = cp.array(X[column].unique().compute())
                unique_values = list(uvs[~cp.isnan(uvs)])
                if cp.array([el.item().is_integer() for el in unique_values]).sum() == len(unique_values):
                    #print('\n {} has type {} and number of unique values: {}, will be considered as a categorical \n'.format(column, c_type, len(unique_values)))
                    #logging.info(f"{column} has type {c_type} and number of unique values: {len(unique_values)}, will be considered as a categorical")
                    self.categor_columns.append(column)
                else:
                    self.numer_columns.append(column)
            if any(c_type == t for t in self.categor_types):
                self.categor_columns.append(column)
            if any(c_type == t for t in self.time_types):
                self.time_columns.append(column)                             
        out_dict =  {'categor_columns': self.categor_columns,
                'numer_columns': self.numer_columns,
                'time_columns': self.time_columns,                    
         }
        if return_dtype:
            out_dict.update(
                {'categor_columns_dtypes': [str(X[self.categor_columns].dtypes.values[i]) for i in range(len(self.categor_columns))],
                 'numer_columns_dtypes': [str(X[self.numer_columns].dtypes.values[i]) for i in range(len(self.numer_columns))],
                 'time_columns_dtypes': [str(X[self.time_columns].dtypes.values[i]) for i in range(len(self.time_columns))],                    
             })            
        if save_to_pkl:
            output = open('./data/' + dict_name, 'wb')
            pickle.dump(out_dict, output)
            output.close()
        return out_dict
    
    def date_replace(self, X, save_to_csv = False):
        # GPU version
        return X
        
    
    def processing(self, X, 
                  outliers_detection_technique = 'iqr_proximity_rule', #'gaussian_approximation','quantiles'
                  fill_with_value = 'zeros', #'extreme_values', #'mean-median'
                  encoding_method = 'OrdinalEncoder', #'OneHotEncoder'
                  save_to_csv = False,
                  ):
        
        self.initialize_types(X,  return_dtype=False)
        dataset = nvt.Dataset(X)
        
        # Initalize our Workflow
        workflow = nvt.Workflow(cat_names=self.categor_columns, 
                        cont_names=self.numer_columns,
                        label_name=self.y_names,
                        client=self.client
                       )
        
        # Operators: https://nvidia.github.io/NVTabular/main/api/ops/index.html
        # OutlDetect
        
        
        

        workflow.add_preprocess(
            #TODO: change in OutlDetect 
            ops.Clip(0, 10, columns=)#min_value=None, max_value=None, columns=f_dict['numer_columns'], replace=True)

            #TODO: change in encode_categor
            #ops.TargetEncoding(cat_groups=f_dict['categor_columns'],
             #                  cont_target=None),

            #TODO: chenge in tree-based models, nana will be filled in with max values (or zeros)
            #ops.FillMissing(fill_val=0, columns=f_dict['categor_columns'] + f_dict['numer_columns'], replace=True),
        )

        #workflow.add_preprocess(
        #    ops.Categorify(10)
        #)

        #workflow.add_preprocess(
        #    ops.FillMedian()#columns=f_dict['categor_columns'], preprocessing=True, replace=True)

        #)     
        
        #######################################################        
        workflow.finalize()
        tmp_output_path="./parquet_data_tmp"
        workflow.apply(
            dataset,
             output_format="parquet",
             output_path=tmp_output_path,
             shuffle=Shuffle.PER_WORKER,  # Shuffle algorithm
             out_files_per_proc=8, # Number of output files per worker
        )
        files = glob.glob(tmp_output_path + "/*.parquet")
        X_final = cudf.read_parquet(files[0])
        for i in range(1, len(files)):    
            X_final = X_final.append(cudf.read_parquet(files[i]))      
        
        # Delete temporary files
        shutil.rmtree(tmp_output_path, ignore_errors=True)
        shutil.rmtree('dask-worker-space', ignore_errors=True)
        
        if save_to_csv is not None: 
            X_final.to_csv('./data/' + self.filename, index=False)
        return X_final 

In [8]:
fencoding = FEncoding_advanced(client)
fencoding.rest_col_names = ['ID']
fencoding.y_names = ['default.payment.next.month']

# Ready, TODO: check for correctness 
#fencoding.elim_empty_columns(X)
#fencoding.initialize_types(X, return_dtype=True)

In [11]:
fencoding.processing(X, save_to_csv = False)

NameError: name 'f_dict' is not defined