In [1]:
# Shell out to nvidia-smi to list the GPUs on this machine (driver/CUDA version, memory, utilisation)
!nvidia-smi

Wed Nov 18 18:42:19 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 455.32.00    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:1D:00.0 Off |                    0 |
| N/A   34C    P0    55W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  Off  | 00000000:1E:00.0 Off |                    0 |
| N/A   36C    P0    41W / 300W |      0MiB / 16160MiB |      0%      Default |
|       

In [2]:
# Standard Libraries
import os
import glob
import shutil
import nvidia_smi

# External Dependencies
import cupy as cp
import cudf
import dask_cudf
from dask_cuda import LocalCUDACluster
import dask
from dask.distributed import Client
from dask.utils import parse_bytes
from dask.delayed import delayed
import rmm

from pathlib import Path
import pandas as pd

# Report the library versions this notebook runs against, followed by a blank line
print('{} {}'.format('Dask Version:', dask.__version__))
print('{} {}'.format('Dask cuDF Version:', dask_cudf.__version__))
print('')

# NVTabular
import nvtabular as nvt
import nvtabular.ops as ops
from nvtabular.io import Shuffle
from nvtabular.utils import device_mem_size

# Deploy a Single-Machine Multi-GPU Cluster
# NVML is initialised first so the device count can be queried.
nvidia_smi.nvmlInit()
n_gpus_avail = nvidia_smi.nvmlDeviceGetCount()
print('n_gpus_avail: {}'.format(n_gpus_avail))

# Select the devices to place workers on: every detected GPU index,
# rendered as a comma-separated string such as "0, 1".
visible_devices = ', '.join(map(str, range(n_gpus_avail)))
print('visible_devices: {}'.format(visible_devices))

# Start the local CUDA cluster on those devices; the device memory limit is
# set to 80% of the total reported by device_mem_size.
cluster = LocalCUDACluster(
    protocol="tcp",  # "tcp" or "ucx"
    CUDA_VISIBLE_DEVICES=visible_devices,
    device_memory_limit=0.8 * device_mem_size(kind="total"),
)

# Create the distributed client (left as the last expression so the cell displays it)
client = Client(cluster)
client

# Dashboard:
#http://localhost:8888/proxy/8787/status

Dask Version: 2.30.0
Dask cuDF Version: 0.16.0

n_gpus_avail: 2
visible_devices: 0, 1


0,1
Client  Scheduler: tcp://127.0.0.1:33081  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 2  Memory: 404.32 GB


In [3]:
# Initialize RMM pool on ALL workers
def _rmm_pool():
    """Re-initialise RMM with the pool allocator enabled and the default pool size."""
    pool_settings = {
        "pool_allocator": True,
        "initial_pool_size": None,  # None -> use default size
    }
    rmm.reinitialize(**pool_settings)


# Run the function through the client; the cell output is a dict mapping
# each worker address to the function's return value (None here).
client.run(_rmm_pool)

{'tcp://127.0.0.1:39953': None, 'tcp://127.0.0.1:42039': None}

In [4]:
# home-credit-default-risk tables
if not Path("data/application_test.csv").is_file():
    %cd data
    ! wget https://www.dropbox.com/s/j9xwcj9ixki5t2l/home-credit-default-risk.zip?dl=0 -O data.zip
    ! unzip -q data.zip
    ! rm data.zip
# default-of-credit-card-clients-dataset
if not Path("data/default_ucr.csv").is_file():
    %cd data
    ! wget https://www.dropbox.com/s/lj0d7qez18ea7dx/UCI_Credit_Card.csv?dl=0 -O default_ucr.csv
    %cd ..

In [5]:
# Read in the source datasets
dict_ = {
    # One DataFrame per source table
    'datasets': [
        # default-of-credit-card-clients-datasets
        pd.read_csv('./data/default_ucr.csv'),
    ],
    # Per table (same order as 'datasets'): the ID / target columns to split off
    'name_dropped_columns': [
        # default-of-credit-card-clients-datasets
        ['ID', 'default.payment.next.month'],
        #['default.payment.next.month'] # 'ID' is needed for shuffling
    ],
}

# Keep ID and target columns separately
dict_['dropped_columns'] = [
    table[names]
    for table, names in zip(dict_['datasets'], dict_['name_dropped_columns'])
]

# Drop ID and target columns from the tables
dict_['datasets'] = [
    table.drop(columns=names)
    for table, names in zip(dict_['datasets'], dict_['name_dropped_columns'])
]

In [24]:
# Index of the dataset (within dict_) that the rest of the notebook works on
N_D = 0
X = dict_['datasets'][N_D]              # table with the ID/target columns removed
X_rest = dict_['dropped_columns'][N_D]  # the ID/target columns that were split off

from fencoding_CPUs import FEncoding
# Build the column-type dictionary for X; later cells read its
# 'categor_columns' and 'numer_columns' entries.
f_dict = FEncoding().initialize_types(
    X,
    return_dtype=False,
)
#f_dict

In [25]:
# ---------------------------------------------------------------
# Directory the workflow writes its parquet output to (removed again later)
output_path = "./parquet_data_tmp"

# NVTabular dataset over the raw CSV.
# NOTE(review): part_mem_fraction presumably sizes each partition as a
# fraction of GPU memory; confirm against the installed NVTabular version.
dataset = nvt.Dataset(
    './data/default_ucr.csv',
    part_mem_fraction=0.1,
)

In [26]:
# Initialize our Workflow
# Categorical / continuous column names come from f_dict (built from the
# dataset selected by N_D). The label is the last entry of that same
# dataset's dropped-column names, so it is indexed with N_D rather than a
# literal 0 to stay in sync if another dataset is selected.
workflow = nvt.Workflow(
    cat_names=f_dict['categor_columns'],
    cont_names=f_dict['numer_columns'],
    label_name=[dict_['name_dropped_columns'][N_D][-1]],
    client=client,
)

In [27]:
# Operators: https://nvidia.github.io/NVTabular/main/api/ops/index.html

# TODO: change in OutlDetect
# Clip the columns listed in f_dict['categor_columns'] to the range [0, 10].
workflow.add_preprocess(
    ops.Clip(min_value=0, max_value=10, columns=f_dict['categor_columns'])
)

# TODO: change in encode_categor
# workflow.add_preprocess(
#     ops.TargetEncoding(cat_groups=f_dict['categor_columns'], cont_target=None)
# )

# TODO: change in tree-based models, NaNs will be filled in with max values (or zeros)
# workflow.add_preprocess(
#     ops.FillMissing(fill_val=0, columns=f_dict['categor_columns'] + f_dict['numer_columns'], replace=True)
# )

# NOTE(review): the positional 10 is presumably Categorify's frequency
# threshold; confirm against the installed NVTabular version.
workflow.add_preprocess(ops.Categorify(10))

# workflow.add_preprocess(
#     ops.FillMedian()  # columns=f_dict['categor_columns'], preprocessing=True, replace=True
# )

# Finalize the workflow once all operators have been added
workflow.finalize()

In [28]:
# Apply the workflow to the dataset and write the result as parquet files
workflow.apply(
    dataset,
    shuffle=Shuffle.PER_WORKER,  # Shuffle algorithm
    out_files_per_proc=4,  # Number of output files per worker
    output_path=output_path,
    output_format="parquet",
)

In [29]:
# Gather the parquet parts written by the workflow. Sorting makes the order in
# which parts are concatenated reproducible (glob order is arbitrary).
files = sorted(glob.glob(output_path + "/*.parquet"))
if not files:
    # Fail with an explicit message instead of an IndexError on files[0]
    raise FileNotFoundError(
        "No parquet files found in {!r}; run the workflow.apply cell first".format(output_path)
    )

# Read every part and concatenate once, rather than growing the frame with
# repeated append calls (each of which copies everything read so far).
X_final = cudf.concat([cudf.read_parquet(part) for part in files])

# Persist the processed table as CSV, then remove the temporary outputs
X_final.to_csv('./data/default_ucr_processed.csv', index=False)
shutil.rmtree(output_path, ignore_errors=True)
shutil.rmtree('dask-worker-space', ignore_errors=True)

In [30]:
# Read the processed CSV back with pandas and display it to check what was written
pd.read_csv('./data/default_ucr_processed.csv')

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,0,2,4,2,0,3,3,3,3,3,...,0,0,0,7,8,0,9,0,0,0
1,0,2,3,3,0,3,3,3,3,3,...,0,0,0,7,8,0,9,0,1,0
2,0,2,4,2,0,3,3,3,3,3,...,0,0,0,7,8,0,9,0,0,0
3,0,2,3,2,0,3,3,3,3,3,...,0,0,0,7,8,0,9,1,0,0
4,0,2,3,2,0,3,3,3,3,3,...,0,0,0,1,8,0,9,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0,2,2,3,0,3,3,3,3,3,...,0,0,0,7,8,0,9,0,1,0
29996,0,2,3,2,0,5,5,6,6,6,...,0,0,0,1,1,1,1,1,1,1
29997,0,2,2,3,0,4,5,4,4,3,...,0,0,0,7,8,1,9,0,0,1
29998,0,2,2,3,0,3,3,3,3,3,...,0,0,0,7,8,0,9,0,0,0
