In [1]:
# Standard Libraries
import os
import glob
import shutil
import nvidia_smi
import pickle

# External Dependencies
import cupy as cp
import cudf
import dask_cudf
from dask_cuda import LocalCUDACluster
import dask
from dask.distributed import Client
from dask.utils import parse_bytes
from dask.delayed import delayed
import dask.dataframe as dd
import dask.array as da
import rmm
 
import dask_ml
from dask_ml.preprocessing import OneHotEncoder

from pathlib import Path
import pandas as pd
import numpy as np

# NVTabular
import nvtabular as nvt
import nvtabular.ops as ops
from nvtabular.io import Shuffle
from nvtabular.utils import device_mem_size

import logging
import warnings
# Install a blanket 'ignore' filter: no category/module/message is given, so
# every warning raised after this point is suppressed.
# NOTE(review): this also hides deprecation warnings from the imported
# libraries — consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings('ignore')

In [2]:
# Create the Dask client through the project helper in fencoding_GPUs.
# NOTE(review): n_gpus=-1 presumably means "use every available GPU" (the
# recorded output reports 2 GPUs available and 2 workers) — confirm in
# set_cluster_client.
# NOTE(review): device_spill_frac=0.8 is presumably the fraction of device
# memory at which workers start spilling — TODO confirm against the helper.
from fencoding_GPUs import set_cluster_client
client = set_cluster_client(n_gpus=-1, device_spill_frac=0.8)

Dask Version: 2.30.0
Dask cuDF Version: 0.16.0


 n_gpus_avail: 2


0,1
Client  Scheduler: tcp://127.0.0.1:43277  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 2  Memory: 404.32 GB



 Dashboard avail: http://localhost:8888/proxy/8787/status


In [3]:
# Read both source tables eagerly and store them next to the column names
# listed under 'name_dropped_columns'; the two lists line up by position
# (index 0 = home-credit-default-risk, index 1 = default-of-credit-card-clients).
# NOTE(review): the name `dict` shadows the Python builtin for the rest of the
# session. It is kept unchanged because other cells look the data up through it.
home_credit_frame = pd.read_csv('./data/application_train.csv')
credit_card_frame = pd.read_csv('./data/default_ucr.csv')

dict = {
    'datasets': [home_credit_frame, credit_card_frame],
    'name_dropped_columns': [
        ['SK_ID_CURR', 'TARGET'],
        ['ID', 'default.payment.next.month'],
    ],
}

In [4]:
from fencoding_CPUs import reduce_mem_usage

In [5]:
# Pick the first source table (read from ./data/application_train.csv) and pass
# it through the project's reduce_mem_usage helper; the recorded run reports
# 286.23 MB before and 92.38 MB after.
# NOTE(review): whether the helper also modifies the frame still referenced by
# dict['datasets'][0] in place is not visible from here — confirm.
X = dict['datasets'][0]
X = reduce_mem_usage(X)


Memory usage of dataframe is 286.23 MB
Memory usage after optimization is: 92.38 MB
Memory usage decreased by 67.7%


In [6]:
# Keep only the first 10 columns (positional slice over all rows), rebinding X,
# then display the resulting frame as the cell output.
X = X.iloc[:,:10]
X

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5
...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0
307507,456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5
307508,456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0
307509,456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0


In [7]:
from fencoding_GPUs import FEncoding_advanced

In [8]:
# Build the encoder around the Dask client created earlier.
# NOTE(review): rest_col_names presumably names columns that are set aside from
# type inference (neither TARGET nor SK_ID_CURR appears in the recorded
# initialize_types output) — confirm in FEncoding_advanced.
fencoding = FEncoding_advanced(client, rest_col_names=['TARGET', 'SK_ID_CURR'])

In [9]:
# Run the encoder's empty-column elimination and rebind X to its return value;
# the recorded run reports an empty set of columns to drop.
X = fencoding.elim_empty_columns(X, file_path=None)  # commented-out alternative: file_path='test_out.csv'


 columns to drop: {}


In [10]:
# Ask the encoder to sort the columns of X into categorical / numeric / time
# groups; with return_dtype=True the call's result (shown as the cell output)
# also carries a dtype list per group.
# NOTE(review): n_unique_val_th=50 is presumably the unique-value count that
# separates categorical from numeric columns — confirm in initialize_types.
# NOTE(review): in the recorded output CNT_CHILDREN is listed under both
# 'categor_columns' and 'numer_columns', 'categor_columns_dtypes' has four
# entries for five columns, and 'int8' sits second in 'numer_columns_dtypes'
# while CNT_CHILDREN is last in 'numer_columns' — verify this is intended.
type_init_options = {
    'n_unique_val_th': 50,
    'categor_columns_keep': [],
    'numer_columns_keep': ['CNT_CHILDREN'],
    'return_dtype': True,
    'file_name': 'out_dict.pkl',
}
fencoding.initialize_types(X, **type_init_options)

{'categor_columns': ['NAME_CONTRACT_TYPE',
  'CODE_GENDER',
  'FLAG_OWN_CAR',
  'FLAG_OWN_REALTY',
  'CNT_CHILDREN'],
 'numer_columns': ['AMT_INCOME_TOTAL',
  'AMT_CREDIT',
  'AMT_ANNUITY',
  'CNT_CHILDREN'],
 'time_columns': [],
 'categor_columns_dtypes': ['object', 'object', 'object', 'object'],
 'numer_columns_dtypes': ['float32', 'int8', 'float32', 'float32'],
 'time_columns_dtypes': []}

In [11]:
# Run the encoder's processing step on X with TARGET as the label column and
# display the returned frame as the cell output.
# encode_categor_type and outliers_detection_technique are passed as None
# (presumably disabling those steps — confirm); the inline comments keep the
# alternative values that were tried.
# NOTE(review): the recorded output's index ends around 153753 while the input
# frame's index ends around 307509, and its first rows differ from the input's
# first rows — confirm whether the result is shuffled and/or only one partition.
fencoding.processing(X, y_names=['TARGET'], 
                     encode_categor_type = None,  # alternative: 'categorify'
                     outliers_detection_technique = None,  # alternative: 'iqr_proximity_rule'
                     fill_with_value = 'extreme_values', 
                     file_path=None)

Unnamed: 0,AMT_ANNUITY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,FLAG_OWN_REALTY,CODE_GENDER,FLAG_OWN_CAR,NAME_CONTRACT_TYPE,TARGET
0,68512.5,0,135000.0,1971072.0,N,F,N,Cash loans,0
1,15916.5,0,90000.0,540000.0,Y,F,N,Cash loans,0
2,22365.0,1,180000.0,765000.0,N,F,Y,Cash loans,0
3,26145.0,0,135000.0,787131.0,Y,F,N,Cash loans,0
4,47808.0,0,207000.0,521280.0,N,M,Y,Cash loans,0
...,...,...,...,...,...,...,...,...,...
153750,8752.5,0,315000.0,284400.0,N,F,N,Cash loans,0
153751,30037.5,0,270000.0,678996.0,Y,F,Y,Cash loans,0
153752,20979.0,0,157500.0,408330.0,Y,M,N,Cash loans,0
153753,29376.0,1,121500.0,539100.0,Y,F,Y,Cash loans,1
