In [1]:
!nvidia-smi

Fri Nov 27 19:08:13 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 455.32.00    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:1A:00.0 Off |                    0 |
| N/A   33C    P0    68W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  Off  | 00000000:1C:00.0 Off |                    0 |
| N/A   30C    P0    62W / 300W |      0MiB / 16160MiB |      0%      Default |
|       

In [2]:
# Standard Libraries
import os
import glob
import shutil
import nvidia_smi
import pickle

# External Dependencies
import cupy as cp
import cudf
import dask_cudf
from dask_cuda import LocalCUDACluster
import dask
from dask.distributed import Client
from dask.utils import parse_bytes
from dask.delayed import delayed
import dask.dataframe as dd
import rmm
 
import dask_ml
from dask_ml.preprocessing import OneHotEncoder
import dask.array as da

from pathlib import Path
import pandas as pd
import numpy as np

# NVTabular
import nvtabular as nvt
import nvtabular.ops as ops
from nvtabular.io import Shuffle
from nvtabular.utils import device_mem_size

import warnings
warnings.filterwarnings('ignore')

import logging

In [3]:
from fencoding_GPUs import set_cluster_client
client = set_cluster_client(n_gpus=-1, device_spill_frac=0.8)

Dask Version: 2.30.0
Dask cuDF Version: 0.16.0


 n_gpus_avail: 2


0,1
Client  Scheduler: tcp://127.0.0.1:37447  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 2  Memory: 404.32 GB



 Dashboard avail: http://localhost:8888/proxy/8787/status


In [4]:
# home-credit-default-risk tables
if not os.path.isfile("data/application_test.csv"):
    %cd data
    ! wget https://www.dropbox.com/s/j9xwcj9ixki5t2l/home-credit-default-risk.zip?dl=0 -O data.zip
    ! unzip -q data.zip
    ! rm data.zip
# default-of-credit-card-clients-dataset
if not os.path.isfile("data/default_ucr.csv"):
    %cd data
    ! wget https://www.dropbox.com/s/lj0d7qez18ea7dx/UCI_Credit_Card.csv?dl=0 -O default_ucr.csv
    %cd ..

In [5]:
# Read in the source datasets
dict_ = {
    'datasets':[
                # default-of-credit-card-clients-datasets
                pd.read_csv('./data/default_ucr.csv'),
    ],

    'name_dropped_columns':
                [
                 # default-of-credit-card-clients-datasets
                 ['ID', 'default.payment.next.month']
                 #['default.payment.next.month'] # 'ID' is needed for shuffling
    ],   
}

In [6]:
N_D = 0
X= dict_['datasets'][N_D]
#X

In [7]:
from fencoding_GPUs import FEncoding_advanced

In [8]:
fencoding = FEncoding_advanced(client)
fencoding.rest_col_names = ['ID']
fencoding.y_names = ['default.payment.next.month']

# Ready, TODO: check for correctness 
fencoding.elim_empty_columns(X)
fencoding.initialize_types(X, return_dtype=False)


 dropped columns: []


{'categor_columns': ['LIMIT_BAL',
  'SEX',
  'EDUCATION',
  'MARRIAGE',
  'AGE',
  'PAY_0',
  'PAY_2',
  'PAY_3',
  'PAY_4',
  'PAY_5',
  'PAY_6',
  'BILL_AMT1',
  'BILL_AMT2',
  'BILL_AMT3',
  'BILL_AMT4',
  'BILL_AMT5',
  'BILL_AMT6',
  'PAY_AMT1',
  'PAY_AMT2',
  'PAY_AMT3',
  'PAY_AMT4',
  'PAY_AMT5',
  'PAY_AMT6'],
 'numer_columns': [],
 'time_columns': []}

In [9]:
fencoding.processing(X, encode_categor_type = 'categorify', fill_with_value = 'extreme_values', 
                     save_to_csv = False)


 extrim_values: {'LIMIT_BAL': 1000000.0, 'SEX': 1, 'EDUCATION': 0, 'MARRIAGE': 0, 'AGE': 79, 'PAY_0': 7, 'PAY_2': 8, 'PAY_3': 8, 'PAY_4': 1, 'PAY_5': 8, 'PAY_6': 8, 'BILL_AMT1': -165580.0, 'BILL_AMT2': -69777.0, 'BILL_AMT3': -157264.0, 'BILL_AMT4': -170000.0, 'BILL_AMT5': -81334.0, 'BILL_AMT6': -339603.0, 'PAY_AMT1': 5361.0, 'PAY_AMT2': 1684259.0, 'PAY_AMT3': 896040.0, 'PAY_AMT4': 621000.0, 'PAY_AMT5': 4683.0, 'PAY_AMT6': 4731.0}


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,6,2,4,2,10,2,2,2,2,2,...,569,2053,1831,1159,1401,274,2278,1891,2770,0
1,62,2,2,3,16,1,1,1,1,1,...,5314,4388,11791,397,2464,5648,4793,6381,6529,0
2,1,1,3,3,1,3,3,3,3,3,...,4797,4902,4955,1194,992,886,191,1882,1,0
3,52,2,3,2,23,3,3,3,2,2,...,5597,5390,5557,6569,6414,5787,5279,5350,5281,0
4,18,2,3,2,25,3,3,3,3,3,...,19870,18481,18299,5023,4420,4572,3612,3577,3577,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1746,8,2,3,2,27,3,3,3,3,3,...,14190,5506,5577,2618,5455,2183,5335,399,385,0
1747,30,1,2,3,16,3,3,3,3,3,...,18745,17575,14184,5839,5358,3219,2534,1796,1798,0
1748,6,2,2,3,3,3,3,3,3,3,...,9103,9232,9230,1109,1076,1268,826,950,890,0
1749,24,2,2,3,13,3,2,2,2,2,...,122,3685,16391,6212,7530,1,4656,6712,3577,0


In [None]:
# Below I am fixing the problem with dask-based one-hot encoding & and parallelization with time

In [11]:
import pandas as pd, numpy as np 
import dask.array as da, dask.dataframe as dd

c1 = da.from_array(np.arange(100000, 190000), chunks=1000)
c2 = da.from_array(np.arange(200000, 290000), chunks=1000)
c3 = da.from_array(np.arange(300000, 390000), chunks=1000)

# generate dask dataframe
ddf = dd.concat([dd.from_dask_array(c) for c in [c1,c2,c3]], axis = 1) 
# name columns
ddf.columns = ['c1', 'c2', 'c3']

In [12]:
import dask.array as da, dask.dataframe as dd

In [13]:
c1

Unnamed: 0,Array,Chunk
Bytes,720.00 kB,8.00 kB
Shape,"(90000,)","(1000,)"
Count,90 Tasks,90 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 720.00 kB 8.00 kB Shape (90000,) (1000,) Count 90 Tasks 90 Chunks Type int64 numpy.ndarray",90000  1,

Unnamed: 0,Array,Chunk
Bytes,720.00 kB,8.00 kB
Shape,"(90000,)","(1000,)"
Count,90 Tasks,90 Chunks
Type,int64,numpy.ndarray


In [14]:
ddf['c1'].unique().compute()

0        100000
1        100001
2        100002
3        100003
4        100004
          ...  
89995    189995
89996    189996
89997    189997
89998    189998
89999    189999
Name: c1, Length: 90000, dtype: int64

In [15]:
X= dict_['datasets'][N_D]
X = X.iloc[:5,:5]
print(X.shape)
X = dd.from_pandas(X, 2)

(5, 5)


In [276]:
ohe=OneHotEncoder()
X_cat= ohe.fit_transform(X[['SEX']].to_dask_array(lengths=True).reshape(-1,1))
X_cat = X_cat.rechunk()
X_cat

Unnamed: 0,Array,Chunk
Shape,"(5, 2)","(5, 2)"
Count,13 Tasks,1 Chunks
Type,float64,scipy.csr_matrix
"Array Chunk Shape (5, 2) (5, 2) Count 13 Tasks 1 Chunks Type float64 scipy.csr_matrix",2  5,

Unnamed: 0,Array,Chunk
Shape,"(5, 2)","(5, 2)"
Count,13 Tasks,1 Chunks
Type,float64,scipy.csr_matrix


In [277]:
X_cat

Unnamed: 0,Array,Chunk
Shape,"(5, 2)","(5, 2)"
Count,13 Tasks,1 Chunks
Type,float64,scipy.csr_matrix
"Array Chunk Shape (5, 2) (5, 2) Count 13 Tasks 1 Chunks Type float64 scipy.csr_matrix",2  5,

Unnamed: 0,Array,Chunk
Shape,"(5, 2)","(5, 2)"
Count,13 Tasks,1 Chunks
Type,float64,scipy.csr_matrix


In [283]:
X_cat.compute()

<5x2 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [282]:
dd.from_dask_array(X_cat)[0].compute()

ValueError: Shape of passed values is (5, 1), indices imply (5, 2)

In [256]:
ohe.categories_

[array([1, 2])]

In [252]:
dd.io.from_dask_array(X_cat).compute()

ValueError: Shape of passed values is (5, 1), indices imply (5, 2)

In [259]:
dd.concat([X, X_cat], axis = 1)

TypeError: cannot concatenate object of type '<class 'scipy.sparse.csr.csr_matrix'>'; only Series and DataFrame objs are valid

In [249]:
X_cat[0].unique()

AttributeError: 'Array' object has no attribute 'unique'

In [239]:
X_cat.compute()

ValueError: Shape of passed values is (5, 1), indices imply (5, 2)

In [198]:
XX = dd.concat([X, X_cat])#, ignore_unknown_divisions=True)
XX

Unnamed: 0_level_0,0,1,EDUCATION,ID,LIMIT_BAL
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,float64,float64,float64,float64,float64
,...,...,...,...,...
,...,...,...,...,...
,...,...,...,...,...
,...,...,...,...,...


In [199]:
XX.compute()

ValueError: Shape of passed values is (4, 1), indices imply (4, 2)

In [188]:
type(X)

dask.dataframe.core.DataFrame

In [189]:
X['EDUCATION'].compute()

ValueError: Shape of passed values is (6, 1), indices imply (6, 2)

In [149]:
X[0].ipynb_checkpoints/unique().compute()

ValueError: Shape of passed values is (6, 1), indices imply (6, 2)

In [118]:
fencoding.initialize_types(X, return_dtype=False)

2
0 float64
X[column].unique(): Dask Series Structure:
npartitions=1
    float64
        ...
Name: 0, dtype: float64
Dask Name: unique-agg, 45 tasks


ValueError: Shape of passed values is (6, 1), indices imply (6, 2)

In [115]:
X[column].dtype

dask.dataframe.core.DataFrame

In [None]:
if type(X) == pd.core.frame.DataFrame:
    X = dd.from_pandas(X, npartitions=self.n_gpus)       
self.categor_columns, self.numer_columns, self.time_columns = [], [], []
# Sometimes categorical feature can be presented with a float type. Let's check for that
f_columns_names =[x for x in list(X.columns)  if x not in self.rest_col_names + self.y_names]
print(2)
for column in f_columns_names:
    c_type = str(X[column].dtype) 
    print(column, c_type)
    if any(c_type == t for t in self.numer_types):
        print('X[column].unique():',X[column].unique())
        print('X[column].unique().compute():',X[column].unique().compute())
        print('/////')
        print('X[column].unique().compute():', X[column].unique().compute())
        uvs = cp.array(X[column].unique().compute())
        unique_values = list(uvs[~cp.isnan(uvs)])
        print(3)

In [None]:
def date_replace_(self, X):
    for column in X.columns:
        x = pars_date(X[column])
        try: 
            x.nunique()
            X[column] = x
        except AttributeError:
            pass
    return X

In [76]:
X = pd.DataFrame([
'Jan 19, 1990',
'January 19, 1990',
'Jan 19,1990',
'01/19/1990',
'01/19/90',
'1990',
'Jan 1990',
'01.02.2000',
'2000.02.01',
'01-02-2000',
'2111-01-01 12:48:20',
'123',
'abs 123', 
1339521878.04,
'1339521878.04'
]).T
X = X.append(X)
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,"Jan 19, 1990","January 19, 1990","Jan 19,1990",01/19/1990,01/19/90,1990,Jan 1990,01.02.2000,2000.02.01,01-02-2000,2111-01-01 12:48:20,123,abs 123,1339520000.0,1339521878.04
0,"Jan 19, 1990","January 19, 1990","Jan 19,1990",01/19/1990,01/19/90,1990,Jan 1990,01.02.2000,2000.02.01,01-02-2000,2111-01-01 12:48:20,123,abs 123,1339520000.0,1339521878.04


In [77]:
import datetime as dt

In [72]:
def pars_date(x):
    fmts = ('%Y', '%b %d, %Y','%b %d, %Y','%B %d, %Y','%B %d %Y','%m/%d/%Y','%m/%d/%y','%b %Y','%B%Y','%b %d,%Y', 
              '%d.%m.%Y', '%Y.%m.%d', '%d-%m-%Y', '%Y-%m-%d %H:%M:%S')
    t = True
    if str(x.dtype) == 'object':
        for fmt in fmts:
            try:
                return pd.Series([dt.datetime.strptime(str(x.iloc[i]), fmt) for i in range(len(x))]).apply(lambda q: q.strftime('%m/%d/%Y')).astype('datetime64[ns]')
                t = False
                break 
            except ValueError:
                pass
    if t and (len(str(x.iloc[0])) > 9) and (len(str(x.iloc[0])) <= 14): 
    # TODO: better condition on string to identify that it is unix timestep
        try:
            x = x.astype('float')
            return pd.Series([dt.datetime.fromtimestamp(x.iloc[i]) for i in range(len(x))]).apply(lambda q: q.strftime('%m/%d/%Y')).astype('datetime64[ns]')
        except ValueError:
            pass

In [73]:
    fmts = ('%Y', '%b %d, %Y','%b %d, %Y','%B %d, %Y','%B %d %Y','%m/%d/%Y','%m/%d/%y','%b %Y','%B%Y','%b %d,%Y', 
              '%d.%m.%Y', '%Y.%m.%d', '%d-%m-%Y', '%Y-%m-%d %H:%M:%S')

In [74]:
x = ddX[0]
x

Dask Series Structure:
npartitions=1
0    object
0       ...
Name: 0, dtype: object
Dask Name: getitem, 2 tasks

In [75]:
x[0]

NotImplementedError: Series getitem in only supported for other series objects with matching partition structure

In [394]:
x = x.compute()

In [402]:
for fmt in fmts:
    try:
        print(dt.datetime.strptime(str(x.iloc[0]), fmt))
    except ValueError:
        pass

1990-01-19 00:00:00
1990-01-19 00:00:00


In [401]:
a

[datetime.datetime(1990, 1, 19, 0, 0), datetime.datetime(1990, 1, 19, 0, 0)]

In [None]:
pd.Series([dt.datetime.strptime(str(x.iloc[i]), fmt) for i in range(len(x))]).apply(lambda q: q.strftime('%m/%d/%Y')).astype('datetime64[ns]')


In [387]:
pars_date(X[0])

AttributeError: 'Series' object has no attribute 'iloc'

In [45]:
X = pd.DataFrame(
    [
        [1,2.2,5.4, 0, 'a','wd'],
        [0.2, 5,8, 9.2, 'a', 'ddd'],
        [-3,4,6,2,'q', 'a'],
        [-3,4,5,6,'w', 'a'],
        [None,None,5,6,'w', 'wd']
    
    ]
)
X.columns = ['a', '1', '2', '3', '4', '5']
X

Unnamed: 0,a,1,2,3,4,5
0,1.0,2.2,5.4,0.0,a,wd
1,0.2,5.0,8.0,9.2,a,ddd
2,-3.0,4.0,6.0,2.0,q,a
3,-3.0,4.0,5.0,6.0,w,a
4,,,5.0,6.0,w,wd


In [46]:
dataset = dd.from_pandas(X, npartitions=2) 
dataset

Unnamed: 0_level_0,a,1,2,3,4,5
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,float64,float64,float64,float64,object,object
3,...,...,...,...,...,...
4,...,...,...,...,...,...


In [52]:
dataset[['1']].append(dataset[['2']])

Unnamed: 0_level_0,1,2
npartitions=8,Unnamed: 1_level_1,Unnamed: 2_level_1
,float64,float64
,...,...
...,...,...
,...,...
,...,...


In [50]:
dataset = dataset.append(dataset[['1']])
dataset

Unnamed: 0_level_0,a,1,2,3,4,5
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,float64,float64,float64,float64,object,object
,...,...,...,...,...,...
,...,...,...,...,...,...
,...,...,...,...,...,...
,...,...,...,...,...,...


In [102]:
new = OneHotEncoder().fit_transform(dataset[['4','5']].to_dask_array(lengths=True))
new

Unnamed: 0,Array,Chunk
Bytes,240 B,72 B
Shape,"(5, 6)","(3, 3)"
Count,22 Tasks,4 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 240 B 72 B Shape (5, 6) (3, 3) Count 22 Tasks 4 Chunks Type float64 numpy.ndarray",6  5,

Unnamed: 0,Array,Chunk
Bytes,240 B,72 B
Shape,"(5, 6)","(3, 3)"
Count,22 Tasks,4 Chunks
Type,float64,numpy.ndarray


In [None]:
new

In [103]:
dataset = dataset.drop(['4','5'], axis=1)

In [104]:
dataset

Unnamed: 0_level_0,a,1,2,3
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,float64,float64,float64,float64
3,...,...,...,...
4,...,...,...,...


In [108]:
dd.from_array(new)

Unnamed: 0_level_0,0,1,2,3,4,5
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,float64,float64,float64,float64,float64,float64
3,...,...,...,...,...,...
4,...,...,...,...,...,...


In [109]:
dataset.append(dd.from_array(new))

Unnamed: 0_level_0,0,1,2,3,4,5,1,2,3,a
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...


In [None]:

    print(OneHotEncoder().fit_transform(X[self.categor_columns].to_dask_array(lengths=True)))
    #lengths=True - chunk sizes can be computed
    X = X.drop(self.categor_columns, axis=1)
    X

In [78]:
dataset = dd.from_pandas(X, npartitions=2) 
dataset

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object
0,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [80]:
dataset = dataset.drop([0,2], axis=1).compute()

In [81]:
dataset

Unnamed: 0,1,3,4,5,6,7,8,9,10,11,12,13,14
0,"January 19, 1990",01/19/1990,01/19/90,1990,Jan 1990,01.02.2000,2000.02.01,01-02-2000,2111-01-01 12:48:20,123,abs 123,1339520000.0,1339521878.04
0,"January 19, 1990",01/19/1990,01/19/90,1990,Jan 1990,01.02.2000,2000.02.01,01-02-2000,2111-01-01 12:48:20,123,abs 123,1339520000.0,1339521878.04
