In [14]:
import pandas as pd
import numpy as np
import gc
import tqdm

from pathlib import Path
import pyarrow as pa
import pyarrow.parquet as pq

In [16]:
# CSV with the training feature rows.
TRAIN_PATH = '../input/amex-default-prediction/train_data.csv'
# CSV with the training labels.
TRAIN_LAB_PATH = '../input/amex-default-prediction/train_labels.csv'

# **Read File**

In [17]:
# Read a single row of the training CSV only to learn its column names.
# The ``with`` block closes the underlying file handle as soon as the header
# has been seen instead of leaving the chunked reader open.
with pd.read_csv(TRAIN_PATH, chunksize=1) as df:
    _header_columns = list(next(df).columns)

# Columns treated as categorical features.
cat = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

# Continuous features: every non-categorical column, kept in CSV column order
# so the list is identical from run to run (a set difference has no stable
# order), minus the customer identifier and the S_2 date column.
_cat_lookup = set(cat)
cont_vars = [col for col in _header_columns if col not in _cat_lookup]
cont_vars.remove('customer_ID')
cont_vars.remove('S_2')

In [18]:
def floorify(x, lo):
    """Snap ``x`` down to ``lo`` when it lies in the closed band [lo, lo + 0.01].

    Any other value (including NaN, for which both comparisons are false)
    is returned unchanged.
    """
    band_top = lo + 0.01
    if lo <= x <= band_top:
        return lo
    return x

In [19]:
def floorify_zeros(x):
    """Snap values in [0, 0.01] to 0.0 when the band around zero is clean.

    The input is returned untouched (same object) when either
      * some value lies just outside the band, i.e. in [-0.01, 0) or in
        (0.01, 0.02], or
      * no value lies inside [0, 0.01] at all.
    Otherwise a new list is returned in which every value has been passed
    through ``floorify(value, 0.0)``.
    """
    in_band = sum(1 for v in x if 0 <= v <= 0.01)
    just_below = sum(1 for v in x if -0.01 <= v < 0)
    # The upper neighbourhood only needs scanning when the lower one is empty.
    crowded = just_below > 0 or sum(1 for v in x if 0.01 < v <= 0.02) > 0
    if crowded or in_band == 0:
        return x
    return [floorify(v, 0.0) for v in x]

In [20]:
def floorify_ones(x):
    """Snap values in [1, 1.01] to 1.0 when the band around one is clean.

    The input is returned untouched (same object) when either
      * some value lies just outside the band, i.e. in [0.99, 1) or in
        (1.01, 1.02], or
      * no value lies inside [1, 1.01] at all.
    Otherwise a new list is returned in which every value has been passed
    through ``floorify(value, 1.0)``.
    """
    in_band = sum(1 for v in x if 1 <= v <= 1.01)
    just_below = sum(1 for v in x if 0.99 <= v < 1)
    # The upper neighbourhood only needs scanning when the lower one is empty.
    crowded = just_below > 0 or sum(1 for v in x if 1.01 < v <= 1.02) > 0
    if crowded or in_band == 0:
        return x
    return [floorify(v, 1.0) for v in x]

In [21]:
def convert_na(x):
    """Replace NaN entries by -1 when -1 cannot clash with a real value.

    Parameters
    ----------
    x : array-like of numbers, possibly containing NaN.

    Returns
    -------
    A new list with every NaN replaced by -1 when the smallest non-NaN value
    is >= 0. In every other case (empty input, a negative minimum, or only
    NaN values) ``x`` itself is returned unchanged, so callers always get a
    sequence back rather than an implicit ``None``.
    """
    # np.nanmin raises on zero-size input, so short-circuit that case.
    if np.size(x) == 0:
        return x
    if np.nanmin(x) >= 0:
        return [-1 if np.isnan(t) else t for t in x]
    # Negative values present (or nothing but NaN): -1 is not a safe
    # sentinel here, so hand the data back untouched.
    return x

In [22]:
def convert_to_int(x):
    """Return ``x`` as a list of ``np.int8`` when it only holds -1, 0 or 1.

    The values are first passed through ``convert_na``. If the distinct
    values of that result all belong to {-1, 0, 1}, a list of ``np.int8``
    built from it is returned; otherwise the original ``x`` is returned
    unchanged.
    """
    filled = convert_na(x)
    allowed = {-1, 0, 1}
    distinct = set(np.unique(filled))
    # Anything outside the allowed trio makes the union grow past it.
    if distinct | allowed != allowed:
        return x
    return [np.int8(v) for v in filled]

In [23]:
def floorify_frac(x, interval=1):
    """Bin ``x`` into integer multiples of ``interval`` and downcast.

    Each value is divided by ``interval``, nudged up by 1e-6 and floored;
    missing values become -1. ``x`` must support ``.fillna`` on the floored
    result (e.g. a pandas Series). The result is cast to ``np.int8`` when
    its maximum is at most 127, otherwise to ``np.int16``.
    """
    binned = np.floor(x / interval + 1e-6).fillna(-1)
    target_dtype = np.int8 if np.max(binned) <= 127 else np.int16
    return binned.astype(target_dtype)

In [24]:
def floorify_ones_and_zeros(t):
    """Run ``t`` through the zero snap, the one snap and the int conversion.

    The three helpers are applied in that order, each receiving the output
    of the previous one; the final result is returned.
    """
    for step in (floorify_zeros, floorify_ones, convert_to_int):
        t = step(t)
    return t

In [25]:
def convert_train_parquet(input_path, output_path, chunksize = 15000):
    """Stream a training CSV into one snappy-compressed Parquet file.

    The CSV is read ``chunksize`` rows at a time. Each chunk is
      1. cast to float32 on the columns listed in the module-level
         ``cont_vars``,
      2. merged with the labels read from ``TRAIN_LAB_PATH`` on
         ``customer_ID`` (default inner merge, so rows without a matching
         label are dropped),
      3. given an integer ``customer_ID`` parsed from the last 16 hex
         characters of the original string,
      4. integer-encoded on its categorical columns via ``floorify_frac`` /
         ``convert_to_int`` and two fixed string-to-code tables,
      5. given a datetime ``S_2`` column,
      6. appended to the Parquet file.

    Parameters
    ----------
    input_path : path of the CSV file to convert.
    output_path : path of the Parquet file to create.
    chunksize : number of CSV rows processed per chunk.

    Notes
    -----
    The Parquet schema is taken from the first chunk. The writer is closed in
    a ``finally`` block so that the file handle is released even when a chunk
    fails part-way through the conversion.
    """
    train_label_tmp = pd.read_csv(TRAIN_LAB_PATH)

    # (column, offset) pairs: the column is binned with floorify_frac, the
    # offset is added to the binned codes, then convert_to_int is applied.
    numeric_cat_offsets = (
        ('B_30', 0), ('B_38', 0), ('D_66', 0), ('D_68', 0), ('D_114', 0),
        ('D_116', 0), ('D_117', 1), ('D_120', 0), ('D_126', 1),
    )
    # String categories -> integer codes. Built once instead of once per row.
    # An unexpected category raises KeyError, as a plain dict lookup does.
    d63_codes = {'CR':0, 'XZ':1, 'XM':2, 'CO':3, 'CL':4, 'XL':5}
    d64_codes = {np.nan:-1, 'O':0, '-1':1, 'R':2, 'U':3}

    pq_writer = None
    try:
        for idx, df_chunk in enumerate(pd.read_csv(input_path, chunksize=chunksize)):
            print(f"id: {idx} Chunk size {df_chunk.shape}")

            df_chunk[cont_vars] = df_chunk[cont_vars].astype('float32')

            # merge a new column 'target'
            df_chunk = df_chunk.merge(train_label_tmp, left_on='customer_ID', right_on='customer_ID')
            # Keep only the last 16 hex characters of the id, parsed base-16.
            # NOTE(review): 16 hex digits can exceed the signed 64-bit range;
            # confirm the int64 cast behaves as intended for such ids.
            df_chunk.customer_ID = df_chunk.customer_ID.apply(lambda x: int(x[-16:],16)).astype('int64')

            # Convert categorical data to int.
            for col, offset in numeric_cat_offsets:
                codes = floorify_frac(df_chunk[col])
                if offset:
                    codes = codes + offset
                df_chunk[col] = convert_to_int(codes)
            df_chunk['D_63'] = df_chunk['D_63'].apply(lambda t: d63_codes[t]).astype(np.int8)
            df_chunk['D_64'] = df_chunk['D_64'].apply(lambda t: d64_codes[t]).astype(np.int8)

            df_chunk['S_2'] = pd.to_datetime(df_chunk['S_2'])

            table = pa.Table.from_pandas(df_chunk)
            # The writer is created lazily because it needs the schema of
            # the first converted chunk.
            if pq_writer is None:
                pq_writer = pq.ParquetWriter(output_path, table.schema, compression = 'snappy')

            pq_writer.write_table(table)

            # Removing current chunk from memory to free up memory
            del df_chunk
            del table
            gc.collect()
    finally:
        # Always release the output file, including when a chunk raised.
        if pq_writer is not None:
            pq_writer.close()
        gc.collect()

In [26]:
# Convert the training CSV; the Parquet output lands in the working directory.
convert_train_parquet(
    input_path=TRAIN_PATH,
    output_path='./train.parquet',
)

In [None]:
# Store the labels as Parquet too, with customer_ID replaced by the integer
# parsed from the last 16 hex characters of the original id string.
train_label_df = pd.read_csv(TRAIN_LAB_PATH)
train_label_df['customer_ID'] = (
    train_label_df['customer_ID']
    .apply(lambda cid: int(cid[-16:], 16))
    .astype('int64')
)
train_label_df.to_parquet('./train_label.parquet')