In [1]:
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from dotenv import dotenv_values

import dask
import dask.dataframe as dd

In [2]:
config = dotenv_values("../.env")

In [3]:
def denoise(df):
    df['D_63'] = df['D_63'].apply(lambda t: {'CR':0, 'XZ':1, 'XM':2, 'CO':3, 'CL':4, 'XL':5}[t],
                                  meta=pd.DataFrame).astype(np.int8)
    df['D_64'] = df['D_64'].apply(lambda t: {np.nan:-1, 'O':0, '-1':1, 'R':2, 'U':3}[t],
                                  meta=pd.DataFrame).astype(np.int8)
    for col in tqdm(df.columns):
        if col not in ['customer_ID','S_2','D_63','D_64']:
            df[col] = np.floor(df[col]*100).astype(np.float32)
    return df

In [4]:
train = dd.read_csv(config["TRAIN_PATH"],blocksize=32e6)
train.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0.001733,0.008724,1.006838,0.009228,0.124035,0.008771,0.004709,...,,,,0.002427,0.003706,0.003818,,0.000569,0.00061,0.002674
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0.005775,0.004923,1.000653,0.006151,0.12675,0.000798,0.002714,...,,,,0.003954,0.003167,0.005032,,0.009576,0.005492,0.009217
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,0.091505,0.021655,1.009672,0.006815,0.123977,0.007598,0.009423,...,,,,0.003269,0.007329,0.000427,,0.003429,0.006986,0.002603
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0.002455,0.013683,1.0027,0.001373,0.117169,0.000685,0.005531,...,,,,0.006117,0.004516,0.0032,,0.008419,0.006527,0.0096
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0.002483,0.015193,1.000727,0.007605,0.117325,0.004653,0.009312,...,,,,0.003671,0.004946,0.008889,,0.00167,0.008126,0.009827


In [5]:
train = denoise(train)
train.head()

100%|████████████████████████████████████████████████████████████████████████████████| 190/190 [00:05<00:00, 36.74it/s]


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,93.0,0.0,0.0,100.0,0.0,12.0,0.0,0.0,...,,,,0.0,0.0,0.0,,0.0,0.0,0.0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,93.0,0.0,0.0,100.0,0.0,12.0,0.0,0.0,...,,,,0.0,0.0,0.0,,0.0,0.0,0.0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,95.0,9.0,2.0,100.0,0.0,12.0,0.0,0.0,...,,,,0.0,0.0,0.0,,0.0,0.0,0.0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,96.0,0.0,1.0,100.0,0.0,11.0,0.0,0.0,...,,,,0.0,0.0,0.0,,0.0,0.0,0.0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,94.0,0.0,1.0,100.0,0.0,11.0,0.0,0.0,...,,,,0.0,0.0,0.0,,0.0,0.0,0.0


In [6]:
train.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 190 entries, customer_ID to D_145
dtypes: object(2), float32(186), int8(2)

In [7]:
train.to_parquet(config["WRANGLED_DATA"] + "denoised_train/", 
             name_function=lambda x : f"train-{x}.parquet")

In [8]:
del train

In [9]:
test = dd.read_csv(config["TEST_PATH"],blocksize=32e6)
test.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-02-19,0.631315,0.001912,0.010728,0.814497,0.007547,0.168651,0.009971,0.002347,...,,,,,0.004669,,,,0.008281,
1,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-03-25,0.587042,0.005275,0.011026,0.810848,0.001817,0.241389,0.000166,0.009132,...,,,,0.000142,0.00494,0.009021,,0.003695,0.003753,0.00146
2,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-04-25,0.609056,0.003326,0.01639,1.00462,0.000114,0.266976,0.004196,0.004192,...,,,,7.4e-05,0.002114,0.004656,,0.003155,0.002156,0.006482
3,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-05-20,0.614911,0.009065,0.021672,0.816549,0.009722,0.188947,0.004123,0.015325,...,,,,0.004743,0.006392,0.00289,,0.006044,0.005206,0.007855
4,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-06-15,0.591673,0.238794,0.015923,0.810456,0.002026,0.180035,0.000731,0.011281,...,,,,0.008133,0.004329,0.008384,,0.001008,0.007421,0.009471


In [10]:
test = denoise(test)
test.head()

100%|████████████████████████████████████████████████████████████████████████████████| 190/190 [00:05<00:00, 33.09it/s]


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-02-19,63.0,0.0,1.0,81.0,0.0,16.0,0.0,0.0,...,,,,,0.0,,,,0.0,
1,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-03-25,58.0,0.0,1.0,81.0,0.0,24.0,0.0,0.0,...,,,,0.0,0.0,0.0,,0.0,0.0,0.0
2,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-04-25,60.0,0.0,1.0,100.0,0.0,26.0,0.0,0.0,...,,,,0.0,0.0,0.0,,0.0,0.0,0.0
3,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-05-20,61.0,0.0,2.0,81.0,0.0,18.0,0.0,1.0,...,,,,0.0,0.0,0.0,,0.0,0.0,0.0
4,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-06-15,59.0,23.0,1.0,81.0,0.0,18.0,0.0,1.0,...,,,,0.0,0.0,0.0,,0.0,0.0,0.0


In [11]:
test.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 190 entries, customer_ID to D_145
dtypes: object(2), float32(186), int8(2)

In [12]:
test.to_parquet(config["WRANGLED_DATA"] + "denoised_test/", 
             name_function=lambda x : f"test-{x}.parquet")

## train_data.csv
__16gb -> 512mb__

## test_data.csv
__33gb -> 1gb__