In [1]:
import warnings
warnings.simplefilter('ignore')

import dask
import dask.dataframe as dd
import pandas as pd
import numpy as np
import os,gc

from tqdm import tqdm
from dotenv import dotenv_values
from dask_ml.decomposition import PCA
from dask_ml.preprocessing import StandardScaler

In [2]:
# Read the project settings (e.g. the WRANGLED_DATA location used below) from the .env file one level up.
config = dotenv_values(dotenv_path="../.env")

In [3]:
# Load the three wrangled train feature tables, join them on customer_ID and
# replace missing values with 0.
# os.path.join keeps the paths valid whether or not WRANGLED_DATA ends with a
# path separator (plain string concatenation silently builds a wrong path otherwise).
# NOTE(review): dask documents read_parquet's `chunksize` as a per-partition byte-size
# target (and as deprecated) -- confirm that 512 is really the intended value.
cat_ddf = dd.read_parquet(
    os.path.join(config["WRANGLED_DATA"], "train_cat_feature.parquet"), chunksize=512
)
num_ddf = dd.read_parquet(
    os.path.join(config["WRANGLED_DATA"], "train_num_feature.parquet"), chunksize=512
)
diff_ddf = dd.read_parquet(
    os.path.join(config["WRANGLED_DATA"], "train_diff_feature.parquet"), chunksize=512
)
train_ddf = (
    cat_ddf
    .merge(num_ddf, on="customer_ID")
    .merge(diff_ddf, on="customer_ID")
    .fillna(0)
)
# Column count of the merged train frame.
train_ddf.shape[1]

2332

In [4]:
# Load the three wrangled test feature tables, join them on customer_ID and
# replace missing values with 0. The cat/num/diff names are deliberately reused
# from the train cell, so they now refer to the test tables.
# os.path.join keeps the paths valid whether or not WRANGLED_DATA ends with a
# path separator (plain string concatenation silently builds a wrong path otherwise).
# NOTE(review): dask documents read_parquet's `chunksize` as a per-partition byte-size
# target (and as deprecated) -- confirm that 1024 is really the intended value.
cat_ddf = dd.read_parquet(
    os.path.join(config["WRANGLED_DATA"], "test_cat_feature.parquet"), chunksize=1024
)
num_ddf = dd.read_parquet(
    os.path.join(config["WRANGLED_DATA"], "test_num_feature.parquet"), chunksize=1024
)
diff_ddf = dd.read_parquet(
    os.path.join(config["WRANGLED_DATA"], "test_diff_feature.parquet"), chunksize=1024
)
test_ddf = (
    cat_ddf
    .merge(num_ddf, on="customer_ID")
    .merge(diff_ddf, on="customer_ID")
    .fillna(0)
)
# Column count of the merged test frame.
test_ddf.shape[1]

2320

In [5]:
# Columns that exist in only one of the two frames: train-only names first
# (in train column order), followed by test-only names (in test column order).
res = (
    [col for col in train_ddf.columns if col not in test_ddf.columns]
    + [col for col in test_ddf.columns if col not in train_ddf.columns]
)
res

['oneHot_D_64_1_mean',
 'oneHot_D_64_1_std',
 'oneHot_D_64_1_sum',
 'oneHot_D_64_1_last',
 'oneHot_D_66_0.0_mean',
 'oneHot_D_66_0.0_std',
 'oneHot_D_66_0.0_sum',
 'oneHot_D_66_0.0_last',
 'oneHot_D_68_0.0_mean',
 'oneHot_D_68_0.0_std',
 'oneHot_D_68_0.0_sum',
 'oneHot_D_68_0.0_last']

In [6]:
# Remove the mismatched columns from the train frame so it lines up with test.
# `res` is a symmetric difference, so it may also hold test-only names; dropping
# only the names actually present in train avoids a KeyError in that case and
# makes the cell safe to re-run after the columns are already gone.
train_ddf = train_ddf.drop([col for col in res if col in train_ddf.columns], axis=1)
len(train_ddf.columns)

2320

In [7]:
# Feature columns fed to the scaler/PCA: everything except the customer key
# and (if present) the label column.
all_cols = [
    name
    for name in train_ddf.columns
    if name not in ("customer_ID", "target")
]
len(all_cols)

2319

In [8]:
# Drop the intermediate per-table frames now that the merged frames exist,
# then ask the garbage collector to reclaim whatever became unreachable.
del cat_ddf
del num_ddf
del diff_ddf
gc.collect()

0

In [9]:
# Standardising scaler with its default settings spelled out; it is fitted on
# train in a later cell and reused unchanged for test.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
scaler

In [10]:
# Fit the scaler on the train feature columns and standardise them.
# NOTE(review): this rebinds train_ddf to the scaled features only, so the cell is
# not idempotent, and customer_ID (excluded from all_cols) is not carried into the CSV.
train_ddf = scaler.fit_transform(train_ddf[all_cols])
# Write the scaled features as CSV under <WRANGLED_DATA>/scaled_train.
# os.path.join keeps the target valid even if WRANGLED_DATA has no trailing separator.
train_ddf.to_csv(
    os.path.join(config["WRANGLED_DATA"], "scaled_train"),
    name_function=lambda x: f"train-{x}.csv",
    index=False,
)

['D:\\datasets\\amex-default-prediction\\wrangled_data\\scaled_train\\train-0.csv.part']

In [11]:
# Reduce the scaled train features to 128 whitened principal components.
# The randomized SVD solver is stochastic; pinning random_state makes the fitted
# components (and everything derived from them) reproducible across re-runs.
pca = PCA(n_components=128, svd_solver='randomized', whiten=True, random_state=42)
# lengths=True makes dask compute the partition lengths so the array has known chunk sizes.
pca.fit(train_ddf.to_dask_array(lengths=True))

In [12]:
# Project the scaled train features onto the fitted components and convert the
# resulting dask array back into a dask dataframe.
# NOTE(review): this rebinds train_ddf to the PCA output, so earlier cells that
# expect the original columns cannot be re-run after this one.
train_ddf = (
    pca
    .transform(train_ddf.to_dask_array(lengths=True))
    .to_dask_dataframe()
)
# Write the components as CSV under <WRANGLED_DATA>/train_pca.
# os.path.join keeps the target valid even if WRANGLED_DATA has no trailing separator.
train_ddf.to_csv(
    os.path.join(config["WRANGLED_DATA"], "train_pca"),
    name_function=lambda x: f"train-pca-{x}.csv",
    index=False,
)

['D:\\datasets\\amex-default-prediction\\wrangled_data\\train_pca\\train-pca-0.csv.part']

In [13]:
# Apply the scaler fitted on train to the same feature columns of the test frame.
# NOTE(review): in the saved run this cell raised while to_csv was computing the
# graph; the recorded traceback is truncated, so the root cause is not visible here.
test_ddf = scaler.transform(test_ddf[all_cols])
# Write the scaled features as CSV under <WRANGLED_DATA>/scaled_test.
# os.path.join keeps the target valid even if WRANGLED_DATA has no trailing separator.
test_ddf.to_csv(
    os.path.join(config["WRANGLED_DATA"], "scaled_test"),
    name_function=lambda x: f"test-{x}.csv",
    index=False,
)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\Kaleb\anaconda3\envs\kaggle\lib\site-packages\IPython\core\interactiveshell.py", line 3398, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Kaleb\AppData\Local\Temp\ipykernel_23620\3682337358.py", line 2, in <cell line: 2>
    test_ddf.to_csv(config["WRANGLED_DATA"] + "scaled_test", name_function=lambda x: f"test-{x}.csv", index=False)
  File "C:\Users\Kaleb\AppData\Roaming\Python\Python310\site-packages\dask\dataframe\core.py", line 1691, in to_csv
    return to_csv(self, filename, **kwargs)
  File "C:\Users\Kaleb\AppData\Roaming\Python\Python310\site-packages\dask\dataframe\io\csv.py", line 972, in to_csv
    return list(dask.compute(*values, **compute_kwargs))
  File "C:\Users\Kaleb\AppData\Roaming\Python\Python310\site-packages\dask\base.py", line 603, in compute
    results = schedule(dsk, keys, **kwargs)
  File "C:\Users\Kaleb\AppData\Roaming\Python\Python310\site-packages\dask\threaded.py", li

In [None]:
# Project the scaled test features onto the components fitted on train and
# convert the resulting dask array back into a dask dataframe.
test_ddf = (
    pca
    .transform(test_ddf.to_dask_array(lengths=True))
    .to_dask_dataframe()
)
# Write the components as CSV under <WRANGLED_DATA>/test_pca.
# os.path.join keeps the target valid even if WRANGLED_DATA has no trailing separator.
test_ddf.to_csv(
    os.path.join(config["WRANGLED_DATA"], "test_pca"),
    name_function=lambda x: f"test-pca-{x}.csv",
    index=False,
)