# data-utils tests
---

Experimenting with data-utils methods to check if everything is working properly.

## Importing the necessary packages

In [None]:
import dask.dataframe as dd                # Dask to handle big data in dataframes
import pandas as pd                        # Pandas to load the data initially
from dask.distributed import Client        # Dask scheduler
import os                                  # os handles directory/workspace changes
import numpy as np                         # NumPy to handle numeric and NaN operations
import torch                               # PyTorch to create and apply deep learning models
import data_utils as du                    # Data science and machine learning relevant methods

In [None]:
# Inspect the random seed currently stored in data-utils, before it is set explicitly in the next cell
du.random_seed

In [None]:
# Fix the random seed to 42 for reproducibility
# NOTE(review): presumably this seeds the libraries data-utils relies on — confirm in data_utils
du.set_random_seed(42)

In [None]:
# Inspect the stored seed again; it is expected to reflect the value passed to set_random_seed
du.random_seed

In [None]:
# Debugging packages
import pixiedust                           # Debugging in Jupyter Notebook cells

In [None]:
# Move two directory levels up from the current working directory (presumably to "Documents")
os.chdir('../..')

# Path to the CSV dataset files, relative to the new working directory
data_path = 'Datasets/Others/digidb/'

In [None]:
# Set up local cluster
client = Client()
# Display the client in the cell output
client

In [None]:
# Run os.getcwd on the cluster's workers to check which working directory they are using
client.run(os.getcwd)

## Loading data

Opting to test with a [Digimon dataset from Kaggle](https://www.kaggle.com/rtatman/digidb/data#), for nostalgia purposes 🙂

In [None]:
# Read the Digimon list with Dask, declaring the text columns as strings
df = dd.read_csv(
    f'{data_path}DigiDB_digimonlist.csv',
    dtype=dict.fromkeys(['Digimon', 'Stage', 'Type', 'Attribute'], str),
)
# Preview the first rows
df.head()

Also creating artificial data for other tests:

In [None]:
# Small artificial dataframe: `Var0` mixes plain integers with
# semicolon-separated strings of integers
data_df = pd.DataFrame({
    'id': [103, 103, 104, 105, 105],
    'ts': [0, 1, 0, 0, 1],
    'Var0': ['1;2', 3, 4, '3;1;2;4', '4;1;3'],
})
data_df

### Encoding categorical features

In [None]:
# Show the data type of each column, to see which ones are categorical (object) features
df.dtypes

In [None]:
# Check that the `Stage` column is detected as an object (text) column
df['Stage'].dtype == 'object'

In [None]:
# Categorical column used in the encoding experiments of the following cells
feature = 'Attribute'

In [None]:
# Clean the column's string values to have the same, standard format
# NOTE: the result is only stored in tmp_df for inspection; df itself is not reassigned here
tmp_df = du.data_processing.clean_naming(df, feature)
tmp_df.head()

In [None]:
# Get the unique values of the categorical feature
unique_values = df[feature].unique()
if hasattr(unique_values, 'compute'):
    # The result is lazy (e.g. a Dask collection): materialize it before enumerating.
    # Checking the result itself avoids matching on the dataframe type's string representation.
    unique_values = unique_values.compute()
# Enumerate the unique values in the categorical feature and put them in a dictionary
enum_dict = du.embedding.create_enum_dict(unique_values)

In [None]:
# Display the enumeration dictionary built from the feature's unique values
enum_dict

In [None]:
def _encode_category(value):
    # Convert a single category through the enumeration dictionary (called with nan_value=0)
    return du.utils.apply_dict_convertion(value, enum_dict, nan_value=0)


# Create a series from the enumerations of the original feature's categories;
# `meta` declares the name and dtype of the output series
enum_series = df[feature].map(_encode_category, meta=('x', int))
enum_series.head()

In [None]:
# Materialize the Dask dataframe, so the following cells work on the computed result
df = df.compute()

In [None]:
# [TODO] Sometimes in Dask, and only sometimes, for some reason this doesn't work,
# as it considers that 'Attribute' column has the same unique values as 'Type' ¯\_(ツ)_/¯
# Try to find out what's wrong
feat_enum = {}
for feature in df.columns:
    if df[feature].dtype != 'object':
        # Only object (text) columns are enumerated
        continue
    # Prepare for embedding, i.e. enumerate categories;
    # keep both the encoded column and its enumeration dictionary
    df[feature], feat_enum[feature] = du.embedding.enum_categorical_feature(df, feature)
df.head()

In [None]:
# Display the enumeration dictionaries collected for each categorical column
feat_enum

### Create a dataset object:

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Characterizes a dataset for PyTorch.

    Splits a dataframe into a features array `X` and a labels array `y`.

    Parameters
    ----------
    df : dataframe
        Data to wrap; it must expose `columns` and `to_numpy()`.
    label_name : optional
        Name of the label column. Whatever its value, the first column whose
        name contains 'label' or equals `label_name` is used as the label.

    Raises
    ------
    ValueError
        If no column contains 'label' or matches `label_name`.
    """

    def __init__(self, df, label_name=None):
        # Column number corresponding to the label (first matching column)
        label_column = next(
            (col_num for col_num, col in enumerate(df.columns)
             if self._is_label(col, label_name)),
            None,
        )
        if label_column is None:
            # Fail early with a clear message instead of an AttributeError further down
            raise ValueError(
                f"No label column found: no column name contains 'label' "
                f"or equals label_name={label_name!r}"
            )
        self.label_column = label_column
        # Convert the data into a NumPy array
        data = df.to_numpy()
        # Column numbers corresponding to the features (every column but the label)
        self.features_columns = [col_num for col_num in range(data.shape[1])
                                 if col_num != self.label_column]
        # Features
        self.X = data[:, self.features_columns]
        # Labels
        self.y = data[:, self.label_column]

    @staticmethod
    def _is_label(col, label_name):
        """Tell whether column name `col` identifies the label column."""
        try:
            if 'label' in col:
                return True
        except TypeError:
            # Column names such as integers don't support the `in` test
            pass
        return col == label_name

    def __getitem__(self, item):
        """Return the (features, label) pair stored at index `item`."""
        return self.X[item], self.y[item]

    def __len__(self):
        """Return the number of samples (rows) in the dataset."""
        return len(self.X)

In [None]:
# Wrap the encoded dataframe in a Dataset, using the `Attribute` column as the label
dataset = Dataset(df, 'Attribute')

In [None]:
# Number of samples in the dataset
len(dataset)

In [None]:
# Features array (every column except the label)
dataset.X

In [None]:
# Labels array (the label column only)
dataset.y

In [None]:
# First (features, label) sample of the dataset
dataset[0]

## Testing data utils methods and their reproducibility

### Separating in different sets

In [None]:
# Split the dataset into training, validation and test dataloaders
# NOTE(review): get_indeces=False presumably skips returning the split indices — confirm in data_utils
train_dataloader, val_dataloader, test_dataloader = du.machine_learning.create_train_sets(dataset, get_indeces=False)

In [None]:
# Display the validation dataloader object
val_dataloader

In [None]:
# Fetch the first batch produced by the validation dataloader
next(iter(val_dataloader))

In [None]:
# Go through every validation batch and print its second element
# (presumably the labels, given that Dataset.__getitem__ returns (x, y) pairs)
for x, y in val_dataloader:
    print(y)

Everything seems to be working great! (At least with NumPy's random seed; this still needs to be confirmed with PyTorch.)

### Converting string encodings to numeric

In [None]:
# Convert the artificial dataframe's string encodings to numeric values
# (see du.embedding.string_encod_to_numeric for the exact conversion applied)
data_df_num = du.embedding.string_encod_to_numeric(data_df)
data_df_num

In [None]:
# Check the column data types after the conversion
data_df_num.dtypes