# Joining rows in very, very, very large lists of categories
---

## Import the necessary packages

In [1]:
import pandas as pd                        # Pandas to load the data initially
# import modin.pandas as pd                  # Optimized distributed version of Pandas
import numpy as np                         # Mathematical operations package, allowing also for missing values representation
import torch                               # PyTorch for tensor and deep learning operations
import data_utils as du                    # Data science and machine learning relevant methods



In [2]:
# Tell data_utils which dataframe library to work with; presumably this mirrors the
# pandas-vs-modin choice made in the imports cell — TODO confirm in data_utils
du.set_pandas_library('pandas')

Allow pandas to show more columns and rows:

In [3]:
# Raise both display limits to 1000 so that frames up to that size are rendered in full
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 1000)

In [4]:
import pixiedust                           # Debugging in Jupyter Notebook cells

Pixiedust database opened successfully


## Initialize variables

Data that we'll be using:

In [5]:
# Number of rows generated for each id: from a handful of rows up to one very large group
rows_per_id = {1: 25, 2: 17, 3: 56, 4: 138, 5: 2000, 6: 100000}
id_col = np.repeat(list(rows_per_id.keys()), list(rows_per_id.values()))
id_col

array([1, 1, 1, ..., 6, 6, 6])

In [6]:
# (timestamp, number of rows) segments; each line holds the segments of one id, and the
# counts on a line add up to that id's number of rows in `id_col`
ts_segments = [
    (0, 25),                                   # id 1
    (0, 17),                                   # id 2
    (0, 16), (1, 40),                          # id 3
    (0, 30), (1, 8), (2, 50), (3, 50),         # id 4
    (0, 1000), (1, 1000),                      # id 5
    (0, 100000),                               # id 6
]
ts_values, ts_counts = zip(*ts_segments)
ts_col = np.repeat(ts_values, ts_counts)
ts_col

array([0, 0, 0, ..., 0, 0, 0])

In [7]:
# Use a dedicated, seeded generator so that the random categories (and every output
# derived from them) are reproducible after a kernel restart
rng = np.random.default_rng(42)
# 102200 valid categories + 36 missing values = 102236 rows, the length of the id / ts columns
n_valid_categs, n_missing_categs = 102200, 36
categ_col = np.concatenate([rng.integers(0, 50, size=n_valid_categs),     # categories 0..49
                            np.repeat(np.nan, n_missing_categs)])          # missing values
rng.shuffle(categ_col)                     # Scatter the missing values across the rows
categ_col

array([44., 33., 27., ..., 48.,  7.,  8.])

In [8]:
# Put the three 1-D arrays side by side as the columns of one 2-D array (one row per sample)
data = np.stack([id_col, ts_col, categ_col], axis=1)
data

array([[ 1.,  0., 44.],
       [ 1.,  0., 33.],
       [ 1.,  0., 27.],
       ...,
       [ 6.,  0., 48.],
       [ 6.,  0.,  7.],
       [ 6.,  0.,  8.]])

In [9]:
# Wrap the raw array in a dataframe; the column names follow the order in which the arrays were stacked
data_df = pd.DataFrame(data, columns=['id', 'ts', 'categ_var'])
data_df

Unnamed: 0,id,ts,categ_var
0,1.0,0.0,44.0
1,1.0,0.0,33.0
2,1.0,0.0,27.0
3,1.0,0.0,1.0
4,1.0,0.0,33.0
...,...,...,...
102231,6.0,0.0,31.0
102232,6.0,0.0,34.0
102233,6.0,0.0,48.0
102234,6.0,0.0,7.0


In [10]:
# data_df.head(1000)

In [11]:
# Baseline memory footprint, before any dtype conversion
data_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102236 entries, 0 to 102235
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   id         102236 non-null  float64
 1   ts         102236 non-null  float64
 2   categ_var  102200 non-null  float64
dtypes: float64(3)
memory usage: 2.3 MB


In [12]:
# data_df = data_df.astype({'a': 'object'})

In [13]:
# Compact target dtypes: the capitalized names are pandas' nullable integer types
# ('categ_var' contains missing values, so it needs one); 'ts' stays a float
dtype_dict = dict(id='UInt8', ts='float32', categ_var='Int32')
dtype_dict

{'id': 'UInt8', 'ts': 'float32', 'categ_var': 'Int32'}

In [14]:
# Cast the columns to the dtypes chosen in dtype_dict. The result is re-assigned even
# though inplace=True is passed; whether the helper also mutates its argument is not
# visible here — TODO confirm in data_utils
data_df = du.utils.convert_dtypes(data_df, dtypes=dtype_dict, inplace=True)

In [16]:
# Memory footprint after the dtype conversion, to compare with the float64 baseline
data_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102236 entries, 0 to 102235
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   id         102236 non-null  UInt8  
 1   ts         102236 non-null  float32
 2   categ_var  102200 non-null  Int32  
dtypes: Int32(1), UInt8(1), float32(1)
memory usage: 1.1 MB


In [None]:
# data_df = du.utils.convert_pyarrow_dtypes(data_df, inplace=True)

In [19]:
# Check the dtype currently held by each column
data_df.dtypes

id             UInt8
ts           float32
categ_var      Int32
dtype: object

In [17]:
# Write the dataframe to a Feather file in the current working directory
data_df.to_feather('data_df.ftr')

In [18]:
# Read the Feather file back; the result is only displayed, data_df itself is not replaced
pd.read_feather('data_df.ftr')

Unnamed: 0,id,ts,categ_var
0,1,0.0,44.0
1,1,0.0,33.0
2,1,0.0,27.0
3,1,0.0,1.0
4,1,0.0,33.0
...,...,...,...
102231,6,0.0,31.0
102232,6,0.0,34.0
102233,6,0.0,48.0
102234,6,0.0,7.0


## Enumerate categories

In [None]:
# data_df.categ_var, enum_dict = du.embedding.enum_categorical_feature(data_df, 'categ_var', nan_value=0, forbidden_digit=0)
# data_df

In [None]:
# enum_dict

In [None]:
%%time
# Time the conversion of the column to a pandas Categorical (displayed only, not stored)
pd.Categorical(data_df['categ_var'])

In [None]:
%%time
# Timing baseline: one-hot encoding with pandas' own get_dummies (default, non-sparse output)
x1 = pd.get_dummies(data_df, columns=['categ_var'])
x1.head()

In [None]:
# Dtypes of the columns produced by the default get_dummies call
x1.dtypes

In [None]:
%%time
# Same encoding, but asking pandas for sparse-backed dummy columns (sparse=True)
x2 = pd.get_dummies(data_df, columns=['categ_var'], sparse=True)
x2.head()

In [None]:
# Dtypes of the columns produced by the sparse get_dummies call
x2.dtypes

In [None]:
%%time
# Time the project's own one-hot encoder for comparison with pd.get_dummies. The second
# return value is discarded. inplace=False is requested, so data_df is presumably left
# untouched — TODO confirm in data_utils
x3, _ = du.data_processing.one_hot_encoding_dataframe(data_df, columns='categ_var', join_rows=False, 
                                                      get_new_column_names=True, inplace=False)
x3.head()

In [None]:
# Dtypes of the columns produced by the inplace=False encoding (x3); x4 does not exist
# yet at this point of a top-to-bottom run
x3.dtypes

In [None]:
%%time
# Same call with inplace=True, to compare its timing with the inplace=False run.
# NOTE(review): if inplace=True modifies data_df itself, the following cells receive an
# already encoded dataframe and re-running this cell is not idempotent — confirm in data_utils
x4, _ = du.data_processing.one_hot_encoding_dataframe(data_df, columns='categ_var', join_rows=False, 
                                                      get_new_column_names=True, inplace=True)
x4.head()

In [None]:
# Dtypes of the columns produced by the inplace=True encoding (x4); no x5 is ever defined
x4.dtypes

In [None]:
# Encode the working dataframe itself and keep the second return value, presumably the
# names of the newly created columns (requested via get_new_column_names=True) — TODO confirm
data_df, ohe_columns = du.data_processing.one_hot_encoding_dataframe(data_df, columns='categ_var', join_rows=False, 
                                                                     get_new_column_names=True, inplace=True)
data_df

In [None]:
# Inspect the second value returned by the encoder (used below as a column selector)
ohe_columns

In [None]:
# Memory footprint right after the one-hot encoding
data_df.info(memory_usage='deep')

In [None]:
# Re-type the columns selected by ohe_columns as pandas' nullable 'boolean' dtype
data_df[ohe_columns] = data_df[ohe_columns].astype('boolean')

In [None]:
# Memory footprint after the boolean conversion, to compare with the previous report
data_df.info(memory_usage='deep')

## Join rows

### Use all values

In [None]:
# Presumably merges the rows that share the same ('id', 'ts') pair, keeping every
# occurrence of each category (unique=False) — TODO confirm the exact output format in data_utils
data_df = du.embedding.join_repeated_rows(data_df, cat_feat='categ_var', id_columns=['id', 'ts'], unique=False, inplace=True)
data_df

#### Convert to a PyTorch tensor

In [None]:
# Convert the whole dataframe to a tensor through its NumPy representation.
# NOTE(review): torch.from_numpy only accepts numeric / bool arrays; confirm that
# data_df.values is not an object array at this point
data_tnsr = torch.from_numpy(data_df.values)
data_tnsr

In [None]:
# Largest value over all elements of the tensor
torch.max(data_tnsr)

#### Save and load the dataframe again

In [None]:
# Save to CSV in the working directory; the index is written too, as an unnamed first column
data_df.to_csv('random_big_data.csv')

In [None]:
# read_csv is a module-level pandas function (DataFrame has no such method), and its
# result must be assigned for the reloaded data to be used
data_df = pd.read_csv('random_big_data.csv')
# to_csv stored the index as an unnamed first column; discard it again
data_df = data_df.drop(columns='Unnamed: 0')
data_df

In [None]:
# Memory footprint of the dataframe after the CSV round trip
data_df.info(memory_usage='deep')

### Only use unique category occurrences

In [None]:
# data_df_set = data_df.groupby(['id', 'ts']).categ_var.apply(lambda df: set(df)).reset_index()
# data_df_set

In [None]:
# data_df_set.info(memory_usage='deep')

In [None]:
# Same join as in the previous section, but with unique=True: presumably each category is
# kept at most once per ('id', 'ts') group — TODO confirm in data_utils
data_df = du.embedding.join_repeated_rows(data_df, cat_feat='categ_var', id_columns=['id', 'ts'], unique=True, inplace=True)
data_df

In [None]:
# Memory footprint after joining the rows with unique=True
data_df.info(memory_usage='deep')

**Comment:** It would be much more pleasant to the eye (and easier to handle in code) if the categorical column was just converted to a series of `set` objects. However, as we can see from the info outputs, using sets instead of large int values can make the dataframe use much more memory (depends on the data, but it can even use 200 times more memory).

In [None]:
# Largest value currently stored in 'categ_var'
data_df.categ_var.max()

In [None]:
# Pass the maximum (cast to float64) to the project's number-to-string helper; presumably
# this prints every digit instead of scientific notation — TODO confirm in data_utils
du.utils.get_full_number_string(data_df.categ_var.astype('float64').max())

In [None]:
# Entry at index label 0, cast to float64 and then to a Python int
int(data_df.categ_var.astype('float64')[0])

#### Convert to a PyTorch tensor

In [None]:
# Values of the row at position 10, as the array pandas would hand to NumPy / PyTorch
data_df.iloc[10].values

In [None]:
# Underlying array of the 'categ_var' column
data_df.categ_var.values

In [None]:
# Convert the whole dataframe to a tensor through its NumPy representation.
# NOTE(review): torch.from_numpy only accepts numeric / bool arrays; confirm that
# data_df.values is not an object array at this point
data_tnsr = torch.from_numpy(data_df.values)
data_tnsr

In [None]:
# Largest value over all elements of the tensor
torch.max(data_tnsr)

#### Save and load the dataframe again

In [None]:
# Save to CSV in the working directory; the index is written too, as an unnamed first column
data_df.to_csv('random_big_data.csv')

In [None]:
# Reload the CSV and discard the unnamed column in which to_csv stored the index
data_df = pd.read_csv('random_big_data.csv').drop(columns='Unnamed: 0')
data_df

In [None]:
# Memory footprint of the dataframe after the CSV round trip
data_df.info(memory_usage='deep')

**Comments:**
* It looks like filtering for the unique category occurrences might help, but converting the result to a NumPy array (and from there to a PyTorch tensor) still won't work, even if it just has 8 different categories in a list.
* I might need to use the dataframe itself in the PyTorch dataloader, extract the encodings to create the embedding columns and only then convert it to a PyTorch tensor.

## Create a dataframe dataloader