In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('../')
import logging
logging.basicConfig(
    level=logging.DEBUG,  # Set the logging level
    format='%(asctime)s - %(levelname)s - %(message)s',  # Specify the log message format
    datefmt='%Y-%m-%d %H:%M:%S',  # Specify the date format
    handlers=[
        #logging.FileHandler('app.log'),  # Log messages to a file
        logging.StreamHandler()  # Also output log messages to the console
    ]
)
logger = logging.getLogger(__name__)

In [None]:
from relbench.datasets import get_dataset
dataset = get_dataset("rel-hm", download=True)

In [24]:
customer_file = '/mnt/data/rel-hm/db/customer.parquet'
transactions_file = '/mnt/data/rel-hm/db/transactions.parquet'
article_file = '/mnt/data/rel-hm/db/article.parquet'

# Load the data
import pandas as pd
customer = pd.read_parquet(customer_file)
transactions = pd.read_parquet(transactions_file)
article = pd.read_parquet(article_file)

In [3]:
# get statistics of the data
print("Customer data")
print(customer.head())
print(customer.describe())
print("\nArticle data")
print(article.head())
print(article.describe())
print("\nTransactions data")
print(transactions.head())
print(transactions.describe())

Customer data
   customer_id   FN  Active club_member_status fashion_news_frequency   age  \
0            0  NaN     NaN             ACTIVE                   NONE  49.0   
1            1  NaN     NaN             ACTIVE                   NONE  25.0   
2            2  NaN     NaN             ACTIVE                   NONE  24.0   
3            3  NaN     NaN             ACTIVE                   NONE  54.0   
4            4  1.0     1.0             ACTIVE              Regularly  52.0   

                                         postal_code  
0  52043ee2162cf5aa7ee79974281641c6f11a68d276429a...  
1  2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...  
2  64f17e6a330a85798e4998f62d0930d14db8db1c054af6...  
3  5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...  
4  25fa5ddee9aac01b35208d01736e57942317d756b32ddd...  
         customer_id        FN    Active           age
count      1371980.0  476930.0  464404.0  1.356119e+06
mean        685989.5       1.0       1.0  3.638696e+01
std    396056.65

In [17]:
# num of columns
print("\nNumber of columns")
print("Customer: ", len(customer.columns))
print("Article: ", len(article.columns))
print("Transactions: ", len(transactions.columns))

# num rows
print("\nNumber of rows")
print("Customer: ", len(customer))
print("Article: ", len(article))
print("Transactions: ", len(transactions))



Number of columns
Customer:  7
Article:  25
Transactions:  5

Number of rows
Customer:  1371980
Article:  105542
Transactions:  15453651


In [25]:
# merge all the data into single dataframe using customer_id and article_id
df = pd.merge(transactions, customer, on='customer_id')
df = pd.merge(df, article, on='article_id')

In [6]:
# check if customer and article ids overlap
print(len(set(df['customer_id']).intersection(set(df['article_id']))))

52972


In [26]:
# renumber the customer and article ids so that they are unique and continuous
customer_ids = list(set(df['customer_id']))
customer_ids.sort()
customer_id_map = {customer_ids[i]: i for i in range(len(customer_ids))}
article_ids = list(set(df['article_id']))
article_ids.sort()
article_id_map = {article_ids[i]: i + len(customer_ids) for i in range(len(article_ids))}
df['customer_id'] = df['customer_id'].map(customer_id_map)
df['article_id'] = df['article_id'].map(article_id_map)

In [33]:
# convert t_dat column from numpy.datetime64 to int
df['t_dat'] = df['t_dat'].astype('int64') // 10**9

In [35]:
# print t_dat statistics
print("\nT_DAT statistics")
df['t_dat'] = df['t_dat'] 
print(df['t_dat'].describe())
print(df.head())


T_DAT statistics
count    1.545365e+07
mean     1.584823e+09
std      9.501517e+06
min      1.567814e+09
25%      1.576454e+09
50%      1.585872e+09
75%      1.592957e+09
max      1.600733e+09
Name: t_dat, dtype: float64
        t_dat  customer_id  article_id     price  sales_channel_id  FN  \
0  1567814400          113     1031926  0.010153                 1 NaN   
1  1567814400          113     1031926  0.010153                 1 NaN   
2  1567814400          113     1058280  0.042356                 1 NaN   
3  1567814400          113     1010849  0.005068                 1 NaN   
4  1567814400          113     1053735  0.033881                 1 NaN   

   Active club_member_status fashion_news_frequency   age  ...  \
0     NaN             ACTIVE                   NONE  43.0  ...   
1     NaN             ACTIVE                   NONE  43.0  ...   
2     NaN             ACTIVE                   NONE  43.0  ...   
3     NaN             ACTIVE                   NONE  43.0  ...   
4  

In [16]:
df.to_csv('/mnt/data/rel-hm/dummy.csv', index=False)

   t_dat  customer_id  article_id     price  sales_channel_id  FN  Active  \
0      1            0        4899  0.010153                 1 NaN     NaN   
1      1            0        4899  0.010153                 1 NaN     NaN   
2      1            0        7600  0.042356                 1 NaN     NaN   
3      1            0        3397  0.005068                 1 NaN     NaN   
4      1            0        7267  0.033881                 1 NaN     NaN   

  club_member_status fashion_news_frequency   age  ... department_name  \
0             ACTIVE                   NONE  43.0  ...   Tops Knitwear   
1             ACTIVE                   NONE  43.0  ...   Tops Knitwear   
2             ACTIVE                   NONE  43.0  ...      Dresses DS   
3             ACTIVE                   NONE  43.0  ...   Tops Knitwear   
4             ACTIVE                   NONE  43.0  ...    Jersey fancy   

   index_code  index_name  index_group_no index_group_name section_no  \
0           D     D

In [36]:
# print column names and types
print("\nColumn names and types")
print(df.dtypes)


Column names and types
t_dat                             int64
customer_id                       int64
article_id                        int64
price                           float64
sales_channel_id                  int64
FN                              float64
Active                          float64
club_member_status               object
fashion_news_frequency           object
age                             float64
postal_code                      object
product_code                      int64
prod_name                        object
product_type_no                   int64
product_type_name                object
product_group_name               object
graphical_appearance_no           int64
graphical_appearance_name        object
colour_group_code                 int64
colour_group_name                object
perceived_colour_value_id         int64
perceived_colour_value_name      object
perceived_colour_master_id        int64
perceived_colour_master_name     object
department_no   

In [37]:
# print columns that have missing values
print(df.isnull().sum())

t_dat                                 0
customer_id                           0
article_id                            0
price                                 0
sales_channel_id                      0
FN                              8755179
Active                          8849707
club_member_status                27500
fashion_news_frequency            75406
age                               59198
postal_code                           0
product_code                          0
prod_name                             0
product_type_no                       0
product_type_name                     0
product_group_name                    0
graphical_appearance_no               0
graphical_appearance_name             0
colour_group_code                     0
colour_group_name                     0
perceived_colour_value_id             0
perceived_colour_value_name           0
perceived_colour_master_id            0
perceived_colour_master_name          0
department_no                         0


In [None]:
# does every article_id have the same prod_name
print(df.groupby('article_id')['prod_name'].nunique().value_counts())

In [38]:
# store the merged data in a csv file
df.to_csv('/mnt/data/rel-hm/rel-hm.csv', index=False)

: 

In [39]:
import time
from src.datasets.util.mask import PretrainType
from src.datasets import RelHM
dataset = RelHM(
    root='/mnt/data/rel-hm/rel-hm.csv', 
    pretrain={PretrainType.MASK, PretrainType.LINK_PRED},
    split_type='daily_temporal',
    splits=[0.6,0.2,0.2], 
    khop_neighbors=[100,100],
    ports=True
)
logger.info(f"Materialzing dataset...")
s = time.time()
dataset.materialize()
logger.info(f"Materialized in {time.time() - s:.2f} seconds")
dataset.df.head(5)
num_columns = len(dataset.num_columns)
cat_columns = len(dataset.cat_columns)
logger.info(f"Number of numerical columns: {num_columns}")
logger.info(f"Number of categorical columns: {cat_columns}")

2024-08-05 11:34:39 - INFO - Creating graph...
2024-08-05 11:35:21 - INFO - Graph created in 42.54 seconds.
2024-08-05 11:35:21 - INFO - Adding ports...
2024-08-05 11:50:51 - INFO - Ports added in 929.89 seconds.
2024-08-05 11:50:51 - INFO - Creating masked columns and saving to /mnt/data/rel-hm/rel-hm.csv.npy


In [11]:
df_train = df.sample(frac=0.2)
len(df_train)

In [13]:
# renumber the customer and article ids so that they are unique and continuous
customer_ids = list(set(df_train['customer_id']))
customer_ids.sort()
customer_id_map = {customer_ids[i]: i for i in range(len(customer_ids))}
article_ids = list(set(df_train['article_id']))
article_ids.sort()
article_id_map = {article_ids[i]: i + len(customer_ids) for i in range(len(article_ids))}
df_train['customer_id'] = df_train['customer_id'].map(customer_id_map)
df_train['article_id'] = df_train['article_id'].map(article_id_map)

In [14]:
df_train.to_csv('/mnt/data/rel-hm/rel-hm-3M.csv', index=False)