In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# Root of the shared data directory; the source interactions are read from
# `interim/Amazon/` beneath it.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
data_root = Path('/pio/scratch/1/recommender_systems/')
# Root of the project directory; the k-core and train/test parquet files are
# written to (and re-read from) `Data/` beneath it.
project_root = Path('/pio/scratch/1/i308362/NGCF/')

In [3]:
def create_k_core(pdf, k, k_users=None, k_items=None):
    """Prune interactions until every remaining user and item is frequent enough.

    Rows are removed repeatedly, because dropping an infrequent item can push a
    user below its threshold and vice versa; the loop stops once no count in
    either column is below its minimum.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Interactions with an ``'asin'`` (item) and a ``'reviewerID'`` (user) column.
    k : int
        Default minimum number of rows per user and per item.
    k_users : int, optional
        Minimum number of rows per user; falls back to ``k`` when ``None``.
    k_items : int, optional
        Minimum number of rows per item; falls back to ``k`` when ``None``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with their original index labels. When nothing has to
        be pruned, the input object itself is returned.
    """
    min_user_rows = k if k_users is None else k_users
    min_item_rows = k if k_items is None else k_items

    while True:
        rows_per_item = pdf['asin'].value_counts()
        rows_per_user = pdf['reviewerID'].value_counts()

        item_too_rare = (rows_per_item < min_item_rows).any()
        user_too_rare = (rows_per_user < min_user_rows).any()
        if not (item_too_rare or user_too_rare):
            return pdf

        # Both filters are derived from the same snapshot of counts and applied together.
        frequent_items = rows_per_item[rows_per_item >= min_item_rows].index
        frequent_users = rows_per_user[rows_per_user >= min_user_rows].index
        keep_row = pdf['asin'].isin(frequent_items) & pdf['reviewerID'].isin(frequent_users)
        pdf = pdf[keep_row]

# Clothes

## Creating 10-core

In [21]:
# Source interactions for the Clothing, Shoes and Jewelry category.
clothes_pdf = pd.read_parquet(
    data_root.joinpath('interim', 'Amazon', 'Clothing_Shoes_and_Jewelry_clean.parquet')
)

In [None]:
# 10-core: prune until every remaining user and item has at least 10 rows.
clothes_10_core_pdf = create_k_core(clothes_pdf, k=10)

In [None]:
# Store the 10-core so later sections can reload it instead of recomputing it.
clothes_10_core_pdf.to_parquet(project_root.joinpath('Data', 'clothes_10_core.parquet'))

## Inactive users variant a - train test split

In [4]:
# Reload the stored 10-core from the project's Data directory.
clothes_10_core_pdf = pd.read_parquet(project_root.joinpath('Data', 'clothes_10_core.parquet'))

In [8]:
# Draw the test users from the users that have exactly 10 interactions.
# A dedicated seeded RandomState replaces the global, unseeded np.random.choice
# so that the same users are selected after a kernel restart.
n_test_users = 10000
interactions_per_user = clothes_10_core_pdf['reviewerID'].value_counts()
candidate_users = interactions_per_user[interactions_per_user == 10].index
# replace=False: each user is picked at most once (raises ValueError when there
# are fewer than n_test_users candidates).
test_users = np.random.RandomState(42).choice(candidate_users, n_test_users, replace=False)

In [13]:
# Split per user: test users contribute 5 randomly chosen interactions to the
# test set and their remaining rows to the training set; every other user goes
# to the training set in full.

# Set membership is O(1); `user in <ndarray>` would scan the whole array for
# every user in the loop below.
test_user_ids = set(test_users)
# One seeded stream shared by all users: reproducible across runs, yet each
# user still gets a different draw.
holdout_rng = np.random.RandomState(42)

train_parts = []
test_parts = []

for user, user_pdf in clothes_10_core_pdf.groupby('reviewerID'):
    if user not in test_user_ids:
        train_parts.append(user_pdf)
        continue

    held_out_pdf = user_pdf.sample(5, random_state=holdout_rng)
    test_parts.append(held_out_pdf)
    # Whatever was not held out stays in training.
    train_parts.append(user_pdf[~user_pdf.index.isin(held_out_pdf.index)])

train_pdf = pd.concat(train_parts)
test_pdf = pd.concat(test_parts)

In [19]:
# Write both halves of the variant-a split to the project's Data directory.
train_pdf.to_parquet(project_root.joinpath('Data', 'inactive_a_train.parquet'))
test_pdf.to_parquet(project_root.joinpath('Data', 'inactive_a_test.parquet'))

## Inactive users variant b

In [4]:
def get_inactive(pdf, k_users, k_items, max_users):
    """Prune interactions down to moderately active users and frequent items.

    Rows are removed repeatedly until every item has at least ``k_items`` rows
    and every user has between ``k_users`` and ``max_users`` rows (inclusive).
    Repetition is needed because removing rows for one reason changes the
    counts that the other conditions depend on.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Interactions with an ``'asin'`` (item) and a ``'reviewerID'`` (user) column.
    k_users : int
        Minimum number of rows a user must have.
    k_items : int
        Minimum number of rows an item must have.
    max_users : int
        Maximum number of rows a user may have.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with their original index labels. When nothing has to
        be pruned, the input object itself is returned.
    """
    while True:
        per_item = pdf['asin'].value_counts()
        per_user = pdf['reviewerID'].value_counts()

        item_ok = per_item >= k_items
        user_ok = (per_user >= k_users) & (per_user <= max_users)
        if item_ok.all() and user_ok.all():
            return pdf

        # Item and user filters come from the same snapshot and are applied in one step.
        kept_items = per_item[item_ok].index
        kept_users = per_user[user_ok].index
        pdf = pdf[pdf['asin'].isin(kept_items) & pdf['reviewerID'].isin(kept_users)]

In [5]:
# Source interactions for the Clothing, Shoes and Jewelry category.
clothes_pdf = pd.read_parquet(
    data_root.joinpath('interim', 'Amazon', 'Clothing_Shoes_and_Jewelry_clean.parquet')
)

In [9]:
# Keep users with 7 to 15 interactions and items with at least 10 interactions.
inactive_pdf = get_inactive(clothes_pdf, k_users=7, k_items=10, max_users=15)

In [13]:
# Draw the test users from the users that have exactly 10 interactions.
# A dedicated seeded RandomState replaces the global, unseeded np.random.choice
# so that the same users are selected after a kernel restart.
n_test_users = 10000
interactions_per_user = inactive_pdf['reviewerID'].value_counts()
candidate_users = interactions_per_user[interactions_per_user == 10].index
# replace=False: each user is picked at most once (raises ValueError when there
# are fewer than n_test_users candidates).
test_users = np.random.RandomState(42).choice(candidate_users, n_test_users, replace=False)

In [14]:
# Split per user: test users contribute 5 randomly chosen interactions to the
# test set and their remaining rows to the training set; every other user goes
# to the training set in full.

# Set membership is O(1); `user in <ndarray>` would scan the whole array for
# every user in the loop below.
test_user_ids = set(test_users)
# One seeded stream shared by all users: reproducible across runs, yet each
# user still gets a different draw.
holdout_rng = np.random.RandomState(42)

train_parts = []
test_parts = []

for user, user_pdf in inactive_pdf.groupby('reviewerID'):
    if user not in test_user_ids:
        train_parts.append(user_pdf)
        continue

    held_out_pdf = user_pdf.sample(5, random_state=holdout_rng)
    test_parts.append(held_out_pdf)
    # Whatever was not held out stays in training.
    train_parts.append(user_pdf[~user_pdf.index.isin(held_out_pdf.index)])

train_pdf = pd.concat(train_parts)
test_pdf = pd.concat(test_parts)

In [18]:
# Write both halves of the variant-b split to the project's Data directory.
train_pdf.to_parquet(project_root.joinpath('Data', 'inactive_b_train.parquet'))
test_pdf.to_parquet(project_root.joinpath('Data', 'inactive_b_test.parquet'))