Intro:

## Data Loading:

In [1]:
from pathlib import Path
import pickle


DATA_PATH = Path('../data/')
ARTEFACTS_PATH = Path("../artefacts")


# Task 1 utilities:
TASK_1_DATASET = DATA_PATH / Path("instacart-market-basket-analysis/data/instacart-market-basket-analysis")
TASK_1_ARTEFACTS_PATH = ARTEFACTS_PATH / Path('task_1')

TASK_1_PATH_TO_CROSSTABLE = TASK_1_ARTEFACTS_PATH / Path("user_product_crosstable.npz")
TASK_1_PATH_TO_USER_IDS = TASK_1_ARTEFACTS_PATH / Path("user_ids.pkl")
TASK_1_PATH_TO_PRODUCT_IDS = TASK_1_ARTEFACTS_PATH / Path("product_ids.pkl")

TASK_1_PATH_TO_CROSSTABLE_VAL = TASK_1_ARTEFACTS_PATH / Path("user_product_crosstable_val.npz")
TASK_1_PATH_TO_USER_IDS_VAL = TASK_1_ARTEFACTS_PATH / Path("user_ids_val.pkl")
TASK_1_PATH_TO_PRODUCT_IDS_VAL = TASK_1_ARTEFACTS_PATH / Path("product_ids_val.pkl")

TASK_1_PATH_TO_USER_VECTORS = TASK_1_ARTEFACTS_PATH / Path("svd/user_vectors.pkl")
TASK_1_PATH_TO_PRODUCT_VECTORS = TASK_1_ARTEFACTS_PATH / Path("svd/product_vectors.pkl")
TASK_1_PATH_TO_SIGMA_VECTORS = TASK_1_ARTEFACTS_PATH / Path("svd/sigma_vectors.pkl")


TASK_1_PATH_TO_USER_VECTORS_VAL = TASK_1_ARTEFACTS_PATH / Path("svd/user_vectors_val.pkl")
TASK_1_PATH_TO_PRODUCT_VECTORS_VAL = TASK_1_ARTEFACTS_PATH / Path("svd/product_vectors_val.pkl")

## Data preparation:

In [2]:
from functools import partial

import pandas as pd
import numpy as np


def get_train_and_test_orders():

    # 'Products' datasets:
    products = pd.read_csv(TASK_1_DATASET / 'products.csv/products.csv')
    aisles = pd.read_csv(TASK_1_DATASET / 'aisles.csv/aisles.csv')
    departments = pd.read_csv(TASK_1_DATASET / 'departments.csv/departments.csv')

    # Orders datasets:
    orders = pd.read_csv(TASK_1_DATASET / 'orders.csv/orders.csv')
    orderProductsTrain = pd.read_csv(TASK_1_DATASET / 'order_products__train.csv/order_products__train.csv')
    orderProductsPrior = pd.read_csv(TASK_1_DATASET / 'order_products__prior.csv/order_products__prior.csv')

    goods = add_departments_and_aisles_info(products=products, departments=departments, aisles=aisles)

    orders_detailed_train = combine_orders_info(orders=orders, orders_products=orderProductsTrain)
    orders_detailed_train = merge_orders_and_goods(order_detailed=orders_detailed_train, goods=goods)

    orders_detailed_prior = combine_orders_info(orders=orders, orders_products=orderProductsPrior)
    orders_detailed_prior = merge_orders_and_goods(order_detailed=orders_detailed_prior, goods=goods)

    orders_detailed = concat_prior_and_train_orders(
        order_detailed_train=orders_detailed_train, order_detailed_prior=orders_detailed_prior
    )
    return orders_detailed, orders.query("eval_set == 'test'")


def add_departments_and_aisles_info(products: pd.DataFrame, departments: pd.DataFrame, aisles: pd.DataFrame):
    # combine aisles, departments and products (left joined to products)
    goods = pd.merge(
        left=pd.merge(
            left=products, right=departments, how='left'
        ),
        right=aisles,
        how='left'
    )
    # to retain '-' and make product names more "standard"
    goods.product_name = goods.product_name.str.replace(' ', '_').str.lower()
    return goods


def combine_orders_info(orders: pd.DataFrame, orders_products: pd.DataFrame):
    # initialize it with train dataset
    order_details = pd.merge(
        left=orders_products,
        right=orders,
        how='left',
        on='order_id'
    ).apply(partial(pd.to_numeric, errors='ignore', downcast='integer'))
    return order_details


def merge_orders_and_goods(order_detailed: pd.DataFrame, goods: pd.DataFrame):
    # add order hierarchy
    order_detailed = pd.merge(
        left=order_detailed.copy(),
        right=goods[['product_id',
                     'aisle_id',
                     'department_id']].apply(partial(pd.to_numeric,
                                                     errors='ignore',
                                                     downcast='integer')),
        how='left',
        on='product_id'
    )
    return order_detailed


def concat_prior_and_train_orders(order_detailed_train: pd.DataFrame, order_detailed_prior: pd.DataFrame):
    order_detailed_train = order_detailed_train.copy()
    indexes = np.linspace(0, len(order_detailed_prior), num=10, dtype=np.int32)

    for i in range(len(indexes) - 1):
        order_detailed_train = pd.concat([order_detailed_train, order_detailed_prior.iloc[indexes[i]:indexes[i+1], :]])
    return order_detailed_train


def get_last_order_for_users(order_detailed: pd.DataFrame) -> pd.DataFrame:
    mask = order_detailed.groupby("user_id")["order_number"].transform(max) == order_detailed['order_number']
    last_orders = order_detailed.loc[mask]
    return last_orders


In [3]:
orders_detailed, orders_detailed_test = get_train_and_test_orders()

In [4]:
orders_detailed.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,aisle_id,department_id
0,1,49302,1,1,112108,train,4,4,10,9.0,120,16
1,1,11109,2,1,112108,train,4,4,10,9.0,108,16
2,1,10246,3,0,112108,train,4,4,10,9.0,83,4
3,1,49683,4,0,112108,train,4,4,10,9.0,83,4
4,1,43633,5,1,112108,train,4,4,10,9.0,95,15


In [5]:
orders_detailed_test.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
38,2774568,3,test,13,5,15,11.0
44,329954,4,test,6,3,12,30.0
53,1528013,6,test,4,3,16,22.0
96,1376945,11,test,8,6,11,8.0
102,1356845,12,test,6,1,20,30.0


# Split dataset into train and valid:  
Normally, that should be implemented by time-series split. But since there are no timestamps, lets take 'prior' dataset as our train and the 'train' dataset as validation.  
Moreover, let's keep the same set of items in both new subsets.

In [6]:
# 'split':
train_set = orders_detailed.query("eval_set == 'prior'")
val_set = orders_detailed.query("eval_set == 'train'")

In [7]:
# Keep products only from train:
val_set = val_set.query("product_id in @train_set.product_id")

In [8]:
train_set.shape, val_set.shape

((32434489, 12), (1384608, 12))

# Vectors generation pipeline:
1. Create crosstab of USERxPRODUCT pairs; binary counts
2. SVD of it
3. Use vectors as USER and PRODUCT embeddings

In [11]:
def save_artefact(artefact, artefact_path: Path):
    with open(artefact_path, 'wb') as f:
        pickle.dump(artefact, f)

## Create crosstabs (uncomment if want to regenerate):  
The idea is in simulation that the data is recieved in the time-series manner:
- Firstly the `train` set was avaliable, the crosstab created on that data
- Then new data (`valid` set) was avaliable, so the new crosstab will be created on the train+validation data

In [12]:
from scipy.stats.contingency import crosstab


def create_crosstab_artefacts(dataset: pd.DataFrame, crosstab_path, user_ids_path, product_ids_path):

    (user_ids, product_ids), X = crosstab(
        dataset["user_id"].values, dataset["product_id"].values, sparse=True
    )

    # To binary representation of counts:
    X = X != 0
    X = X.astype('int')
    
    # Save:
    paths = [crosstab_path, user_ids_path, product_ids_path]
    objs = [X, user_ids, product_ids]
    for path, obj in zip(paths, objs):
        save_artefact(artefact=obj, artefact_path=path)

In [13]:
# Create crosstab and IDs for train subset:
create_crosstab_artefacts(train_set, crosstab_path=TASK_1_PATH_TO_CROSSTABLE, user_ids_path=TASK_1_PATH_TO_USER_IDS, product_ids_path=TASK_1_PATH_TO_PRODUCT_IDS)

In [19]:
# Create crosstab and IDs for train+val subset:
create_crosstab_artefacts(train_set.append(val_set), crosstab_path=TASK_1_PATH_TO_CROSSTABLE_VAL, user_ids_path=TASK_1_PATH_TO_USER_IDS_VAL, product_ids_path=TASK_1_PATH_TO_PRODUCT_IDS_VAL)

## Load crosstab artefacts:

In [9]:
import numpy as np


def load_crosstab_artefacts(path_to_crosstable: Path, user_ids_filepath: Path, product_ids_filepath: Path):
    if path_to_crosstable.exists() and user_ids_filepath.exists() and product_ids_filepath.exists():
        return load_pickled(path_to_crosstable), load_pickled(user_ids_filepath), load_pickled(product_ids_filepath)
    return None, None, None


def load_pickled(filepath: Path):
    with open(filepath, 'rb') as f:
        return pickle.load(f)


    
X_train, user_ids_train, product_ids_train = load_crosstab_artefacts(
    TASK_1_PATH_TO_CROSSTABLE, 
    TASK_1_PATH_TO_USER_IDS,
    TASK_1_PATH_TO_PRODUCT_IDS
)


X_val, user_ids_val, product_ids_val = load_crosstab_artefacts(
    TASK_1_PATH_TO_CROSSTABLE_VAL, 
    TASK_1_PATH_TO_USER_IDS_VAL,
    TASK_1_PATH_TO_PRODUCT_IDS_VAL
)


## Generate SVD vectors (uncomment if want to regenerate):  
- Firstly the `train` set was avaliable, the crosstab created on that data, so SVD will be performed on that. These vectors will be used in model's training
- Then new data (`valid` set) was avaliable, so the new crosstab will be created on the train+validation data and now SVD will be performed on that also. These vectors will be used in model's validation (only for validation subset)

In [20]:
from sklearn.utils.extmath import randomized_svd



def generate_svd_vectors(crosstab, n_components: int, user_vectors_path, product_vectors_path):

    u, _, v = randomized_svd(crosstab, n_components=n_components)
    
    paths = [user_vectors_path, product_vectors_path]
    objs = [u, v]
    for path, obj in zip(paths, objs):
        save_artefact(artefact=obj, artefact_path=path)

In [24]:
n_components = 500

generate_svd_vectors(X_train, n_components=n_components, user_vectors_path=TASK_1_PATH_TO_USER_VECTORS, product_vectors_path=TASK_1_PATH_TO_PRODUCT_VECTORS)
generate_svd_vectors(X_val, n_components=n_components, user_vectors_path=TASK_1_PATH_TO_USER_VECTORS_VAL, product_vectors_path=TASK_1_PATH_TO_PRODUCT_VECTORS_VAL)

# Neural network pipeline:  
1. For TRAIN subset: get user vectors, get product vectors
2. Predict whether reordered or not
3. Negative sampling?
4. Predict on test set

In [10]:
# Load train vectors: 
with open(TASK_1_PATH_TO_USER_VECTORS, 'rb') as f:
    user_vectors = pickle.load(f)
    
with open(TASK_1_PATH_TO_PRODUCT_VECTORS, 'rb') as f:
    product_vectors = pickle.load(f)
    
    
# Load valid vectors: 
with open(TASK_1_PATH_TO_USER_VECTORS_VAL, 'rb') as f:
    user_vectors_val = pickle.load(f)
    
with open(TASK_1_PATH_TO_PRODUCT_VECTORS_VAL, 'rb') as f:
    product_vectors_val = pickle.load(f)

## Dataset init:

In [11]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


class UserProductDataset(Dataset):
    def __init__(
        self, 
        dataframe: pd.DataFrame,
        user_vectors: np.ndarray, 
        product_vectors: np.ndarray,
        user_ids: np.ndarray, 
        product_ids: np.ndarray
    ):
        self.order_id_samples = dataframe['order_id'].tolist()
        self.user_id_samples = dataframe['user_id'].tolist()
        self.product_id_samples = dataframe['product_id'].tolist()
        self.labels = dataframe['reordered'].tolist()
        
        self.user_ids = user_ids
        self.product_ids = product_ids
        
        self.user_id_to_idx = {id_: idx for idx, id_ in enumerate(self.user_ids)}
        self.product_id_to_idx = {id_: idx for idx, id_ in enumerate(self.product_ids)}
        
        self.user_vectors = user_vectors
        if product_vectors.shape[0] == user_vectors.shape[1]:
            self.product_vectors = product_vectors.T
        else:
            self.product_vectors = product_vectors

    def __len__(self):
        return len(self.order_id_samples)

    def __getitem__(self, idx):
        user_id = self.user_id_samples[idx]
        product_id = self.product_id_samples[idx]
        
        user_idx = self.user_id_to_idx[user_id]
        product_idx = self.product_id_to_idx[product_id]
        
        user_vector = self.user_vectors[user_idx]
        product_vector = self.product_vectors[product_idx]
        
        label = self.labels[idx]
        
        return user_vector, product_vector, label

In [12]:
val_set

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,aisle_id,department_id
0,1,49302,1,1,112108,train,4,4,10,9.0,120,16
1,1,11109,2,1,112108,train,4,4,10,9.0,108,16
2,1,10246,3,0,112108,train,4,4,10,9.0,83,4
3,1,49683,4,0,112108,train,4,4,10,9.0,83,4
4,1,43633,5,1,112108,train,4,4,10,9.0,95,15
...,...,...,...,...,...,...,...,...,...,...,...,...
1384612,3421063,14233,3,1,169679,train,30,0,10,4.0,115,7
1384613,3421063,35548,4,1,169679,train,30,0,10,4.0,13,20
1384614,3421070,35951,1,1,139822,train,15,6,10,8.0,91,16
1384615,3421070,16953,2,1,139822,train,15,6,10,8.0,88,13


In [13]:
train_dataset = UserProductDataset(
    dataframe=train_set,
    user_vectors=user_vectors,
    product_vectors=product_vectors,
    user_ids=user_ids_train,
    product_ids=product_ids_train
)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)


val_dataset = UserProductDataset(
    dataframe=val_set,
    user_vectors=user_vectors_val,
    product_vectors=product_vectors_val,
    user_ids=user_ids_val,
    product_ids=product_ids_val
)
val_dataloader = DataLoader(val_dataset, batch_size=128, shuffle=True)

## Networks init:
1. Cosine Similarity as output
2. Softmax as output

In [88]:
import torch
from torch import nn


class CosineRecommender(nn.Module):
    
    def __init__(self, input_size, hidden_size, output_size):
        super(CosineRecommender, self).__init__()
        self.user_fc = nn.Linear(input_size, hidden_size)  
        self.product_fc = nn.Linear(input_size, hidden_size) 
        self.relu = nn.ReLU()
        self.output = nn.CosineSimilarity()

    def forward(self, users_inputs, products_input):
        uv = self.relu(self.user_fc(users_inputs))
        pv = self.relu(self.product_fc(products_input))
        out = self.output(uv, pv)
        return out
    
    
class SigmoidRecommender(nn.Module):
    
    def __init__(self, input_size, hidden_size, output_size):
        super(SigmoidRecommender, self).__init__()
        self.user_fc = nn.Linear(input_size, hidden_size) 
        self.product_fc = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(hidden_size*2, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, users_inputs, products_input):
        uv = self.relu(self.user_fc(users_inputs))
        pv = self.relu(self.product_fc(products_input))
        x = torch.cat([uv, pv], dim=1)
        x = self.fc1(x)
        x = self.sigmoid(x).squeeze(-1)
        return x

## Training and evaluation:

In [56]:
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import f1_score


# Train function:
def train_on_train(model_class, n_epochs: int, input_size: int=500, hidden_size: int=256, output_size: int=1):

    model = model_class(input_size, hidden_size, output_size)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters())
    
    for _ in range(n_epochs):
        for user_vectors, product_vectors, labels in tqdm(train_dataloader):
            # Convert inputs and labels to tensors
            user_vectors = torch.tensor(user_vectors).float()
            product_vectors = torch.tensor(product_vectors).float()
            labels = torch.tensor(labels).float()

            # Clear gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(user_vectors, product_vectors)
            loss = criterion(outputs, labels)

            # Backward pass and update weights
            loss.backward()
            optimizer.step()

    return model


# Predict function:
def predict_on_val(model, val_dataloader):
    
    with torch.no_grad():
        
        predictions = []
        prodictions_sims = []
        for user_vectors, product_vectors, labels in tqdm(val_dataloader):
        
            user_vectors = torch.tensor(user_vectors).float()
            product_vectors = torch.tensor(product_vectors).float()
            labels = torch.tensor(labels).float()
            
            logits = model(user_vectors, product_vectors)
            outputs = (logits > 0.5).int()
            outputs = outputs.tolist()
            predictions.extend(outputs)
            prodictions_sims.extend(logits.tolist())
    return predictions, prodictions_sims


def calculate_f1(y_true, y_pred):
    f1 = f1_score(y_true, y_pred)
    return f1


# MAP@K:
def precision_at_k(y_true, y_pred, k=5):
    intersection = set(y_pred[:k]) & set(y_true)
    return len(intersection) / k


def average_precision_at_k(y_true, y_pred, k=5):
    hits = 0
    s = 0
    for i in range(len(y_pred[:k])):
        if y_pred[i] in y_true:
            hits += 1
            s += hits / (i + 1)
    if hits == 0:
        return 0
    return s / hits


def map_at_k(y_true, y_pred, k=5):
    assert len(y_true) == len(y_pred)
    return np.mean([
        average_precision_at_k(y_true[i], y_pred[i], k=k)
        for i in range(len(y_true))
    ])


# NDCG:
def ndcg_at_k(y_true, y_pred, k=5):
    ideal_gain = sum([1 / np.log2(i + 2) for i in range(k)])
    dcg = sum([
        1 / np.log2(i + 2)
        for i, rating in enumerate(y_pred[:k])
        if rating in y_true
    ])
    return dcg / ideal_gain


def NDCG_at_k(y_true, y_pred, k=5):
    assert len(y_true) == len(y_pred)
    return np.mean([
        ndcg_at_k(y_true[i], y_pred[i], k=k)
        for i in range(len(y_true))
    ])

### Cosine Similarity Model:

In [57]:
f1_scores = []
mapk_scores = []
ndcgk_scores = []

# n_epochs = [1, 3, 5, 10]  # too slow
n_epochs = [1,]

K = 10


for n_epoch in n_epochs:
    cosine_model = CosineRecommender
    cosine_model = train_on_train(cosine_model, n_epochs=n_epoch)

    cosine_preds, cosine_sims = predict_on_val(cosine_model, val_dataloader)
    f1 = calculate_f1(val_set.reordered, cosine_preds)
    f1_scores.append(f1)
    
    
    # Prepare outputs for MAP@k and other rank-based metrics:
    val_set['cosine_sims'] = cosine_sims
    grouped = val_set.groupby("order_id")
    
    cosine_sims = grouped['cosine_sims'].apply(np.array).values
    cosine_sims = [val.tolist() for val in cosine_sims]

    reordered = grouped['reordered'].apply(np.array).values
    reordered = [val.tolist() for val in reordered]
    
    for i in range(len(cosine_sims)):
        reordered[i], cosine_sims[i] = zip(*sorted(zip(reordered[i], cosine_sims[i]), key=lambda x: x[1]))
        
    mapk = map_at_k(reordered, cosine_sims, k=K)
    ndcgk = NDCG_at_k(reordered, cosine_sims, k=K)
    
    mapk_scores.append(mapk)
    ndcgk_scores.append(ndcgk)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for user_vectors, product_vectors, labels in tqdm(train_dataloader):


  0%|          | 0/253395 [00:00<?, ?it/s]

  user_vectors = torch.tensor(user_vectors).float()
  product_vectors = torch.tensor(product_vectors).float()
  labels = torch.tensor(labels).float()
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for user_vectors, product_vectors, labels in tqdm(val_dataloader):


  0%|          | 0/10818 [00:00<?, ?it/s]

  user_vectors = torch.tensor(user_vectors).float()
  product_vectors = torch.tensor(product_vectors).float()
  labels = torch.tensor(labels).float()


In [72]:
for i in range(len(mapk_scores)):
    print(f"F1 = {round(f1_scores[i], 3)}, MAP@K = {round(mapk_scores[i], 3)}, NDCG@K = {round(ndcgk_scores[i], 3)}, K={K}, n_epochs = {n_epochs[i]}")

F1 = 0.333, MAP@K = 0.71, NDCG@K = 0.335, K=10, n_epochs = 1


### Sigmoid model:

In [89]:
f1_scores = []
mapk_scores = []
ndcgk_scores = []


# n_epochs = [1, 3, 5, 10]  # too slow
n_epochs = [1,]

K = 10


for n_epoch in n_epochs:
    model = SigmoidRecommender
    model = train_on_train(model, n_epochs=n_epoch)

    preds, cosine_sims = predict_on_val(model, val_dataloader)
    f1 = calculate_f1(val_set.reordered, preds)
    f1_scores.append(f1)
    
    
    # Prepare outputs for MAP@k and other rank-based metrics:
    val_set['cosine_sims'] = cosine_sims
    grouped = val_set.groupby("order_id")
    
    cosine_sims = grouped['cosine_sims'].apply(np.array).values
    cosine_sims = [val.tolist() for val in cosine_sims]

    reordered = grouped['reordered'].apply(np.array).values
    reordered = [val.tolist() for val in reordered]
    
    for i in range(len(cosine_sims)):
        reordered[i], cosine_sims[i] = zip(*sorted(zip(reordered[i], cosine_sims[i]), key=lambda x: x[1]))
        
    mapk = map_at_k(reordered, cosine_sims, k=K)
    ndcgk = NDCG_at_k(reordered, cosine_sims, k=K)
    
    mapk_scores.append(mapk)
    ndcgk_scores.append(ndcgk)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for user_vectors, product_vectors, labels in tqdm(train_dataloader):


  0%|          | 0/253395 [00:00<?, ?it/s]

  user_vectors = torch.tensor(user_vectors).float()
  product_vectors = torch.tensor(product_vectors).float()
  labels = torch.tensor(labels).float()
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for user_vectors, product_vectors, labels in tqdm(val_dataloader):


  0%|          | 0/10818 [00:00<?, ?it/s]

  user_vectors = torch.tensor(user_vectors).float()
  product_vectors = torch.tensor(product_vectors).float()
  labels = torch.tensor(labels).float()


In [90]:
for i in range(len(mapk_scores)):
    print(f"F1 = {round(f1_scores[i], 3)}, MAP@K = {round(mapk_scores[i], 3)}, NDCG@K = {round(ndcgk_scores[i], 3)}, K={K}, n_epochs = {n_epochs[i]}")

F1 = 0.358, MAP@K = 0.09, NDCG@K = 0.021, K=10, n_epochs = 1


In [77]:
for i in range(len(mapk_scores)):
    print(f"F1 = {round(f1_scores[i], 3)}, MAP@K = {round(mapk_scores[i], 3)}, NDCG@K = {round(ndcgk_scores[i], 3)}, K={K}, n_epochs = {n_epochs[i]}")

F1 = 0.405, MAP@K = 0.057, NDCG@K = 0.023, K=10, n_epochs = 1
