In [186]:
import numpy as np
import pandas as pd
import scipy
import torch
import torch.nn.functional as F

torch.set_grad_enabled(False)

import os
import sys
from torch import save

sys.path.append(os.path.abspath('../src'))

from utils import fetch_train_data
from models import *
from preprocess import *

# Fetch data from course homepage
train_df = fetch_train_data(path='../data/train_data_sample.json')

# Cleanse data
prep = Preprocessor(pipeline=[
    DropColumns(cols=[
        'user_name', 'review', 'review_summary', 'rating', 'brand', 'category',
        'size', 'size_main', 'size_scheme', 'size_suffix', 'price',
        'rented_for', 'usually_wear', 'age', 'body_type'
    ]),
    OrdinalEncoder(cols=['fit', 'cup_size', 'item_name']),
    # StandardScaler(cols=['weight', 'height', 'bust_size', 'cup_size']),
    MedianImputer(cols=['weight', 'height', 'bust_size', 'cup_size']),
])
train_df = prep.cleanse(train_df, is_train=True)
train_df.dropna(subset=['fit'], inplace=True)
train_df.reset_index(drop=True, inplace=True)
train_df = prep.handle_size_mapping(train_df)
train_df = prep.fit_transform(train_df)


<class 'preprocess.DropColumns'>
<class 'preprocess.OrdinalEncoder'>
<class 'preprocess.MedianImputer'>


In [187]:
train_df

Unnamed: 0,fit,item_name,height,weight,bust_size,cup_size,size_bias
0,1,193,170.18,62.595747,34.0,2.0,0.0
1,2,2753,165.10,56.245454,32.0,6.0,-2.0
2,1,3756,165.10,68.038855,36.0,6.0,2.0
3,1,2374,170.18,68.038855,34.0,3.0,0.0
4,2,519,165.10,72.574779,34.0,9.0,-1.0
...,...,...,...,...,...,...,...
69917,1,2607,160.02,62.595747,32.0,1.0,-2.0
69918,1,586,175.26,62.595747,34.0,4.0,0.5
69919,2,3335,165.10,52.163123,30.0,4.0,-1.5
69920,1,3367,165.10,56.699046,34.0,2.0,-1.5


In [273]:
old_vectors = torch.tensor(train_df.groupby('item_name')[[
    'weight', 'height', 'bust_size', 'cup_size'
]].mean().values,
                           dtype=torch.float32)
old_deviations = torch.ones_like(old_vectors)


class ItemVectorOptimizer:

    def __init__(
            self,
            df: pd.DataFrame,
            w=torch.ones(4, dtype=torch.float32),
            b_1=0.0,
            b_2=0.0,
    ):
        # ground truth
        self.y = torch.tensor(df['fit'].values, dtype=torch.long)
        self.y_1 = (self.y == 0).type(torch.long) - (self.y == 1).type(
            torch.long)
        self.y_2 = (self.y == 1).type(torch.long) - (self.y == 2).type(
            torch.long)
        # Loss weights. First, we compute the inverse of the class frequency,
        # then we normalize the weights so that they sum to 1.
        self.weights = 1 / torch.tensor(
            df['fit'].value_counts().sort_index().values, dtype=torch.float32)
        self.weights /= torch.sum(self.weights)
        self.weights += 2
        self.weights = self.weights[self.y]
        # size bias
        self.bias = torch.tensor(df['size_bias'].values, dtype=torch.float32)
        # user vectors
        self.u_vec = torch.tensor(
            df[['weight', 'height', 'bust_size', 'cup_size']].values,
            dtype=torch.float32)
        # parent item index for each item
        self.pi_idx = torch.tensor(df['item_name'].values, dtype=torch.long)
        # item index for each parent item
        self.pi_idx_inv = torch.tensor(
            df.groupby('item_name').sample(1).sort_index().index.values,
            dtype=torch.long)
        # (initial) parent item vectors
        self.pi_vec = torch.tensor(df.groupby('item_name')[[
            'weight', 'height', 'bust_size', 'cup_size'
        ]].mean().sort_index().values,
                                   dtype=torch.float32)
        # (initial) parent item deviations
        self.pi_dev = torch.ones_like(self.pi_vec)
        # (initial) weights & thresholds
        self.w = w
        self.b_1 = b_1
        self.b_2 = b_2
        # item vectors
        self.i_vec = self.pi_vec[self.pi_idx] + (self.bias *
                                                 self.pi_dev[self.pi_idx].T).T
        # fitness scores
        self.f = (self.i_vec - self.u_vec) @ self.w

    # Projected Gradient Descent 1
    def optimize_weights_thresholds(self, lr=0.01, max_iter=1000):
        for i in range(max_iter + 1):
            # calculate gradients
            sigma_1 = torch.sigmoid(self.y_1 * (self.b_1 - self.f))
            sigma_2 = torch.sigmoid(self.y_2 * (self.b_2 - self.f))
            grad_w = torch.mean(
                ((self.y_1 * (1 - sigma_1) + self.y_2 *
                  (1 - sigma_2))[:, None] *
                 (self.i_vec - self.u_vec)) * self.weights[:, None],
                dim=0)
            grad_b_1 = torch.mean(-self.y_1 * (1 - sigma_1) * self.weights)
            grad_b_2 = torch.mean(-self.y_2 * (1 - sigma_2) * self.weights)
            # update weights and project to non-negative orthant
            self.w -= lr * grad_w
            self.w = torch.max(self.w, torch.zeros_like(self.w))
            # update thresholds
            self.b_1 -= lr * grad_b_1
            self.b_2 -= lr * grad_b_2
            # update fitness scores and loss
            self.f = (self.i_vec - self.u_vec) @ self.w
            self.loss = torch.mean(
                (-torch.log(sigma_1) - torch.log(sigma_2)) * self.weights)
            if i % 100 == 0:
                print(f'Iteration {i}: loss = {self.loss}')

    # Projected Gradient Descent 2
    def optimize_item_vectors(self, lr=0.01, max_iter=1000):
        for i in range(max_iter + 1):
            # calculate gradients
            sigma_1 = torch.sigmoid(self.y_1 * (self.b_1 - self.f))
            sigma_2 = torch.sigmoid(self.y_2 * (self.b_2 - self.f))
            grad_i_vec = (self.y_1 * (1 - sigma_1) + self.y_2 *
                          (1 - sigma_2))[:, None] * self.w
            grad_pi_vec = grad_i_vec[self.pi_idx_inv]
            grad_pi_dev = (self.bias[:, None] * grad_i_vec)[self.pi_idx_inv]
            # update parent item vectors and project deviations to non-negative orthant
            self.pi_vec -= lr * grad_pi_vec
            self.pi_dev -= lr * grad_pi_dev
            self.pi_dev = torch.max(self.pi_dev, torch.zeros_like(self.pi_dev))
            # update item vectors, fitness scores and loss
            self.i_vec = self.pi_vec[
                self.pi_idx] + self.bias[:, None] * self.pi_dev[self.pi_idx]
            self.f = (self.i_vec - self.u_vec) @ self.w
            self.loss = torch.mean(
                (-torch.log(sigma_1) - torch.log(sigma_2)) * self.weights)
            if i % 100 == 0:
                print(f'Iteration {i}: loss = {self.loss}')

    def predict_proba(self):

        prob_2 = torch.sigmoid(self.f - self.b_2)
        prob_1 = torch.sigmoid(self.f - self.b_1) - prob_2
        prob_0 = 1 - prob_1 - prob_2
        return torch.stack([prob_0, prob_1, prob_2], dim=1)

    def predict(self):
        return torch.argmax(self.predict_proba(), dim=1)

    def accuracy(self):
        return torch.mean((self.predict() == self.y).type(torch.float32))

    def f1_score(self):
        from sklearn.metrics import f1_score
        return f1_score(self.y, self.predict(), average='macro')


weights = torch.tensor([1., 1., 1., 1.], dtype=torch.float32)
threshold_1 = torch.tensor(-1, dtype=torch.float32)
threshold_2 = torch.tensor(1, dtype=torch.float32)

optim = ItemVectorOptimizer(train_df,
                            w=weights,
                            b_1=threshold_1,
                            b_2=threshold_2)
for i in range(1, 10):
    print(f'Optimizing weights and thresholds, round {i}')
    optim.optimize_weights_thresholds(lr=1e-3 / i, max_iter=300)
    print(f'Optimizing item vectors, round {i}')
    optim.optimize_item_vectors(lr=1e-5 / i, max_iter=300)
    # print(optim.w, optim.b_1.item(), optim.b_2.item())
    # print((optim.pi_vec - old_vectors).mean().item(),
    #       (optim.pi_dev - old_deviations).mean().item())


Optimizing weights and thresholds, round 1
Iteration 0: loss = 13.73210620880127
Iteration 100: loss = 6.284204006195068
Iteration 200: loss = 3.8980579376220703
Iteration 300: loss = 3.355808973312378
Optimizing item vectors, round 1
Iteration 0: loss = 3.352354049682617
Iteration 100: loss = 3.3522043228149414
Iteration 200: loss = 3.3520548343658447
Iteration 300: loss = 3.3519065380096436
Optimizing weights and thresholds, round 2
Iteration 0: loss = 3.351905107498169
Iteration 100: loss = 3.1982333660125732
Iteration 200: loss = 3.0697100162506104
Iteration 300: loss = 2.958486557006836
Optimizing item vectors, round 2
Iteration 0: loss = 2.957451105117798
Iteration 100: loss = 2.9574224948883057
Iteration 200: loss = 2.9573941230773926
Iteration 300: loss = 2.9573655128479004
Optimizing weights and thresholds, round 3
Iteration 0: loss = 2.9573652744293213
Iteration 100: loss = 2.891524314880371
Iteration 200: loss = 2.8316962718963623
Iteration 300: loss = 2.7775232791900635
Opt

In [279]:
# np.unique(optim.predict(), return_counts=True)
optim.f1_score()
# optim.accuracy()

0.3222197378555875