In [23]:
import numpy as np
import pandas as pd
import scipy
import torch
import torch.nn.functional as F

torch.set_grad_enabled(False)

import os
import sys
from torch import save

sys.path.append(os.path.abspath('../src'))

from utils import fetch_train_data
from models import *
from preprocess import *

# Fetch data from course homepage
train_df = fetch_train_data(path='../data/train_data_sample.json')

# Cleanse data
prep = Preprocessor(pipeline=[
    DropColumns(cols=[
        'user_name', 'review', 'review_summary', 'rating', 'brand', 'category',
        'size', 'size_suffix', 'price', 'rented_for', 'usually_wear', 'age',
        'body_type'
    ]),
    OrdinalEncoder(cols=['fit', 'cup_size', 'item_name']),
    MedianImputer(cols=['weight', 'height', 'bust_size', 'cup_size']),
])
train_df = prep.cleanse(train_df, is_train=True)
train_df.dropna(subset=['fit'], inplace=True)
train_df = prep.fit_transform(train_df)


<class 'preprocess.DropColumns'>
<class 'preprocess.OrdinalEncoder'>
<class 'preprocess.MedianImputer'>


In [24]:
train_df

Unnamed: 0,fit,item_name,size_main,size_scheme,height,weight,bust_size,cup_size
0,1,193,S,letter,170.18,62.595747,34.0,2.0
1,2,2753,2,number,165.10,56.245454,32.0,6.0
2,1,3756,XL,letter,165.10,68.038855,36.0,6.0
3,1,2374,10,number,170.18,68.038855,34.0,3.0
4,2,519,6,number,165.10,72.574779,34.0,9.0
...,...,...,...,...,...,...,...,...
70208,1,2607,XS,letter,160.02,62.595747,32.0,1.0
70209,1,586,8,number,175.26,62.595747,34.0,4.0
70210,2,3335,8,number,165.10,52.163123,30.0,4.0
70211,1,3367,S,letter,165.10,56.699046,34.0,2.0


In [142]:
from scipy.optimize import minimize, LinearConstraint

y_true = torch.tensor(train_df['fit'].values, dtype=torch.long)

user_vectors = torch.tensor(
    train_df[['weight', 'height', 'bust_size', 'cup_size']].values,
    dtype=torch.float32)

parent_item_vectors = train_df.groupby('item_name')[[
    'weight', 'height', 'bust_size', 'cup_size'
]].mean()
parent_item_vectors = torch.tensor(parent_item_vectors.values,
                                   dtype=torch.float32)

parent_item_indices = torch.tensor(train_df['item_name'].values,
                                   dtype=torch.long)


def loss_fn(weights, threshold_1, threshold_2, parent_item_vectors):
    item_vectors = parent_item_vectors[parent_item_indices]
    fitness_scores = (item_vectors - user_vectors) @ weights
    y_1 = (y_true == 0).type(torch.long) - (y_true == 1).type(torch.long)
    y_2 = (y_true == 1).type(torch.long) - (y_true == 2).type(torch.long)
    loss = -torch.sum(
        F.logsigmoid(y_1 * (threshold_1 - fitness_scores)) +
        F.logsigmoid(y_2 * (threshold_2 - fitness_scores)))
    return loss.item()


def objective_1(x):
    w, b_1, b_2 = x[:4], x[4], x[5]
    return loss_fn(w, b_1, b_2, parent_item_vectors)


def objective_2(x):
    pvs = torch.tensor(x.reshape(-1, 4), dtype=torch.float32)
    return loss_fn(weights, threshold_1, threshold_2, pvs)


weights = torch.randn(4)
threshold_1 = torch.tensor(0.5, dtype=torch.float32)
threshold_2 = torch.tensor(1.5, dtype=torch.float32)
new_parent_item_vectors = parent_item_vectors.detach()

for i in range(4):
    # Optimize weights and thresholds
    x0 = torch.cat([weights, torch.tensor([threshold_1, threshold_2])])
    res = minimize(objective_1,
                   x0=x0,
                   bounds=[(0, None)] * 4 + [(None, None)] * 2)
    print(res)
    weights = torch.tensor(res.x[:4], dtype=torch.float32)
    threshold_1 = torch.tensor(res.x[4], dtype=torch.float32)
    threshold_2 = torch.tensor(res.x[5], dtype=torch.float32)

    # Optimize parent item vectors
    res = minimize(objective_2, new_parent_item_vectors.flatten())
    print(res)
    new_parent_item_vectors = torch.tensor(res.x.reshape(-1, 4))


      fun: 69746.97242163622
 hess_inv: <6x6 LbfgsInvHessProduct with dtype=float64>
      jac: array([-8.14907253e-02, -1.32422429e-01,  3.92901711e-02,  1.28254760e+02,
       -2.03726814e-02,  2.61934476e-02])
  message: 'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 140
      nit: 16
     njev: 20
   status: 0
  success: True
        x: array([ 0.00993784,  0.00947046,  0.01177534,  0.        , -1.64804851,
        1.45565987])
      fun: 69746.9765625
 hess_inv: array([[1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 1]])
      jac: array([0., 0., 0., ..., 0., 0., 0.])
  message: 'Optimization terminated successfully.'
     nfev: 16357
      nit: 0
     njev: 1
   status: 0
  success: True
        x: array([ 65.551414, 166.41096 ,  34.451614, ..., 165.1     ,  34.25    ,
         3.25    ], dtype=float32)
      fun: 69746.972

KeyboardInterrupt: 

In [137]:
# parent_item_vectors
(parent_item_vectors - new_parent_item_vectors).min()

tensor(0.)