In [1]:
import pandas as pd
import numpy as np
import random

import feat_eng as fe
import data_selector_items as dsi
import params_newsvendor as prm

import torch
import torch.nn as nn
import torch.optim as optim

from mip import Model, xsum, minimize, INTEGER, CONTINUOUS, CutType, OptimizationStatus

from qpth.qp import QPFunction

from joblib import Parallel, delayed

from sklearn.preprocessing import MinMaxScaler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
is_cuda = False
dev = torch.device('cpu')  
if torch.cuda.is_available():
    is_cuda = True
    dev = torch.device('cuda')  

## Introduction

#### This tutorial is connected to the paper "Predict-and-Optimize: A survey on problem variations and approaches." Here, we aim to show that finding data-driven decisions (from an incomplete but predictable Optimization Problem) using Gradient Descent approaches with Operational loss functions (combined approaches) usually is better than separating your predictions from the Optimization Problem (decoupled approaches).

##### Decoupled approaches: First uses Machine Learning techniques to find predictions. And then uses those predictions to formulate and solve an Optimization Problem.

##### Combined approaches: Uses Machine Learning and Mathematical Programming techniques in each step of the learning to solve the incomplete Optimization Problem 

***
$\mathbf{\text{Table of content}}$<br>

$\mathbf{\text{0. Data}}$<br>

$\mathbf{\text{1. Vanilla Newsvendor Problem}}$<br>

1.1 Decoupled approach

1.2 Combined approach

1.3 Regret evaluation

$\mathbf{\text{2. Adding a Budget constraint}}$<br>

2.1 Decoupled approach
   
2.2 Combined state-of-the-art approach (KKT differentiation)

2.3 Regret evaluation

$\mathbf{\text{3. Adding Integrality constraints (Discrete OP)}}$<br>

3.1 Decoupled approach

3.2 Combined state-of-the-art approach (Gomory cuts + KKT differentiation)

3.3 Regret evaluation

## 0. Data

In [3]:
# Setting the seeds to allow replication
# Changing the seed might require hyperparameter tuning again
# Because it changes the deterministic parameters
seed_number = 0
np.random.seed(seed_number)
torch.manual_seed(seed_number)
random.seed(seed_number)

In [4]:
# Path of data files
path_data = './data/'

# Read historical data
sales = pd.read_csv(path_data + 'sales_train_evaluation.csv')

# Spliting the data in days
start_tr_day = 800 # Avoid changing start date
start_val_day = 1442
start_test_day = 1542
end_day = 1641

# N of items to use
n_items = 8

# All useful items
sku_ids = dsi.select_items(sales, start_tr_day)

# Sample only n_items to use
random.seed(seed_number)
sku_ids = random.sample(sku_ids, n_items)
sku_ids = list(set(sku_ids))
n_items = len(sku_ids)


# Build training and test from historical data
data_train, data_val, data_test, feat, n_items = fe.build_data(
    path_data, sales, sku_ids, 
    start_tr_day, start_val_day, start_test_day, end_day)

data_train.fillna(0, inplace=True)
data_val.fillna(0, inplace=True)
data_test.fillna(0, inplace=True)

dx = len(feat)

# Number of batch_size samples in the SGDs
batch_size = 32 # Number of days, and not samples

# Here we change a bit the test data in order to increase the integrality gap 
# of the optimization problem (didactic purpose). Otherwise the Continuous and 
# Discrete version would have approximately the same results.
data_train['qty'] = data_train['qty']*np.random.normal(1, 0.07)
data_val['qty'] = data_val['qty']*np.random.normal(1, 0.07)
data_test['qty'] = data_test['qty']*np.random.normal(1, 0.07)

scaler = MinMaxScaler()
scaler.fit(data_train[feat])

data_train.loc[:, feat] = scaler.transform(data_train[feat])
data_val.loc[:, feat] = scaler.transform(data_val[feat])
data_test.loc[:, feat] = scaler.transform(data_test[feat])
        
X_train = torch.tensor(np.array(data_train[feat]).astype('double'), requires_grad= True, device=dev)
y_train = torch.tensor(np.array(data_train['qty']).astype('double'), requires_grad= True, device=dev)

X_val = torch.tensor(np.array(data_val[feat]).astype('double'), requires_grad= True, device=dev)
y_val = torch.tensor(np.array(data_val['qty']).astype('double'), requires_grad= True, device=dev)

X_test = torch.tensor(np.array(data_test[feat]).astype('double'), requires_grad= True, device=dev)
y_test = torch.tensor(np.array(data_test['qty']).astype('double'), requires_grad= True, device=dev)

In [5]:
def generate_batches(data):
    batches_idx = []
    n_batches = int(np.floor(data['d'].nunique() / batch_size))
    for i in range(0, n_batches):
        days = data['d'].unique()
        idx = data[data['d'].isin(
            np.random.choice(days, batch_size, replace=False))].index.tolist()
        if len(idx) == n_items*batch_size:
            batches_idx.append(idx)
    return batches_idx

## 1. Vanilla Newsvendor Problem

***
$\mathbf{\text{Notation}}$<br>
***

\begin{align}
&f_{day}: \text{Objective (Cost) Function for the } I^{th} \text{ day of sales}\\
&z: \text{Decision Variable (Orders to make for each item)}\\
&y_I: \text{Unknown but predictable parameter (Demand for each item for the day I)}\\\\
\end{align}

***
$\mathbf{\text{Deterministic Parameters}}$<br>
***

\begin{align}
&c: \text{ Linear constant for transport cost }  \\
&c_s: \text{Linear constant for shortage cost  }  \\
&c_w: \text{Linear constant for excess cost } \\\\
\end{align}

***
$\mathbf{\text{Objective function}}$<br>
***

\begin{align}
f(z,y_I) = c^Tz + c_s^T max(0, y_I-z) + c_w^T max(0, z-y_I)
\end{align}

In [6]:
params_t, params_np = prm.get_params(n_items, is_discrete=False, 
                                     q_factor = 0.01, # Quadratic penalty factor
                                     seed_number=0)

In [7]:
print('Example of deterministic parameters:', params_t['cs'][:2])

Example of deterministic parameters: tensor([147., 109.])


In [8]:
cost_per_item = lambda Z, Y : params_t['c'].to(dev)*Z.to(dev) \
                            + params_t['cs'].to(dev)*torch.max(torch.zeros((n_items)).to(dev),Y.to(dev)-Z.to(dev)) \
                            + params_t['cw'].to(dev)*torch.max(torch.zeros((n_items)).to(dev),Z.to(dev)-Y.to(dev))


def reshape_outcomes(y_pred, y):
    y_pred = torch.reshape(y_pred, (y_pred.shape[0]//n_items, n_items))
    y = torch.reshape(y, (y.shape[0]//n_items, n_items))
    return y_pred, y

def calc_f_por_item(y_pred, y):
    y_pred, y = reshape_outcomes(y_pred, y)
    z_star =  argmin_solver(y_pred)
    f_per_item = cost_per_item(z_star, y)
    return f_per_item

def calc_f_per_day(y_pred, y):
    f_per_item = calc_f_por_item(y_pred, y)
    f = torch.sum(f_per_item, 1)
    return f

### 1.1 Vanilla Newsvendor Problem - Decoupled Approach

In [9]:
#################################################################################
## Model (h: X->Y) constructor ##################################################
#################################################################################

class ANN(nn.Module):
    # Initialize the layers
    def __init__(self, n_feat):
        super().__init__()
        
        self.act1 = nn.ReLU()
        self.act2 = nn.Tanh()
        self.bn = nn.BatchNorm1d(n_feat).double()
        self.linear1 = nn.Linear(n_feat, 15).double()
        self.bn2 = nn.BatchNorm1d(15).double()
        self.linear2 = nn.Linear(15, 1).double()
    
    # Perform the computation
    def forward(self, x):
        x = self.bn(x)
        x = self.linear1(x)
        x = self.act1(x)
        x = self.bn2(x)
        x = self.linear2(x)
        x = self.act1(x)
        return x
    
    
# Decoupled approach (mse loss)
h = ANN(n_feat=dx).to(dev)
opt_h = torch.optim.Adam(h.parameters(), lr=0.005)
mse_loss = nn.MSELoss()


def train_one_epoch(X_train, y_train, loss_function, optimizer, model):
    
    batches_idx = generate_batches(data_train)
    
    for b in batches_idx:
        x_tr = X_train[b]
        y_tr = y_train[b]

        optimizer.zero_grad()
        preds = model(x_tr)

        train_loss = loss_function(preds.reshape(-1), y_tr.reshape(-1))
        train_loss.backward()
        optimizer.step()
        
        
def validate_one_epoch(X, y, model, data):
    
    batches_idx = generate_batches(data)
    f_sum = 0
    
    with torch.no_grad():
        for b in batches_idx:
            x_ = X[b]
            y_ = y[b]
    
            f_ = cost_fn(model(x_).reshape(-1), y_.reshape(-1))  
            f_sum = f_sum + f_

    return f_sum/len(batches_idx)

In [10]:
n_epochs = 50

rmse_costs_sep = []

for i in range(0, n_epochs):
    train_one_epoch(X_train, y_train, mse_loss, opt_h, h)
    
    with torch.no_grad():
        train_rmse_sep = mse_loss(h(X_train).reshape(-1), y_train.reshape(-1))  
        val_rmse_sep = mse_loss(h(X_val).reshape(-1), y_val.reshape(-1))

        rmse_costs_sep.append(val_rmse_sep.data.item())

        print(
              'DECOUPLED: Train: ', 
               'mse:', round(train_rmse_sep.data.item(), 2), 
               '\tVal: ', 
               'mse:', round(val_rmse_sep.data.item(), 2))

DECOUPLED: Train:  mse: 28.27 	Val:  mse: 22.53
DECOUPLED: Train:  mse: 21.82 	Val:  mse: 16.71
DECOUPLED: Train:  mse: 19.23 	Val:  mse: 14.72
DECOUPLED: Train:  mse: 18.6 	Val:  mse: 13.98
DECOUPLED: Train:  mse: 18.46 	Val:  mse: 13.49
DECOUPLED: Train:  mse: 18.39 	Val:  mse: 13.68
DECOUPLED: Train:  mse: 18.4 	Val:  mse: 13.82
DECOUPLED: Train:  mse: 18.41 	Val:  mse: 13.72
DECOUPLED: Train:  mse: 18.37 	Val:  mse: 13.82
DECOUPLED: Train:  mse: 18.34 	Val:  mse: 13.94
DECOUPLED: Train:  mse: 18.21 	Val:  mse: 13.74
DECOUPLED: Train:  mse: 18.16 	Val:  mse: 14.04
DECOUPLED: Train:  mse: 18.04 	Val:  mse: 13.86
DECOUPLED: Train:  mse: 17.92 	Val:  mse: 14.04
DECOUPLED: Train:  mse: 17.84 	Val:  mse: 14.02
DECOUPLED: Train:  mse: 17.75 	Val:  mse: 14.01
DECOUPLED: Train:  mse: 17.66 	Val:  mse: 14.12
DECOUPLED: Train:  mse: 17.68 	Val:  mse: 14.29
DECOUPLED: Train:  mse: 17.51 	Val:  mse: 14.15
DECOUPLED: Train:  mse: 17.38 	Val:  mse: 14.17
DECOUPLED: Train:  mse: 17.33 	Val:  mse: 

***
$\mathbf{\text{Predicting all test outcomes (demand of sales)}}$<br>
***

\begin{align}
\hat{Y} = h(X)
\end{align}

##### Note that $Y = [y_I]_1^{N_{days}}$

In [11]:
y_pred = h(X_test)

***
$\mathbf{\text{Defining the solver as a function of the prediction (without constraints)}}$<br>
***

\begin{align}
z^*(\hat{y_I}) = argmin_z \text{ } f(z,\hat{y_I}) \\ subject \quad to \quad z\geq0
\end{align}

##### In this case (1.1 and 1.2), we consider an analytical solution to calculate the argmin that allows simple backpropagation via pytorch. This is possible because theproblem is still simple, the only constraint is that z>=0

***
$\mathbf{\text{The analytical solution to find the argmin in this case is trivial:}}$<br>
***

\begin{align}
z^*(\hat{y_I}) = \hat{y_I} \\ 
\end{align}

This is because we set $c<c_s$ for all items

In [12]:
# Analytical solution to find the argmin
# This function allows autograd (backpropagation)
def argmin_solver(y_pred):
    z_star = torch.where(params_t['c'].to(dev)>=params_t['cs'].to(dev), 
                         torch.tensor(0.0).double().to(dev), y_pred).double()
    return z_star

***
$\mathbf{\text{Compute the final cost function (average through days) based on made decisions}}$<br>
***

\begin{align}
\frac{1}{N_{days}} \sum_{I=1}^{N_{days}} f(z^*(\hat{y_I}),y_I) 
\end{align}

In [13]:
def cost_fn(y_pred, y):
    f = calc_f_per_day(y_pred, y)
    f_total = torch.mean(f)
    return f_total

In [14]:
pred_cost_11 = cost_fn(y_pred, y_test)
print('Final cost on Test Data:', round(pred_cost_11.item(), 2))

Final cost on Test Data: 5661.63


### 1.2 Vanilla Newsvendor Problem - Combined Approach

In [15]:
# Combined approach (mse loss)
hc = ANN(n_feat=dx).to(dev)
opt_hc = torch.optim.Adam(hc.parameters(), lr=0.002)

In [16]:
n_epochs = 50

obj_costs_com = []

for i in range(0, n_epochs):
    train_one_epoch(X_train, y_train, cost_fn, opt_hc, hc)
    
    with torch.no_grad():
        train_cost_com = cost_fn(hc(X_train).reshape(-1), y_train.reshape(-1))  
        val_cost_com = cost_fn(hc(X_val).reshape(-1), y_val.reshape(-1))

        obj_costs_com.append(val_cost_com.data.item())

        print(
                  'COMBINED: Train: ', 
                   'f:', round(train_cost_com.data.item(), 2), 
                   '\tVal: ', 
                   'f:', round(val_cost_com.data.item(), 2))

COMBINED: Train:  f: 4818.88 	Val:  f: 4548.05
COMBINED: Train:  f: 4683.01 	Val:  f: 4412.58
COMBINED: Train:  f: 4598.59 	Val:  f: 4328.21
COMBINED: Train:  f: 4566.53 	Val:  f: 4304.44
COMBINED: Train:  f: 4549.55 	Val:  f: 4306.39
COMBINED: Train:  f: 4536.95 	Val:  f: 4288.6
COMBINED: Train:  f: 4534.74 	Val:  f: 4280.92
COMBINED: Train:  f: 4530.56 	Val:  f: 4290.34
COMBINED: Train:  f: 4520.0 	Val:  f: 4276.65
COMBINED: Train:  f: 4510.87 	Val:  f: 4272.82
COMBINED: Train:  f: 4503.97 	Val:  f: 4281.15
COMBINED: Train:  f: 4497.78 	Val:  f: 4294.7
COMBINED: Train:  f: 4489.77 	Val:  f: 4279.57
COMBINED: Train:  f: 4487.85 	Val:  f: 4270.8
COMBINED: Train:  f: 4478.18 	Val:  f: 4269.71
COMBINED: Train:  f: 4473.4 	Val:  f: 4264.56
COMBINED: Train:  f: 4464.9 	Val:  f: 4263.07
COMBINED: Train:  f: 4457.41 	Val:  f: 4254.96
COMBINED: Train:  f: 4453.57 	Val:  f: 4267.65
COMBINED: Train:  f: 4447.29 	Val:  f: 4267.6
COMBINED: Train:  f: 4435.07 	Val:  f: 4246.11
COMBINED: Train:  f:

***
$\mathbf{\text{Predicting all test outcomes (demand of sales)}}$<br>
***

\begin{align}
\hat{Y} = h(X)
\end{align}

Note that $Y = [y_I]_1^{N_{days}}$

In [17]:
y_pred = hc(X_test)

***
$\mathbf{\text{Compute the final cost function (average through days) based on made decisions}}$<br>
***

\begin{align}
PredictedCost = \frac{1}{N_{days}} \sum_{I=1}^{N_{days}} f(z^*(\hat{y_I}),y_I) 
\end{align}

In [18]:
pred_cost_12 = cost_fn(y_pred, y_test)
print('Final cost on Test Data:', round(pred_cost_12.item(), 2))

Final cost on Test Data: 4846.89


***
$\mathbf{\text{Compute the final cost function (average through days) based on BEST decisions}}$<br>
***

\begin{align}
Optimal Cost = \frac{1}{N_{days}} \sum_{I=1}^{N_{days}} f(z^*(y_I),y_I) 
\end{align}

In [19]:
best1_cost = cost_fn(y_test, y_test)
print('Best cost on Test Data:', round(best1_cost.item(), 2))

Best cost on Test Data: 2613.45


***
$\mathbf{\text{Compute the cumulative regrets from 1.1 and 1.2}}$<br>
***

\begin{align}
Regret = PredictedCost - OptimalCost
\end{align}

In [20]:
regret11 = pred_cost_11 - best1_cost
regret12 = pred_cost_12 - best1_cost

print('Cumulative regret: \n \
        1.1 -> {regret11} \n \
        1.2 -> {regret12} \
'.format(
    regret11=int(regret11),
    regret12=int(regret12)
))

Cumulative regret: 
         1.1 -> 3048 
         1.2 -> 2233 


***
$\mathbf{\text{Compute the normalized regret from 1.1 and 1.2}}$<br>
***

\begin{align}
Normalized Regret = \frac{Regret}{Optimal Cost} 
\end{align}

In [21]:
cr11 = regret11/best1_cost
cr12 = regret12/best1_cost

print('Normalized regret: \n \
        1.1 -> {cr11} \n \
        1.2 -> {cr12} \
'.format(
    cr11=cr11,
    cr12=cr12
))

Normalized regret: 
         1.1 -> 1.1663428964028562 
         1.2 -> 0.854596406328829 


## 2 Adding a Budget constraint

##### This class formulates an Linear Programming (relaxed version of the MILP). We explain how to transform this problem in a Linear Program formulation in Section 6 in our article. 

In [22]:
class SolveNewsvendor():
    def __init__(self, params_np):
        super(SolveNewsvendor, self).__init__()
            
        n_items = len(params_np['c'])
        self.n_items = n_items    
            
        # Numpy parameters for Gomory cuts
        self.cost_vector = np.hstack((params_np['c'], params_np['cs'], params_np['cw']))
        self.price_ineq_np = np.hstack((params_np['pr'], np.zeros(2*n_items)) )
        self.size_ineq_np = np.hstack((params_np['si'], np.zeros(2*n_items)))
        
        
        
    def milp_formulation(self, y_I):
        """
        Formulate the MILP to be used in "solve_milp"
        We can solve the continuous version (LP) just passing relax = True in "solve_milp"
        The MILP is formulated based on an outcome chunk y_I.
        """
        m = Model("milp")
        m.verbose = 0

        n_items_range = range(n_items)
        n_variables = range(3*n_items)
        
        z = ([m.add_var(var_type=INTEGER) for i in range(0, n_items)] 
             + [m.add_var(var_type=CONTINUOUS) for i in range(n_items, 3*n_items)])

        # linear objective function
        m.objective = minimize(xsum(self.cost_vector[i] * z[i] for i in n_variables))

        # all variables greater than zero
        for i in n_variables:
            m += -z[i] <= 0
        # constraints on shortage variables
        for i in range(0, n_items):
            m += -z[i] - z[i + n_items] <= -y_I[i]
        # constraints on excess variables
        for i in range(0, n_items):
            m += z[i] - z[i + 2*n_items] <= y_I[i]
        # constraints on budget
        m += xsum(self.price_ineq_np[i] * z[i] for i in n_variables) <= params_np['B']
        # constraints on size
        #m += xsum(self.size_ineq_np[i] * z[i] for i in n_variables) <= params_np['S']
        
        return m
    
    
    def solve_milp(self, y, relax):
        """
        The function solves the problem for each y_I in a batch
        and return the argmin result for the whole batch
        """
        y = y.detach().numpy()
        n_batches, n_items = y.shape
        
        argmin = []
        
        for j in range(0, n_batches):
            y_I = y[j]
            m = self.milp_formulation(y_I)
            status = m.optimize(relax = relax)
        
            argmin_sample = []
            for v in m.vars:
                argmin_sample.append(v.x)
            argmin.append(argmin_sample[:n_items])
             
        return torch.tensor(argmin)

In [23]:
# Setting the Budget (sum of prices) and Capacity (sum of sizes) upperbound (constraints)
params_np['B'] = (params_np['pr'].sum()*y_test.median().item()*0.6).astype('float32')
#params_np['S'] = (params_np['si'].sum()*y_test.median().item()*0.6).astype('float32')

params_t['B'] = torch.tensor([params_np['B']])
#params_t['S'] = torch.tensor([params_np['S']])

print('Example of price constant and budget constraints:', params_np['pr'][:2], round(params_t['B'].item(), 2))

# Construct the solver
newsvendor_solve = SolveNewsvendor(params_np)

Example of price constant and budget constraints: [298. 273.] 5885.36


### 2.1 Adding a Budget and Capacity constraints - Decoupled approach

In [24]:
def argmin_solver(y_pred):
    z_star = newsvendor_solve.solve_milp(y_pred, relax=True) # Solve the continuous version
    return z_star

***
$\mathbf{\text{Defining the solver as a function of the prediction (with constraints)}}$<br>
***

\begin{align}
z^*(\hat{y_I}) = & argmin_z \text{ } f(z,\hat{y_I}) \\
\text{Subject to } & \begin{cases}
pr^Tz \leq B \\
si^Tz \leq S \\
z \geq 0
\end{cases}
\end{align}

##### Now the function "argmin_solver" is replaced to a linear programming (relaxation of a MILP). This solver does not allow backpropagation in principle, so we will solve first in a decoupled manner.

##### The predictions are already done in 1.1 using MSE loss, so we leverage the same predictions

***
$\mathbf{\text{Compute the final cost function (average through days) based on made decisions}}$<br>
***

\begin{align}
PredictedCost = \frac{1}{N_{days}} \sum_{I=1}^{N_{days}} f(z^*(\hat{y_I}),y_I) 
\end{align}

In [25]:
y_pred = h(X_test)
pred_cost_21 = cost_fn(y_pred, y_test)
print('Final cost on Test Data:', round(pred_cost_21.item(), 2))

Final cost on Test Data: 5066.86


### 2.2 Adding a Budget and Capacity constraints - Combined state-of-the-art approach

##### This class formulates a Quadratic Programming using the library QPTH from Amos and Kolter (2017). We formulate the LP now as a QP (addinga penalty quadratic term) proposed by Wilder et al. (2019)

In [26]:
class SolveNewsvendorWithKKT():
    def __init__(self, params_t):
        super(SolveNewsvendorWithKKT, self).__init__()
            
        n_items = len(params_t['c'])
        self.n_items = n_items    
            
        # Torch parameters for KKT         
        ident = torch.eye(n_items)
        ident3 = torch.eye(3*n_items)
        zeros_matrix = torch.zeros((n_items, n_items))
        zeros_array = torch.zeros(n_items)
        ones_array = torch.ones(n_items)
             
        self.Q = torch.diag(torch.hstack((params_t['q'], params_t['qs'], params_t['qw']))).to(dev)
        self.lin = torch.hstack((params_t['c'], params_t['cs'], params_t['cw'])).to(dev)
             
        shortage_ineq = torch.hstack((-ident, -ident, zeros_matrix))
        excess_ineq = torch.hstack((ident, zeros_matrix, -ident))
        price_ineq = torch.hstack((params_t['pr'], zeros_array, zeros_array))
        #size_ineq = torch.hstack((params_t['si'], zeros_array, zeros_array))
        positive_ineq = -ident3
        
        #self.ineqs = torch.vstack((shortage_ineq, excess_ineq, price_ineq, size_ineq, positive_ineq)).to(dev)
        self.ineqs = torch.vstack((shortage_ineq, excess_ineq, price_ineq, positive_ineq)).to(dev)

        self.uncert_bound = torch.hstack((-ones_array, ones_array)).to(dev)
        #self.determ_bound = torch.tensor([params_t['B'], params_t['S']]) 
        self.determ_bound = torch.tensor([params_t['B']]) 
        self.determ_bound = torch.hstack((self.determ_bound, 
                                          torch.zeros(n_items), 
                                          torch.zeros(n_items), 
                                          torch.zeros(n_items))).to(dev)
        
        
        
    def forward(self, y):
        """
        Applies the qpth solver for all batches and allows backpropagation.
        Formulation based on Priya L. Donti, Brandon Amos, J. Zico Kolter (2017).
        Note: The quadratic terms (Q) are used as auxiliar terms only to allow the backpropagation through the 
        qpth library from Amos and Kolter. 
        We will set them as a small percentage of the linear terms (Wilder, Ewing, Dilkina, Tambe, 2019)
        """
        
        n_batches, n_items = y.size()
        
        assert self.n_items == n_items 

        Q = self.Q
        Q = Q.expand(n_batches, Q.size(0), Q.size(1))

        ineqs = torch.unsqueeze(self.ineqs, dim=0)
        ineqs = ineqs.expand(n_batches, ineqs.shape[1], ineqs.shape[2])       

        uncert_bound = (self.uncert_bound*torch.hstack((y, y)))
        determ_bound = self.determ_bound.unsqueeze(dim=0).expand(
            n_batches, self.determ_bound.shape[0])
        bound = torch.hstack((uncert_bound, determ_bound))     
        
        e = torch.DoubleTensor().to(dev)
        
        argmin = QPFunction(verbose=-1)\
            (Q.double(), self.lin.double(), ineqs.double(), 
             bound.double(), e, e).double()
            
        return argmin[:,:n_items]

In [27]:
# Construct the solver
newsvendor_solve_kkt = SolveNewsvendorWithKKT(params_t)

In [28]:
# Combined approach (mse loss)
hc = ANN(n_feat=dx)
opt_hc = torch.optim.Adam(hc.parameters(), lr=0.002)

***
$\mathbf{\text{Defining the solver as a function of the prediction (with constraints)}}$<br>
***

\begin{align}
z^*(\hat{y_I}) = & argmin_z \text{ } f(z,\hat{y_I}) \\
\text{Subject to } & \begin{cases}
pr^Tz \leq B \\
si^Tz \leq S \\
z \geq 0
\end{cases}
\end{align}

##### Now the function "argmin_solver" is replaced to a quadratic programming that allows backpropagation, and we use the method proposed by Priya Donti et al. (2017) to solve the problem

In [29]:
def argmin_solver(y_pred):
    z_star = newsvendor_solve_kkt.forward(y_pred)
    return z_star

In [30]:
n_epochs = 20

obj_costs_com = []

for i in range(0, n_epochs):
    train_one_epoch(X_train, y_train, cost_fn, opt_hc, hc)
    
    f_train = validate_one_epoch(X_train, y_train, hc, data_train)
    f_val = validate_one_epoch(X_val, y_val, hc, data_val)

    print(
              'COMBINED: Train: ', 
               'f:', round(f_train.data.item(), 2), 
               '\tVal: ', 
               'f:', round(f_val.data.item(), 2))

torch.linalg.eig returns complex tensors of dtype cfloat or cdouble rather than real tensors mimicking complex tensors.
L, _ = torch.eig(A)
should be replaced with
L_complex = torch.linalg.eigvals(A)
and
L, V = torch.eig(A, eigenvectors=True)
should be replaced with
L_complex, V_complex = torch.linalg.eig(A) (Triggered internally at  ../aten/src/ATen/native/BatchLinearAlgebra.cpp:2910.)
  e, _ = torch.eig(Q[i])


COMBINED: Train:  f: 4917.06 	Val:  f: 4630.46
COMBINED: Train:  f: 4771.35 	Val:  f: 4361.93
COMBINED: Train:  f: 4562.19 	Val:  f: 4416.09
COMBINED: Train:  f: 4681.36 	Val:  f: 4261.32
COMBINED: Train:  f: 4608.07 	Val:  f: 4201.15
COMBINED: Train:  f: 4623.01 	Val:  f: 4189.35
COMBINED: Train:  f: 4542.1 	Val:  f: 4326.95
COMBINED: Train:  f: 4558.02 	Val:  f: 4106.14
COMBINED: Train:  f: 4623.71 	Val:  f: 4081.27
COMBINED: Train:  f: 4521.93 	Val:  f: 4159.04
COMBINED: Train:  f: 4510.92 	Val:  f: 4409.02
COMBINED: Train:  f: 4495.84 	Val:  f: 4356.24
COMBINED: Train:  f: 4495.28 	Val:  f: 4300.87
COMBINED: Train:  f: 4551.45 	Val:  f: 4273.75
COMBINED: Train:  f: 4378.54 	Val:  f: 4413.51
COMBINED: Train:  f: 4388.11 	Val:  f: 4323.83
COMBINED: Train:  f: 4494.41 	Val:  f: 4384.78
COMBINED: Train:  f: 4435.58 	Val:  f: 4241.33
COMBINED: Train:  f: 4438.42 	Val:  f: 4285.16
COMBINED: Train:  f: 4521.33 	Val:  f: 4212.64


***
$\mathbf{\text{Predicting all test outcomes (demand of sales)}}$<br>
***

\begin{align}
\hat{Y} = hc(X)
\end{align}

##### Note that $Y = [y_I]_1^{N_{days}}$

##### Also, note that now $hc$ was learned based on the cost function through the KKT differentiation, not on the MSE loss

In [31]:
y_pred = hc(X_test)

In [32]:
def argmin_solver(y_pred):
    z_star = newsvendor_solve.solve_milp(y_pred, relax=True)
    return z_star

***
$\mathbf{\text{Compute the final cost function (average through days) based on made decisions}}$<br>
***

\begin{align}
PredictedCost = \frac{1}{N_{days}} \sum_{I=1}^{N_{days}} f(z^*(\hat{y_I}),y_I) 
\end{align}

In [33]:
pred_cost_22 = cost_fn(y_pred, y_test)
print('Final cost on Test Data:', round(pred_cost_22.item(), 2))

Final cost on Test Data: 4858.59


***
$\mathbf{\text{Compute the final cost function (average through days) based on BEST decisions}}$<br>
***

\begin{align}
OptimalCost = \frac{1}{N_{days}} \sum_{I=1}^{N_{days}} f(z^*(y_I),y_I) 
\end{align}

In [34]:
best2_cost = cost_fn(y_test, y_test)
print('Best cost on Test Data:', round(best2_cost.item(), 2))

Best cost on Test Data: 3537.47


***
$\mathbf{\text{Compute the cumulative regrets from 2.1 and 2.2}}$<br>
***

\begin{align}
Regret = PredictedCost - OptimalCost
\end{align}

In [35]:
regret21 = pred_cost_21 - best2_cost
regret22 = pred_cost_22 - best2_cost

print('Cumulative regret: \n \
        2.1 -> {regret21} \n \
        2.2 -> {regret22} \
'.format(
    regret21=int(regret21),
    regret22=int(regret22)
))

Cumulative regret: 
         2.1 -> 1529 
         2.2 -> 1321 


***
$\mathbf{\text{Compute the normalized regret from 2.1 and 2.2}}$<br>
***

\begin{align}
Normalized Regret = \frac{Regret}{Optimal Cost} 
\end{align}

In [36]:
cr21 = regret21/best2_cost
cr22 = regret22/best2_cost

print('Normalized regret: \n \
        2.1 -> {cr21} \n \
        2.2 -> {cr22} \
'.format(
    cr21=cr21,
    cr22=cr22
))

Normalized regret: 
         2.1 -> 0.4323409038805651 
         2.2 -> 0.37346500397491594 


## 3 Adding integrality constraints

### 3.1 Adding integrality constraints - Decoupled Approach

#### By adding integrality constraints, we aim (at the final moment) to solve the same problem with the parameter relax = False  

***
$\mathbf{\text{Defining the solver as a function of the prediction (with constraints)}}$<br>
***

\begin{align}
z^*(\hat{y_I}) = & argmin_z \text{ } f(z,\hat{y_I}) \\
\text{Subject to } & \begin{cases}
pr^Tz \leq B \\
si^Tz \leq S \\
z \geq 0 \\
z \in \mathbb{Z}
\end{cases}
\end{align}

In [37]:
def argmin_solver(y_pred):
    z_star = newsvendor_solve.solve_milp(y_pred, relax=False)
    return z_star

##### The predictions are already done in 1.1 using MSE loss, so we leverage the same predictions

***
$\mathbf{\text{Compute the final cost function (average through days) based on made decisions}}$<br>
***

\begin{align}
PredictedCost = \frac{1}{N_{days}} \sum_{I=1}^{N_{days}} f(z^*(\hat{y_I}),y_I) 
\end{align}

In [38]:
y_pred = h(X_test)
pred_cost_31 = cost_fn(y_pred, y_test)
print('Final cost on Test Data:', round(pred_cost_31.item(), 2))

Final cost on Test Data: 5018.95


### 3.2 Adding integrality constraints - Combined Approach

#### For each step of the gradient descent we will solve the OP with relax = True after applying the Gomory cuts Aaron Ferber et al. (2019)

In [39]:
# Number of maximum cuts for each iteration of the learning
N_try_cuts = 100

In [40]:
class SolveNewsvendorWithGomoryAndKKT():
    def __init__(self, params_t, params_np):
        super(SolveNewsvendorWithGomoryAndKKT, self).__init__()
            
        n_items = len(params_np['c'])
        self.n_items = n_items    
            
        # Numpy parameters for Gomory cuts
        self.cost_vector = np.hstack((params_np['c'], params_np['cs'], params_np['cw']))
        self.price_ineq_np = np.hstack((params_np['pr'], np.zeros(2*n_items)) )
        self.size_ineq_np = np.hstack((params_np['si'], np.zeros(2*n_items)))
        
        # Torch parameters for KKT         
        ident = torch.eye(n_items)
        ident3 = torch.eye(3*n_items)
        zeros_matrix = torch.zeros((n_items, n_items))
        zeros_array = torch.zeros(n_items)
        ones_array = torch.ones(n_items)
             
        self.Q = torch.diag(torch.hstack((params_t['q'], params_t['qs'], params_t['qw']))).to(dev)
        self.lin = torch.hstack((params_t['c'], params_t['cs'], params_t['cw'])).to(dev)
             
        shortage_ineq = torch.hstack((-ident, -ident, zeros_matrix))
        excess_ineq = torch.hstack((ident, zeros_matrix, -ident))
        price_ineq = torch.hstack((params_t['pr'], zeros_array, zeros_array))
        #size_ineq = torch.hstack((params_t['si'], zeros_array, zeros_array))
        positive_ineq = -ident3
        
        #self.ineqs = torch.vstack((shortage_ineq, excess_ineq, price_ineq, size_ineq, positive_ineq)).to(dev)
        self.ineqs = torch.vstack((shortage_ineq, excess_ineq, price_ineq, positive_ineq)).to(dev)

        self.uncert_bound = torch.hstack((-ones_array, ones_array)).to(dev)
        #self.determ_bound = torch.tensor([params_t['B'], params_t['S']]) 
        self.determ_bound = torch.tensor([params_t['B']]) 
        self.determ_bound = torch.hstack((self.determ_bound, 
                                          torch.zeros(n_items), 
                                          torch.zeros(n_items), 
                                          torch.zeros(n_items))).to(dev)

        
    def milp_formulation(self, y_I):
        """
        Formulate the MILP to be used in "solve_milp" or in "forward_gomory_cut"
        The MILP is formulated based on an outcome chunk y_I.
        """
        m = Model("milp")
        m.verbose = 0

        n_items_range = range(n_items)
        n_variables = range(3*n_items)
        
        z = ([m.add_var(var_type=INTEGER) for i in range(0, n_items)] 
             + [m.add_var(var_type=CONTINUOUS) for i in range(n_items, 3*n_items)])

        # linear objective function
        m.objective = minimize(xsum(self.cost_vector[i] * z[i] for i in n_variables))

        # all variables greater than zero
        for i in n_variables:
            m += -z[i] <= 0
        # constraints on shortage variables
        for i in range(0, n_items):
            m += -z[i] - z[i + n_items] <= -y_I[i]
        # constraints on excess variables
        for i in range(0, n_items):
            m += z[i] - z[i + 2*n_items] <= y_I[i]
        # constraints on budget
        m += xsum(self.price_ineq_np[i] * z[i] for i in n_variables) <= params_np['B']
        # constraints on size
        #m += xsum(self.size_ineq_np[i] * z[i] for i in n_variables) <= params_np['S']
        
        return m
    
    
    def one_sample_cuts(self, y_I):
        """
        This function first formulates the MILP and then uses the MILP formulation to 
        compute and return some Gomory cuts (just for 1 instance of I (day))
        """
        m = self.milp_formulation(y_I)  
        n_cuts = 0

        for i in range(N_try_cuts):
            status = m.optimize(relax = True)
            cp = m.generate_cuts([CutType.GOMORY])

            if cp.cuts:
                if n_cuts + len(cp.cuts) <= N_try_cuts:
                    n_cuts = n_cuts + len(cp.cuts)
                    m += cp
                else:
                    break

            else:
                break


        matrix_ineq_cut = np.array([])
        array_ineq_bound = np.array([])
        for i in range(-n_cuts, 0):
            cut = m.constrs[-n_cuts:][i].expr.expr
            varslist = list(m.vars)
            dict_vars_init = dict(zip(varslist, len(varslist)*[0]))
            ineq_cut = {**dict_vars_init, **cut}
            values_ineq_cut = np.array(list(ineq_cut.values()))
            array_ineq_bound = np.append(array_ineq_bound, -m.constrs[i].expr.const)
            if i == -n_cuts:
                matrix_ineq_cut = values_ineq_cut
            else:
                matrix_ineq_cut = np.vstack((matrix_ineq_cut, values_ineq_cut))

        # Adding zeros to standardize ineq size
        if n_cuts == 0:
            matrix_ineq_cut = np.zeros((N_try_cuts-n_cuts, 3*n_items))
            array_ineq_bound = np.ones(N_try_cuts-n_cuts)

        elif n_cuts < N_try_cuts:
            zrsm = np.zeros((N_try_cuts-n_cuts, 3*n_items))
            zrsv = np.ones(N_try_cuts-n_cuts)
            matrix_ineq_cut = np.vstack((matrix_ineq_cut, zrsm))
            array_ineq_bound = np.hstack((array_ineq_bound, zrsv))

        else:
            pass

        array_ineq_bound = array_ineq_bound.reshape(-1)

        matrix_ineq_cut = np.expand_dims(matrix_ineq_cut, 0)
        array_ineq_bound = np.expand_dims(array_ineq_bound, 0)

        cut = np.dstack((matrix_ineq_cut, array_ineq_bound))

        return cut

    
    def forward_gomory_cut(self, y):
        """
        This function applies N_try_cuts in the relaxed version of the MILP
        formulation from "milp_formulation". The function returns the matrix "mic"
        and the vector "aic" for each "y_I". Those outputs represent the cuts in a
        form of inequalities mic*z<=aic. 
        Idea of Gomory cuts from Ferber, Wilder, Dilkina and Tambe (2019).
        """

        y = y.detach().numpy()
        n_batches, n_items = y.shape

        mic = np.array([])
        aib = np.array([])

        cuts = Parallel(n_jobs=-1, backend='multiprocessing')(delayed(self.one_sample_cuts)(y_I) for y_I in y)   
        cuts = np.vstack(cuts)
        mic = cuts[:,:,:-1]
        aic = cuts[:,:,-1]

        return torch.tensor(mic), torch.tensor(aic)
        
       
    def forward(self, y):
        """
        Applies the qpth solver for all batches and allows backpropagation.
        Formulation based on Priya L. Donti, Brandon Amos, J. Zico Kolter (2017).
        Note: The quadratic terms (Q) are used as auxiliar terms only to allow the backpropagation through the 
        qpth library from Amos and Kolter. 
        We will set them as a small percentage of the linear terms (Wilder, Ewing, Dilkina, Tambe, 2019)
        """
        
        n_batches, n_items = y.size()
        
        assert self.n_items == n_items 

        Q = self.Q
        Q = Q.expand(n_batches, Q.size(0), Q.size(1))

        ineqs = torch.unsqueeze(self.ineqs, dim=0)
        ineqs = ineqs.expand(n_batches, ineqs.shape[1], ineqs.shape[2])       

        uncert_bound = (self.uncert_bound*torch.hstack((y, y)))
        determ_bound = self.determ_bound.unsqueeze(dim=0).expand(
            n_batches, self.determ_bound.shape[0])
        bound = torch.hstack((uncert_bound, determ_bound))

        # Adding the Gomory cuts as constraints
        m, b = self.forward_gomory_cut(y)
        ineqs = torch.hstack((ineqs, m)).to(dev)
        bound = torch.hstack((bound, b)).to(dev)     
        
        e = torch.DoubleTensor().to(dev)
        
        argmin = QPFunction(verbose=-1)\
            (Q.double(), self.lin.double(), ineqs.double(), 
             bound.double(), e, e).double()
            
        return argmin[:,:n_items]

***
$\mathbf{\text{Defining the solver as a function of the prediction (with constraints)}}$<br>
***

\begin{align}
z^*(\hat{y_I}) = & argmin_z \text{ } f(z,\hat{y_I}) \\
\text{Subject to } & \begin{cases}
pr^Tz \leq B \\
si^Tz \leq S \\
z \geq 0
\end{cases}
\end{align}

##### Note that here we still use the relaxed version of the MILP, but considering the gomory cuts. The difference is that the integrality gap will be lower and hopefully the continuous results will becloser to theoptimal integer decision.

In [41]:
def argmin_solver(y_pred):
    z_star = newsvendor_solve_gomory_kkt.forward(y_pred)
    return z_star

In [42]:
# Combined approach (mse loss)
hc = ANN(n_feat=dx).to(dev)
opt_hc = torch.optim.Adam(hc.parameters(), lr=0.001)

In [43]:
# Construct the solver
newsvendor_solve_gomory_kkt = SolveNewsvendorWithGomoryAndKKT(params_t, params_np)

In [None]:
n_epochs = 20

obj_costs_com = []

for i in range(0, n_epochs):
    train_one_epoch(X_train, y_train, cost_fn, opt_hc, hc)
    
    f_train = validate_one_epoch(X_train, y_train, hc, data_train)
    f_val = validate_one_epoch(X_val, y_val, hc, data_val)

    print(
              'COMBINED: Train: ', 
               'f:', round(f_train.data.item(), 2), 
               '\tVal: ', 
               'f:', round(f_val.data.item(), 2))

COMBINED: Train:  f: 5028.69 	Val:  f: 4509.14
COMBINED: Train:  f: 4910.3 	Val:  f: 4641.87
COMBINED: Train:  f: 4810.45 	Val:  f: 4415.31
COMBINED: Train:  f: 4817.66 	Val:  f: 4573.06
COMBINED: Train:  f: 4684.58 	Val:  f: 4358.79
COMBINED: Train:  f: 4601.95 	Val:  f: 4327.43
COMBINED: Train:  f: 4726.61 	Val:  f: 4283.14
COMBINED: Train:  f: 4737.81 	Val:  f: 4526.96
COMBINED: Train:  f: 4625.46 	Val:  f: 4474.13
COMBINED: Train:  f: 4617.76 	Val:  f: 4117.05
COMBINED: Train:  f: 4690.02 	Val:  f: 4469.99
COMBINED: Train:  f: 4663.72 	Val:  f: 4118.3
COMBINED: Train:  f: 4620.8 	Val:  f: 4286.49
COMBINED: Train:  f: 4635.11 	Val:  f: 4476.6
COMBINED: Train:  f: 4704.66 	Val:  f: 4224.83
COMBINED: Train:  f: 4533.04 	Val:  f: 4082.09


In [None]:
y_pred = hc(X_test)

***
$\mathbf{\text{Defining the solver as a function of the prediction (with constraints)}}$<br>
***

\begin{align}
z^*(\hat{y_I}) = & argmin_z \text{ } f(z,\hat{y_I}) \\
\text{Subject to } & \begin{cases}
pr^Tz \leq B \\
si^Tz \leq S \\
z \geq 0 \\
z \in \mathbb{Z}
\end{cases}
\end{align}

In [None]:
def argmin_solver(y_pred):
    z_star = newsvendor_solve.solve_milp(y_pred, relax=False)
    return z_star

##### After training hc we need to compute the final decisions. And now the final decisions need to be integers (relax = False)

***
$\mathbf{\text{Compute the final cost function (average through days) based on made decisions}}$<br>
***

\begin{align}
PredictedCost = \frac{1}{N_{days}} \sum_{I=1}^{N_{days}} f(z^*(\hat{y_I}),y_I) 
\end{align}

In [None]:
pred_cost_32 = cost_fn(y_pred, y_test)
print('Final cost on Test Data:', round(pred_cost_32.item(), 2))

***
$\mathbf{\text{Compute the final cost function (average through days) based on BEST decisions}}$<br>
***

\begin{align}
OptimalCost=\frac{1}{N_{days}} \sum_{I=1}^{N_{days}} f(z^*(y_I),y_I) 
\end{align}

In [None]:
best3_cost = cost_fn(y_test, y_test)
print('Best cost on Test Data:', round(best3_cost.item(), 2))

***
$\mathbf{\text{Compute the cumulative regrets from 3.1 and 3.2}}$<br>
***

\begin{align}
Regret = PredictedCost-OptimalCost
\end{align}

In [None]:
regret31 = pred_cost_31 - best3_cost
regret32 = pred_cost_32 - best3_cost

print('Cumulative regret: \n \
        3.1 -> {regret31} \n \
        3.2 -> {regret32}  \
'.format(
    regret31=int(regret31),
    regret32=int(regret32),
))

***
$\mathbf{\text{Compute the normalized regret from 3.1 and 3.2}}$<br>
***

\begin{align}
Normalized Regret = \frac{Regret}{Optimal Cost} 
\end{align}

In [None]:
cr31 = regret31/best3_cost
cr32 = regret32/best3_cost

print('Normalized regret: \n \
        3.1 -> {cr31} \n \
        3.2 -> {cr32}  \
'.format(
    cr31=cr31,
    cr32=cr32
))

## Conclusion and ideas to practitioners:

#### 1. Perks of Decoupled Approaches -> Time saver
Decoupled approaches run much faster than combined approaches. Notice that we truncate the data to work only with 50 random items. This is because for state-of-the-art combined approaches we need to use Mathematical Programming techniques iteratively, and it takes too much time mainly if the Optimization Problem is too complex. 

#### 2. Perks of Combined Approaches -> Money saver
Comparing the final results, we normally have (we tested with many different parameters and we consistently had) lower regret using combined approaches. This is expected since the loss function is more aligned with the final goal.

#### 3. Mixing Decoupled and Combined approaches
We suggest the practitioner to try first updating the weight of the ANN using the MSE loss (1.1 training part). And then start the training using the KKT method (2.2 training part) loading the previous ways. This idea (not with KKT differentiation, but with other methods) was proposed by Mandi et al. (2020)

#### 4. Early Stopping
One can design the algorithm to stop based on the final operational cost (even though using decoupled approach). Notice that doing that requires to solve an Optimization Problem iteratively as well, but it does not require to solve the OP in every iteration. This technique will probably improve mainly the decoupled approaches results.