In [165]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import random

In [166]:
dataset = pd.read_csv("dataset/SampleData.csv")
dataset.head(12)

Unnamed: 0,Ingredient,Product,QMerged_label,Real_Cost,V_0
0,1,1,11,10,0
1,1,2,12,6,0
2,2,1,21,8,0
3,2,2,22,11,0
4,3,1,31,3,0
5,3,2,32,7,0
6,4,1,41,8,0
7,4,2,42,5,0
8,4,3,43,1,0


In [168]:
class DQNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(DQNetwork, self).__init__()
        self.layer1 = nn.Linear(state_size, 64)
        self.layer2 = nn.Linear(64,action_size)


    def forward(self, state):
        x = torch.relu(self.layer1(state))
        return self.layer2(x)
    
class MonteCarloDQN:
    def __init__(self, dataset, num_episodes, epsilon, budget, reward, alpha):
        self.state_space = list(set(dataset['Ingredient']))
        self.alpha = alpha
        self.epsilon = epsilon
        self.budget = budget
        self.reward = reward
        self.num_episodes = num_episodes
        self.q_network = DQNetwork(len(self.state_space), len(self.state_space))
        self.target_network = DQNetwork(len(self.state_space), len(self.state_space))
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.optimizer = torch.optim.Adam(self.q_network.parameters(), lr=alpha)
        self.criterion = nn.MSELoss()

    def run(self, dataset):
        v0 = dataset['V_0']
        dataset['V'] = v0
        output = []
        output1 = []
        output2 = []
        action_in_full = []

        for episode in range(self.num_episodes):
            if episode == 0:
                episode_run = np.random.randint(low=1, high=len(self.state_space) + 1, size=len(self.state_space))
                print(f'Episode Run: {episode_run}')
            else:
                state = torch.Tensor(np.zeros(len(self.state_space)))  # Initial state is all zeros
                episode_run = []
                for t in range(len(self.state_space)):
                    if random.random() < self.epsilon:
                        action = random.randint(0, len(self.state_space) - 1)
                    else:
                        q_values = self.q_network.forward(state.unsqueeze(0))
                        action = torch.argmax(q_values).item()
                    episode_run.append(action)
                    next_state = torch.Tensor(np.zeros(len(self.state_space)))  # Next state is all zeros

                    merged_label = (self.state_space[t] * 10 + action).astype(float)
                    merged_rewards = dataset[dataset['QMerged_label'] == merged_label]['Reward']
                    reward = self.reward if merged_rewards.empty else merged_rewards.iloc[0]

                    # reward calculation
                    if self.budget >= dataset[dataset['QMerged_label'] == merged_label]['Real_Cost'].sum():
                        terminal_reward = 1
                    else:
                        terminal_reward = -1

                    q_values = self.q_network.forward(state.unsqueeze(0))
                    next_q_values = self.target_network.forward(next_state.unsqueeze(0))
                    target = q_values.clone()
                    target[0, action] = reward + self.alpha * torch.max(next_q_values)

                    self.optimizer.zero_grad()
                    loss = self.criterion(q_values, target.detach())
                    loss.backward()
                    self.optimizer.step()

                    state = next_state

            episode_df = pd.DataFrame({'Ingredient': self.state_space, 'Product': episode_run})
            episode_df['Merged_label'] = (episode_df['Ingredient'] * 10 + episode_df['Product']).astype(float)
            dataset['QMerged_label'] = dataset['QMerged_label'].astype(float)
            dataset['Reward'] = self.reward

            episode2 = episode_df.merge(dataset[['QMerged_label', 'Real_Cost', 'Reward']],
                                        left_on='Merged_label', right_on='QMerged_label', how='inner')

            dataset = dataset.drop(columns='Reward')

            if self.budget >= episode2['Real_Cost'].sum():
                return_value = 1
            else:
                return_value = -1

            episode2 = episode2.drop(columns='Reward')
            episode2['Return'] = return_value
            dataset = dataset.merge(episode2[['Merged_label', 'Return']], left_on='QMerged_label',
                                    right_on='Merged_label', how='outer')
            dataset['Return'] = dataset['Return'].fillna(0)

            for v in range(len(dataset)):
                if dataset.iloc[v, 7] == 0:
                    dataset.iloc[v, 5] = dataset.iloc[v, 5]
                else:
                    dataset.iloc[v, 5] = dataset.iloc[v, 5] + self.alpha * (
                            (dataset.iloc[v, 7] / len(self.state_space)) - dataset.iloc[v, 5])

            dataset = dataset.drop(columns='Merged_label')
            dataset = dataset.drop(columns='Return')

            output = np.append(output, dataset.iloc[:, -1].sum())
            output1 = np.append(output1, dataset.iloc[[1, 2, 4, 8], -1].sum())
            output2 = np.append(output2, dataset.iloc[[0, 3, 5, 6, 7], -1].sum())

            action = pd.DataFrame(dataset.groupby('Ingredient')['V'].max())
            action2 = action.merge(dataset, left_on='V', right_on='V', how='inner')
            action3 = action2.groupby('Ingredient')['Product'].apply(
                lambda x: x.iloc[np.random.randint(0, len(x))])

            action_in_full = np.append(action_in_full, action3)
            action_in_full = action_in_full.astype(int)

        return output, output1, output2, action3, dataset, action_in_full


In [173]:
alpha = 0.1
epsilon = 0.1
budget = 100
reward = [0,0,0,0,0,0,0,0,0]
num_episodes = 100

monte_carlo = MonteCarlo(dataset, num_episodes, epsilon, budget, reward, alpha)
# Run Monte Carlo simulation
output, output1, output2, optimal_actions, updated_data, actions_selected_in_time = monte_carlo.run(dataset)

# Print the results
print("Sum of V(a) for all episodes:", output)
print("Sum of V(a) for the cheapest actions:", output1)
print("Sum of V(a) for the rest of the actions:", output2)
print("Optimal actions:", optimal_actions)
print("Updated dataset:", updated_data)
print("Actions selected in each episode:", actions_selected_in_time)

Episode Run: [3 3 2 2]


Sum of V(a) for all episodes: [0.05       0.145      0.2305     0.30745    0.376705   0.4390345
 0.49513105 0.54561795 0.59105615 0.63195054 0.66875548 0.70187993
 0.73169194 0.75852275 0.78267047 0.80440342 0.82396308 0.84156677
 0.8574101  0.87166909 0.88450218 0.89605196 0.90644676 0.93858635
 0.94722772 0.95500495 0.96200445 0.96830401 0.97397361 0.97907625
 0.98366862 0.98780176 0.99152158 0.99486942 0.99788248 1.00059423
 1.00303481 1.00523133 1.0072082  1.00898738 1.01058864 1.01202978
 1.0133268  1.01449412 1.01554471 1.01649024 1.01734121 1.01810709
 1.01879638 1.01941674 1.01997507 1.02047756 1.02092981 1.02133683
 1.02170314 1.02203283 1.02232955 1.04502812 1.04527531 1.04549778
 1.045698   1.07083777 1.09120906 1.09136316 1.09150184 1.09162666
 1.09173899 1.09184009 1.09193108 1.1102141  1.11029019 1.11035867
 1.12680346 1.12686087 1.12691253 1.12695903 1.12700088 1.12703854
 1.12707243 1.12710294 1.16437733 1.16440357 1.16442719 1.16444844
 1.16446758 1.16448479 1.16450029