In [5]:
import numpy as np
import random
from collections import defaultdict
import simpy
import time

class CloudEnvironment:
    def __init__(self, num_vms, max_vm_capacity):
        self.num_vms = num_vms
        self.max_vm_capacity = max_vm_capacity
        self.action_space = [0, 1]  # Actions: 0 = Do Nothing, 1 = Provision VM
        self.state_space = range(num_vms)  # States: Number of active VMs
        self.q_table = defaultdict(lambda: [0, 0])  # Q-table {state: [Q-value(action 0), Q-value(action 1)]}
        self.epsilon = 0.1  # Epsilon-greedy exploration parameter
        self.alpha = 0.1  # Learning rate
        self.gamma = 0.9  # Discount factor
        self.env = simpy.Environment()

    def step(self, state):
        if random.uniform(0, 1) < self.epsilon:
            action = np.random.choice(self.action_space)  # Explore action space
        else:
            action = np.argmax(self.q_table[state])  # Exploit learned values

        if action == 1:  # Provision VM
            if state < self.num_vms:
                next_state = state + 1
                reward = -1 if next_state == self.num_vms else 0
            else:
                next_state = state
                reward = -1
        else:  # Do Nothing
            next_state = state
            reward = 0

        return next_state, reward

    def train(self, num_episodes):
        for episode in range(num_episodes):
            state = 0  # Initial state
            total_reward = 0
            while state < self.num_vms:
                next_state, reward = self.step(state)
                total_reward += reward
                old_q_value = self.q_table[state][action]
                next_max = np.max(self.q_table[next_state])
                new_q_value = old_q_value + self.alpha * (reward + self.gamma * next_max - old_q_value)
                self.q_table[state][action] = new_q_value
                state = next_state

    def run_simulation(self, num_episodes):
        self.train(num_episodes)
        print("Q-table:", self.q_table)

    num_vms = 5
    max_vm_capacity = 10
    num_episodes = 1000
    env = CloudEnvironment(num_vms, max_vm_capacity)
    env.run_simulation(num_episodes)

Q-table: defaultdict(<function CloudEnvironment.__init__.<locals>.<lambda> at 0x00000135197E3490>, {0: array([-0.00687474, -0.00687474]), 1: array([-0.02898245, -0.02898245]), 2: array([-0.06095559, -0.06095559]), 3: array([-0.15210512, -0.15210512]), 4: array([-0.30442268, -0.30442268]), 5: [0, 0]})


In [4]:
import numpy as np
import random
from collections import defaultdict

class CloudEnvironment:
    def __init__(self, num_vms, max_vm_capacity):
        self.num_vms = num_vms
        self.max_vm_capacity = max_vm_capacity
        self.action_space = [0, 1]  # Actions: 0 = Do Nothing, 1 = Provision VM
        self.state_space = range(num_vms)  # States: Number of active VMs
        self.q_table = defaultdict(lambda: [0, 0])  # Q-table {state: [Q-value(action 0), Q-value(action 1)]}
        self.epsilon = 0.1  # Epsilon-greedy exploration parameter
        self.alpha = 0.1  # Learning rate
        self.gamma = 0.9  # Discount factor

    def step(self, state):
        if random.uniform(0, 1) < self.epsilon:
            action = np.random.choice(self.action_space)  # Explore action space
        else:
            action = np.argmax(self.q_table[state])  # Exploit learned values

        if action == 1:  # Provision VM
            if state < self.num_vms:
                next_state = state + 1
                reward = -1 if next_state == self.num_vms else 0
            else:
                next_state = state
                reward = -1
        else:  # Do Nothing
            next_state = state
            reward = 0

        return next_state, reward

    def train(self, num_episodes):
        for episode in range(num_episodes):
            state = 0  # Initial state
            total_reward = 0
            while state < self.num_vms:
                next_state, reward = self.step(state)
                total_reward += reward
                old_q_value = self.q_table[state]
                next_max = np.max(self.q_table[next_state])
                new_q_value = old_q_value + self.alpha * (reward + self.gamma * next_max - old_q_value)
                self.q_table[state] = new_q_value
                state = next_state

    def run_simulation(self, num_episodes):
        self.train(num_episodes)
        print("Q-table:", self.q_table)

if __name__ == "__main__":
    num_vms = 5
    max_vm_capacity = 10
    num_episodes = 1000

    env = CloudEnvironment(num_vms, max_vm_capacity)
    env.run_simulation(num_episodes)


Q-table: defaultdict(<function CloudEnvironment.__init__.<locals>.<lambda> at 0x00000135097AE680>, {0: array([-0.00636383, -0.00636383]), 1: array([-0.02298679, -0.02298679]), 2: array([-0.06373399, -0.06373399]), 3: array([-0.14730003, -0.14730003]), 4: array([-0.49247477, -0.49247477]), 5: [0, 0]})
