In [11]:
import pandas as pd
import numpy as np
import tensorflow as tf
import random
import json
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError

In [327]:
class environment:
    def __init__(self, dataset, time_steps = 288, total_bandwidth = 10000, num_users = 10, CIR = 1000):
        # Load dataset from CSV
        # Convert the 'Date' column to datetime format
        self.dataset = pd.read_csv(dataset)
        self.time_steps = time_steps  # Each row represents a time step
        self.num_users = num_users  # Set to 10 users
        self.total_bandwidth = total_bandwidth  # System capacity (10 Mbps)
        self.CIR = CIR  # Minimum bandwidth guarantee (1 Mbps or 1000 Kbps)

        # Initialize the environment's state and variables
        self.reset()
        self.state_size = 3

    def reset(self):
        """
        Reset the environment to the initial state and perform the initial allocation.
        """
        self.time_step = 0
        self.remaining_bandwidth = self.total_bandwidth
        self.allocated_bandwidth = [0] * self.num_users  # Start with no allocated bandwidth

        # Extract the initial requested bandwidth for all users from the dataset
        self.requested_bandwidth = self._get_requested_bandwidth(self.time_step)

        # Phase 1: Initial allocation
        self.initial_allocation()

        # Phase 2: MIRs set as initial allocation
        self.MIRs = self.allocated_bandwidth[:]

        # State: [MIRs, Requested Bandwidths, Allocated Bandwidths, Time of Day]
        self.state = self._get_state()
        return self.state

    def _get_requested_bandwidth(self, time_step):
        """
        Get the requested bandwidth for all users at the given time step from the dataset.

        Parameters:
        - time_step: The current time step in the simulation.

        Returns:
        - requested_bandwidths: A list of bandwidth requests for the current time step for all users.
        """
        a = time_step
        requested_bandwidths = []
        # Retrieve the requested bandwidth for each user from the dataset
        for _ in range(self.num_users) :
            requested_bandwidths.append(self.dataset.iloc[a][-1])
            a = a + self.time_steps

        return requested_bandwidths

    def _get_state(self):
        """
        Get the current state of the environment.

        Returns:
        - state: A list containing the state for all users.
        """
        state = (self.MIRs, self.requested_bandwidth, self.allocated_bandwidth)
        return state

    def initial_allocation(self):
        """
        Perform the initial allocation strategy (Phase 1) for all users.
        Ensure each user gets at least their CIR (or exactly their request if lower).
        """
        total_allocated = 0

        for i in range(self.num_users):
            if self.requested_bandwidth[i] >= self.CIR:
                allocated_bandwidth = self.CIR  # Allocate CIR if requested >= CIR
            else:
                allocated_bandwidth = self.requested_bandwidth[i]  # Allocate exact request if < CIR

            self.allocated_bandwidth[i] = allocated_bandwidth
            total_allocated += allocated_bandwidth

        self.remaining_bandwidth = self.total_bandwidth - total_allocated  # Update remaining bandwidth

    def step(self, actions):
        """
        Take actions to adjust the MIR for all users.

        Parameters:
        - actions: A list of actions for adjusting MIR for each user.

        Returns:
        - new_state: The updated state after the actions.
        - done: Whether the episode has finished.
        """
        # Adjust MIR based on the actions for each user
        """
        Adjust MIRs and allocate the remaining bandwidth (Phase 2).
        """
        # Phase 2: Adjust MIRs based on RL agent's actions
        for i in range(self.num_users):
            self.MIRs[i] = max(self.CIR, self.MIRs[i] + actions[i])  # Adjust MIR, ensure it's >= CIR

        # Allocate remaining bandwidth based on the updated MIRs
        total_allocated = [self.allocate_bandwidth(i, self.requested_bandwidth[i], self.MIRs[i]) for i in range(self.num_users)]

        # Move to the next time step
        self.time_step += 1
        done = self.time_step >= self.time_steps

        # Get the updated state
        new_state = self._get_state()

        # Calculate rewards based on the requested, allocated, and MIR values
        reward = self.calculate_rewards()

        # Calculate the average allocation ratio
        average_allocation_ratio = self.calculate_allocation_ratio()
        print(f"Time Step: {self.time_step}, Average Allocation Ratio: {average_allocation_ratio}, rewards: {reward}")

        # Update the requested bandwidths for the next step
        if not done:
            self.requested_bandwidth = self._get_requested_bandwidth(self.time_step)

        return new_state, reward, done, total_allocated, self.MIRs, average_allocation_ratio  # Returning


    def allocate_bandwidth(self, user_index, request_bandwidth, MIR):
        """
        Allocate bandwidth for a specific user based on their MIR and requested bandwidth.

        Parameters:
        - user_index: The user index to allocate bandwidth for.
        - request_bandwidth: The requested bandwidth for the user.
        - MIR: The maximum information rate for the user.

        Returns:
        - final_allocation: The final allocated bandwidth for the user.
        """
        # Initial allocated bandwidth from Phase 1
        initial_allocation = self.allocated_bandwidth[user_index]

        # Calculate the remaining requested bandwidth
        remaining_requested = request_bandwidth - initial_allocation

        # Calculate the potential new allocation
        potential_allocation = initial_allocation + min(remaining_requested, MIR - initial_allocation)

        # Ensure that the total allocated bandwidth does not exceed 10,000 Kbps
        total_allocated = sum(self.allocated_bandwidth) + potential_allocation - initial_allocation

        # Update the allocated bandwidth for the user
        self.allocated_bandwidth[user_index] = potential_allocation
        return potential_allocation


    def calculate_allocation_ratio(self):
        """
        Calculate the average allocation ratio across all users.
        """
        total_ratio = 0
        for i in range(self.num_users):
            if self.requested_bandwidth[i] >= self.MIRs[i]:
                ratio = self.MIRs[i] / self.requested_bandwidth[i]
            else:
                ratio = 1.0  # Fully satisfied

            total_ratio += ratio

        # Calculate average ratio
        sum_bandwidth_requested = sum(self.requested_bandwidth)
        average_allocation_ratio = total_ratio / sum_bandwidth_requested
        return average_allocation_ratio


    def calculate_rewards(self):
        R_efficiency = 0.0
        abuse_counters = [0] * len(self.requested_bandwidth)
        penalty_coefficient = -0.5
        total_abuse_score = 0
        min_abuse_duration = 3
        theta = 0.2

        # Calculate R Efficiency and detect abuse
        for i in range(len(self.requested_bandwidth)):
            if self.requested_bandwidth[i] < self.MIRs[i]:
                R_efficiency += 1
            elif self.allocated_bandwidth[i] >= self.MIRs[i]:
                R_efficiency += self.MIRs[i] / self.requested_bandwidth[i]

            # Detect abusive behavior
            if self.requested_bandwidth[i] > self.MIRs[i] * (1 + theta):
                # Increment abuse counter
                abuse_counters[i] += 1
            else:
                # Reset abuse counter
                abuse_counters[i] = 0

            # If abuse counter reaches the minimum duration, count it as an abuse event
            if abuse_counters[i] >= min_abuse_duration:
                total_abuse_score += 1

        # Calculate P Over
        P_over = sum(max(0, requested - allocated) for requested, allocated in zip(self.requested_bandwidth, self.allocated_bandwidth))

        # Normalization for P_abusive
        # Use the number of users as the total possible abuse count to normalize
        possible_abuse_events = len(self.requested_bandwidth)
        P_abusive = penalty_coefficient * (total_abuse_score / possible_abuse_events) if possible_abuse_events > 0 else 0

        # Define weights for penalties
        alpha = 1.0  # Weight for P_over
        beta = 1.0   # Weight for P_abusive

        # Calculate total reward
        reward = R_efficiency - (alpha * P_over) - (beta * P_abusive)

        return reward

In [355]:
class ReinforceLearning :
    def __init__(self, env, nb_episodes, alpha, gamma, epsilon) :
        self.env= env
        self.nb_episodes = nb_episodes
        self.alpha= alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.model = self.PolicyNetwork(self.env)
        self.optimizer = Adam(learning_rate= self.alpha)
        self.rewards_per_episode = []
        self.allocated_bandwidth_all = []  # To store allocated_bandwidth across episodes
        self.mirs_all = []  # To store MIRs across episodes


    class PolicyNetwork(tf.keras.Model) :
        def __init__(self, env) :
            super(ReinforceLearning.PolicyNetwork, self).__init__()  # Proper inheritance
            self.dense1= Dense(units= 24, activation = 'relu')
            self.dense2 = Dense(units= 24, activation= 'relu')
            self.logits = Dense(units=env.num_users)

        def __call__(self, state) :
            state = tf.convert_to_tensor(state, dtype= tf.float32)
            state = tf.expand_dims(state, axis= 0)
            # state = tf.reshape(state, [1, -1]) #Reshape the state
            x = self.dense1(state)
            x = self.dense2(x)
            return self.logits(x)
    """

    """
    def choose_action(self, state) :
        random_number = np.random.rand()
        if random_number < self.epsilon : # if the epsilon is still high
            return np.random.uniform(-100, 100, size= (self.env.num_users, ))
        else :
            # pass the state vector as an input to the model and return the output
            # state = tf.convert_to_tensor(state, dtype= tf.float32)
            logits = self.model(state[0])

            # choose the action with the highest probability
            return logits.numpy().flatten()
    """
    def choose_action(self, state):
        # Extraire les variables de l'état (par exemple, MIR, requested_bandwidth, allocated_bandwidth)
        mir = state[0]  # liste de MIR
        requested_bandwidth = state[1]  # liste des bandwidth demandés
        allocated_bandwidth = state[2]  # liste des bandwidth alloués

        # Taille du vecteur d'actions doit être égale à celle de MIR ou des autres variables d'état
        action_size = len(mir)

        random_number = np.random.rand()

        if random_number < self.epsilon:  # Exploration : choisir une action aléatoire
            return np.random.uniform(-50, 50, size=(action_size,))

        else:  # Exploitation : choisir une action basée sur MIR
            # Utiliser une règle simple pour calculer l'action en fonction de MIR
            # Par exemple : multiplier chaque valeur de MIR par un facteur pour obtenir les actions
            actions = np.array(mir) * 0.1  # Exemple : règle simple où l'action est proportionnelle à MIR

            # Limiter les actions à une certaine plage de valeurs [-100, 100]
            actions = np.clip(actions, -40, 40)

            return actions
    """

    def compute_returns(self, rewards) :
        returns = []
        discounted_sum = 0
        for i, r in enumerate(reversed(rewards)) :
            # discounted_sum *= r + (self.gamma) ** (i)
            discounted_sum = r + self.gamma * discounted_sum
            returns.insert(0, discounted_sum)
        return returns

    def train_agent(self) :
        for episode in range(self.nb_episodes) :
            print(f"\nEpisode : {episode}")
            current_state = self.env.reset()
            rewards = []
            states = []
            allocated_bandwidth = []
            mirs = []
            done = False
            while not done :
                # choose an action with epsilon-greedy approach
                action = self.choose_action(current_state)
                # apply the action in the environment and observ the next state, rawards
                next_state, reward, done, allocated_bandwidth_ep, mir_ep, average_allocation_ratio = self.env.step(action)
                # save the state, action
                states.append(current_state)
                rewards.append(reward)
                allocated_bandwidth.append(allocated_bandwidth_ep)
                mirs.append(mir_ep)
                # update the state
                current_state = next_state

            # Collecting allocated_bandwidth and MIRs for all episodes
            self.allocated_bandwidth_all.append(allocated_bandwidth)
            self.mirs_all.append(mirs)

            # calculer les retours G_t
            returns = self.compute_returns(rewards)
            total_rewards = sum(rewards)
            self.rewards_per_episode.append(total_rewards)

            if episode > 10000 :
                self.epsilon = self.epsilon * 0.995

            if average_allocation_ratio >= 0.9 :
                best_allocated = average_allocation_ratio
                break

        # Once all episodes are done, save to JSON files
        self.save_to_json()

        # update the policy
        with tf.GradientTape() as tape:
            loss = 0
            for i in range(len(states)):
                state = tf.convert_to_tensor(states[i], dtype=tf.float32)
                logits = self.model(state)

                # Calcul des probabilités d'action
                action_proba = tf.nn.softmax(logits)

                # Représentation one-hot de l'action, il faut s'assurer que la forme correspond
                action_one_hot = tf.one_hot(actions[i], self.env.num_users)

                # Ajuster la forme de action_one_hot pour qu'elle corresponde à celle de action_proba
                action_one_hot = tf.expand_dims(action_one_hot, axis=0)  # Ajouter une dimension batch

                # Multiplier les deux tenseurs
                selected_action_proba = tf.reduce_sum(action_proba * action_one_hot)

                # Calcul de la perte (log-likelihood)
                loss -= tf.math.log(selected_action_proba) * returns[i]
        # calculate the gradients
        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

    import json

def save_to_json(self):
    # Process allocated_bandwidth and MIRs
    allocated_bandwidth_processed = [[float(x) for sublist in episode for x in sublist]
                                     for episode in self.allocated_bandwidth_all]
    mirs_processed = [[float(x) for sublist in episode for x in sublist]
                      for episode in self.mirs_all]

    # Create a list of dictionaries for allocated bandwidth with column names
    allocated_bandwidth_data = [
        {f"Allocated Bandwidth_{i}": allocated for i, allocated in enumerate(episode)}
        for episode in allocated_bandwidth_processed
    ]

    # Create a list of dictionaries for MIRs with column names
    mirs_data = [
        {f"MIR_{i}": mir for i, mir in enumerate(episode)}
        for episode in mirs_processed
    ]

    # Save to JSON files after training
    with open('allocated_bandwidth_all.json', 'w') as f_ab:
        json.dump(allocated_bandwidth_data, f_ab, indent=4)

    with open('mirs_all.json', 'w') as f_mirs:
        json.dump(mirs_data, f_mirs, indent=4)


In [356]:
env = environment(dataset= '/content/optim_train_set.csv')

  requested_bandwidths.append(self.dataset.iloc[a][-1])


In [357]:
model2 = ReinforceLearning(env, nb_episodes= 27000, alpha= 0.001, gamma= 0.99, epsilon= 1)

In [358]:
actions = model2.choose_action(env.reset())
print(actions)

[-83.51790736 -73.14964932  33.56913542 -50.40422208 -74.69865415
 -55.04932143 -72.04937887  21.414751   -61.92517268 -28.87795699]


  requested_bandwidths.append(self.dataset.iloc[a][-1])


In [359]:
print(env.allocated_bandwidth)

[30.601, 0.0, 12.507, 303.92, 221.766, 280.439, 237.142, 1000, 1000, 1000]


In [360]:
print(env.MIRs)

[30.601, 0.0, 12.507, 303.92, 221.766, 280.439, 237.142, 1000, 1000, 1000]


In [361]:
print(env.requested_bandwidth)

[30.601, 0.0, 12.507, 303.92, 221.766, 280.439, 237.142, 1173.099, 1515.214, 1500.0]


In [None]:
model2.train_agent()


Episode : 0
Time Step: 1, Average Allocation Ratio: 0.001764817571244211, rewards: -1010.8019474694566
Time Step: 2, Average Allocation Ratio: 0.00150362568372944, rewards: -1450.955373047711
Time Step: 3, Average Allocation Ratio: 0.0019592618123050153, rewards: -490.3333333333333
Time Step: 4, Average Allocation Ratio: 0.0014517933473221756, rewards: -1948.7279297177267
Time Step: 5, Average Allocation Ratio: 0.001853627410110429, rewards: -609.6533834318515
Time Step: 6, Average Allocation Ratio: 0.000903326153263951, rewards: -4813.828362952324
Time Step: 7, Average Allocation Ratio: 0.001958691362030286, rewards: -588.3422892994522
Time Step: 8, Average Allocation Ratio: 0.002146182057829809, rewards: -405.80133712990175
Time Step: 9, Average Allocation Ratio: 0.0009481628099595171, rewards: -4525.506324748292
Time Step: 10, Average Allocation Ratio: 0.0019076254919191398, rewards: -637.3752154817224
Time Step: 11, Average Allocation Ratio: 0.0010959077004913673, rewards: -3321.4

  requested_bandwidths.append(self.dataset.iloc[a][-1])


[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Time Step: 137, Average Allocation Ratio: 0.000512806102253945, rewards: -7200.353031087338
Time Step: 138, Average Allocation Ratio: 0.00027364881463582633, rewards: -16572.827380160088
Time Step: 139, Average Allocation Ratio: 0.0005779454987323119, rewards: -6481.187858435605
Time Step: 140, Average Allocation Ratio: 0.0003815432431798252, rewards: -10681.43279209482
Time Step: 141, Average Allocation Ratio: 0.0004033780280294006, rewards: -11119.873349187572
Time Step: 142, Average Allocation Ratio: 0.000591199860665452, rewards: -5426.2415863117285
Time Step: 143, Average Allocation Ratio: 0.00023703351816783198, rewards: -18501.821203243337
Time Step: 144, Average Allocation Ratio: 0.00046454000713101225, rewards: -9106.762366934141
Time Step: 145, Average Allocation Ratio: 0.0005926265238224443, rewards: -5624.067001184586
Time Step: 146, Average Allocation Ratio: 0.00038718893695037184, 