# OpenVINO™ Device-Placement-Optimization-with-Reinforcement-Learning_final

In [None]:
import openvino as ov
import torch

def set_seed(seed):
    """Seed PyTorch's random number generators so that runs are repeatable.

    Args:
        seed: Integer seed passed to ``torch.manual_seed`` and, when CUDA is
            available, to ``torch.cuda.manual_seed_all``.
    """
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)

# Fix the PyTorch seed before any random tensors are created.
set_seed(42)

# OpenVINO runtime handle; later cells use it to list devices and read the model.
core = ov.Core()

In [None]:
def get_available_device(core):
    """Print and return the devices reported by an OpenVINO core.

    Args:
        core: Object exposing ``available_devices`` and
            ``get_property(device, name)`` (an ``ov.Core``).

    Returns:
        A ``(devices, number_of_device)`` tuple: the device list exactly as
        reported by ``core`` and how many entries it contains.
    """
    devices = core.available_devices
    for device in devices:
        full_name = core.get_property(device, "FULL_DEVICE_NAME")
        print(f"{device}: {full_name}")

    return devices, len(devices)

# `devices` is later indexed by the placement chosen by the policy.
devices,number_of_device = get_available_device(core)
print(devices)
# Replace the second entry with the last reported device.
# NOTE(review): presumably this picks a specific GPU on the author's machine;
# it raises IndexError when only one device is available - confirm the intent.
devices[1] = devices[-1]
print(devices)

Build a networkx graph from the model's serialized OpenVINO IR (XML) file.

In [None]:
import xml.etree.ElementTree as ET
import networkx as nx
import os
from pathlib import Path
import shutil


def xml2graph(xml):
    """Serialize an OpenVINO model to IR and rebuild its topology as a DiGraph.

    The model is written to ``./Model/resnet50_running.xml`` (the ``./Model/``
    directory is deleted and recreated on every call), then the XML is parsed:
    every ``<layer>`` becomes a node keyed by its ``id`` attribute and every
    ``<edge>`` becomes a directed edge ``from-layer -> to-layer``.

    Args:
        xml: The model object handed to ``ov.save_model`` (despite the name,
            this is a model, not a path).

    Returns:
        networkx.DiGraph whose nodes carry the attributes ``name``, ``type``
        and ``shape``. ``shape`` is the ``shape`` attribute of the layer's
        ``<data>`` element, or ``None`` when the layer has no ``<data>``
        element or that element has no ``shape``.
    """
    running_model_fname = "./Model/"

    print(running_model_fname)

    # Start from an empty directory so files from a previous run cannot linger.
    if os.path.isdir(running_model_fname):
        print(f"{running_model_fname} exists already. Deleting the folder")
        shutil.rmtree(running_model_fname)
    os.mkdir(running_model_fname)

    xml_path = running_model_fname + 'resnet50_running.xml'
    ov.save_model(xml, xml_path)

    root = ET.parse(xml_path).getroot()

    G = nx.DiGraph()

    for layer in root.find('layers'):
        layer_name = layer.get('name')
        data_element = layer.find('data')
        print(layer_name)

        data_shape = None
        if data_element is not None:
            # Get the 'shape' attribute from the 'data' element
            data_shape = data_element.get('shape')
            print(data_shape)

        # Store the shape text itself; the original stored the XML element,
        # leaving the extracted shape unused.
        G.add_node(
            layer.get('id'),
            name=layer_name,
            type=layer.get('type'),
            shape=data_shape,
        )

    for edge in root.find('edges'):
        G.add_edge(edge.get('from-layer'), edge.get('to-layer'))

    print(G.number_of_nodes())

    return G

Load the ResNet-50 model with the OpenVINO core and extract its computation graph.

In [None]:
import openvino as ov
import torch
from torchvision.models import resnet50


# prepare input_data
input_data = torch.rand(1, 3, 224, 224)
# Placeholder path: point this at the IR (.xml) file of the model to optimize.
xml_file = '/your/resnet50orinceptionv3.xml'

ov_model = core.read_model(model=xml_file)
# Kept at module level: measure_device_placement reads `input_layer` as a global.
input_layer = ov_model.input()

###### Option 1: Save to OpenVINO IR:

# save model to OpenVINO IR for later use
# ov.save_model(ov_model, 'resnet50.xml')

###### Option 2: Compile and infer with OpenVINO:

# compile model
# NOTE(review): the device string is hard-coded and assumes a "GPU.1" device exists.
compiled_model = ov.compile_model(ov_model,device_name="HETERO:GPU.1,CPU")
# compiled_model = ov.compile_model(ov_model,device_name="CPU")

infer_request = compiled_model.create_infer_request()
# run inference once
infer_request.infer(inputs={input_layer.any_name: input_data})
result = infer_request.results
cm = infer_request.get_compiled_model()
# print(cm)
runtime_model = cm.get_runtime_model()
# The operations and the graph are taken from the original model, not from the
# runtime model (the runtime-model variants are left commented out below).
ops = ov_model.get_ordered_ops()
# ops = runtime_model.get_ordered_ops()
# print(len(ops))
# Computation_G = xml2graph(runtime_model)
Computation_G = xml2graph(ov_model)
print(len(ops))
# Second fetch of the same ordered operation list; used by the next cell.
ops_ov = ov_model.get_ordered_ops()


In [None]:
# Record every operation's type: `op_info` maps friendly name -> type name,
# `op_type_name` collects the type of each operation in order.
op_type_name = []
op_info = {}
for op in ops_ov:
    # NOTE(review): the attributes are fetched but not used below.
    op_attributes = op.get_attributes()
    # print(f"{op}: {op_attributes}")
    print(op.get_friendly_name())
    # print(op.get_input_size)
    print(op.get_type_name())
    op_type_name.append(op.get_type_name())
    op_info[op.get_friendly_name()] = op.get_type_name()
    print(f"{op.get_friendly_name()}: {op.get_type_name()}")
# De-duplicate the type names. A set has no defined order, so the order of
# `unique_list` (and of anything built from it) may differ between runs.
unique_list = list(set(op_type_name))
print(unique_list)


In [None]:
import networkx as nx

# Create a directed graph (toy example for trying out the heuristic)
G = nx.DiGraph()

# Add nodes and edges
# Assuming each tuple in edges is (X, Y) where X's output is consumed by Y
# Three alternative edge lists; each assignment replaces the previous one,
# so only the last list ends up in the graph.
edges = [('A', 'B'), ('B', 'C'), ('C', 'D'), ('E', 'F'), ('F', 'G'), ('H', 'I')]
edges = [('A', 'C'), ('B', 'C'), ('C', 'D'), ('E', 'F'), ('D', 'F'),('G', 'E')]
edges = [('A', 'C'), ('B', 'C'), ('C', 'D'), ('E', 'F'),('G', 'E'), ('F', 'I'), ('D','H'), ('H', 'F')]

G.add_edges_from(edges)

# Function to find and merge nodes based on the heuristic
def merge_operations(graph):
    """Partition the nodes of a DAG into co-location groups.

    Nodes are visited in topological order and grouped as follows:

    * a node with exactly one predecessor joins the group that already holds
      that predecessor;
    * any other node starts a new group;
    * when a newly placed node has exactly one successor and is that
      successor's only predecessor, the successor is added to the same group
      immediately.

    A group that is extended is moved to the end of the result list, so the
    list order reflects when each group was last touched.

    Args:
        graph: Directed acyclic graph offering ``successors`` and
            ``predecessors`` (a ``networkx.DiGraph``).

    Returns:
        List of groups, each a list of node identifiers. Every node appears in
        exactly one group.
    """
    co_location_groups = []
    merged = set()

    for node in nx.topological_sort(graph):
        successors = list(graph.successors(node))
        predecessors = list(graph.predecessors(node))

        current_group = []
        if len(predecessors) == 1 and predecessors[0] in merged:
            # Take the predecessor's group out of the list so it can be
            # extended and re-appended. Stop at the first hit: a node lives in
            # exactly one group, and removing while continuing to iterate
            # would skip elements.
            for position, group in enumerate(co_location_groups):
                if predecessors[0] in group:
                    current_group = co_location_groups.pop(position)
                    break

        if node not in current_group and node not in merged:
            current_group.append(node)
            if len(successors) == 1:
                successor = successors[0]
                successor_predecessors = list(graph.predecessors(successor))
                # Pull the successor in right away when this node is its only
                # producer.
                if successor_predecessors == [node] and successor not in merged:
                    current_group.append(successor)
                    merged.add(successor)

        co_location_groups.append(current_group)
        merged.add(node)

    return co_location_groups

# Apply the heuristic to the model's computation graph (the toy graph `G`
# variant is left commented out).
co_location_groups = merge_operations(Computation_G)
# co_location_groups = merge_operations(G)
print("Co-location groups:", co_location_groups)
print("Number of groups: ", len(co_location_groups))


In [None]:
import networkx as nx

# Toy graph with some example edges and nodes.
# NOTE(review): `G` is rebuilt here, but the merge_nodes call in this cell
# runs on Computation_G, not on `G`.
G = nx.DiGraph()
edges = [('A', 'C'), ('B', 'C'), ('C', 'D'), ('E', 'F'),('G', 'E'), ('F', 'I'), ('D','H'),('H', 'F')]
G.add_edges_from(edges)

# Example co-location groups for the toy graph (kept for reference only):
# co_location_groups = [['A'], ['B'], ['G', 'E'], ['C', 'D', 'H'], ['F', 'I']]

# Function to merge nodes and create a new graph
def merge_nodes(graph, groups):
    """Collapse each co-location group into a single node.

    Args:
        graph: Directed graph whose edges are read through ``graph.edges()``.
        groups: Sequence of node groups; group ``i`` becomes ``"Group_i"``.

    Returns:
        ``(new_graph, group_map)``: the condensed ``networkx.DiGraph`` and a
        dict mapping each original node to the name of its group node. Edges
        between two members of the same group are dropped; a node that is in
        no group keeps its own identifier as edge endpoint.
    """
    group_map = {}
    new_graph = nx.DiGraph()

    for position, members in enumerate(groups):
        label = f"Group_{position}"
        for member in members:
            group_map[member] = label
            # Adding an existing node is a no-op, so no membership test needed.
            new_graph.add_node(label)

    for source, target in graph.edges():
        merged_source = group_map.get(source, source)
        merged_target = group_map.get(target, target)
        if merged_source == merged_target:
            continue  # edge is internal to one group
        new_graph.add_edge(merged_source, merged_target)

    return new_graph, group_map

# Create the condensed graph: one node per co-location group of the model graph.
new_G,group_map = merge_nodes(Computation_G, co_location_groups)

# Print new graph nodes and edges
print("Nodes in new graph:")
print(new_G.nodes())
print("Edges in new graph:")
print(new_G.edges())
# Mapping from original node id to its "Group_<index>" node.
print(group_map)


In [None]:
import numpy as np
import torch.nn as nn


class EmbeddingModel(nn.Module):
    """Learnable operation-type embeddings aggregated into per-group features.

    One trainable vector of length ``embedding_size`` is kept per operation
    type. ``forward`` produces one feature row per co-location group, laid out
    as ``[sum of type embeddings | padded output shape | adjacency flags]``.
    """

    # Shape used when an operation's output shape cannot be read, e.g. because
    # it is dynamic.
    FALLBACK_SHAPE = (1, 100, 100, 100)
    # Group nodes are named "Group_<index>"; the index selects the output row.
    GROUP_PREFIX = "Group_"

    def __init__(self, unique_list, embedding_size):
        """Create one randomly initialised embedding per operation type.

        Args:
            unique_list: Iterable of operation type names.
            embedding_size: Length of every type embedding.
        """
        super().__init__()
        self.embedding_size = embedding_size
        self.embeddings = nn.ParameterDict({
            op_type: nn.Parameter(torch.randn(embedding_size))
            for op_type in unique_list
        })

    def forward(self, op_info, new_G, group_map, ops, computation_graph=None):
        """Return ``pre_process(...)``; see that method for the contract."""
        return self.pre_process(op_info, new_G, group_map, ops, computation_graph)

    @classmethod
    def _padded_output_shape(cls, op, max_size):
        """Return the shape of ``op``'s last output, zero-padded to ``max_size``.

        An operation without outputs yields all zeros. If reading a shape
        fails (the case seen in practice is a dynamic shape),
        ``FALLBACK_SHAPE`` is returned instead.
        """
        padded_shape = [0] * max_size
        try:
            for output_index in range(op.get_output_size()):
                shape = list(op.get_output_shape(output_index))
                padded_shape = shape + [0] * (max_size - len(shape))
        except Exception:
            # Best effort by design: an unreadable shape must not abort the
            # whole embedding pass.
            return list(cls.FALLBACK_SHAPE)
        return padded_shape

    def pre_process(self, op_info, new_G, group_map, ops, computation_graph=None):
        """Build the per-group feature matrix.

        Args:
            op_info: Dict mapping an operation's friendly name to its type.
            new_G: Condensed graph whose nodes are named ``"Group_<index>"``.
            group_map: Dict mapping a computation-graph node id to its group
                name.
            ops: Iterable of operations offering ``get_friendly_name``,
                ``get_output_size`` and ``get_output_shape``.
            computation_graph: Graph whose nodes carry a ``name`` attribute
                matching the operations' friendly names. When omitted, the
                module-level ``Computation_G`` is used, as before.

        Returns:
            ``(op_embeddings, group_op_map)``. ``op_embeddings`` has one row
            per node of ``new_G`` and ``embedding_size + 4 + num_groups``
            columns. A row is written when the group's last member (in
            ``group_map`` order) is reached while iterating ``ops``, using the
            type embeddings accumulated for that group so far and that
            operation's padded output shape. ``group_op_map`` maps each group
            name to the list of its node ids.

        Raises:
            KeyError: If an operation has no node in the computation graph,
                its node is not in ``group_map``, or its type has no
                embedding.
        """
        if computation_graph is None:
            # Backward-compatible default: the notebook-level graph.
            computation_graph = Computation_G

        max_size = 4
        prefix_length = len(self.GROUP_PREFIX)
        num_nodes = new_G.number_of_nodes()
        group_embedding_length = self.embedding_size + num_nodes + max_size

        group_type_embeddings = {
            node: torch.zeros(self.embedding_size) for node in new_G.nodes()
        }

        # Invert group_map: group name -> node ids, in group_map order.
        group_op_map = {}
        for node_id, group_name in group_map.items():
            group_op_map.setdefault(group_name, []).append(node_id)

        op_embeddings = torch.zeros([num_nodes, group_embedding_length])

        # One 0/1 row per group marking the groups it has an edge to.
        one_hot_adjacency = {}
        for group_name in new_G.nodes():
            row = np.zeros(num_nodes, dtype=int)
            for neighbor in new_G.neighbors(group_name):
                row[int(neighbor[prefix_length:])] = 1
            one_hot_adjacency[group_name] = row

        # Friendly name -> node id, built once instead of scanning the graph
        # for every operation. setdefault keeps the first node per name, which
        # is what the linear scan returned.
        node_id_by_name = {}
        for node_id, attributes in computation_graph.nodes(data=True):
            node_id_by_name.setdefault(attributes.get("name"), node_id)

        for op in ops:
            name = op.get_friendly_name()
            if name in op_info:
                type_embedding = self.embeddings[op_info[name]]
            elif name[:8] == "Constant":
                type_embedding = self.embeddings['Constant']
            else:
                # Unknown operation: contributes no type information.
                type_embedding = torch.zeros(self.embedding_size)

            padded_shape = self._padded_output_shape(op, max_size)
            padded_shape_tensor = torch.tensor(
                padded_shape, dtype=type_embedding.dtype
            ).detach()

            if name not in node_id_by_name:
                raise KeyError(
                    f"operation {name!r} has no node in the computation graph"
                )
            node_with_feature = node_id_by_name[name]
            group_name = group_map[node_with_feature]

            group_type_embeddings[group_name] = (
                group_type_embeddings[group_name] + type_embedding
            )

            # Emit the row once the group's last member is reached.
            if node_with_feature == group_op_map[group_name][-1]:
                op_embedding = torch.cat(
                    (group_type_embeddings[group_name], padded_shape_tensor), 0
                )
                adj = torch.tensor(
                    one_hot_adjacency[group_name], dtype=op_embedding.dtype
                ).detach()
                op_embedding = torch.cat((op_embedding, adj), 0)
                group_index = int(group_name[prefix_length:])
                op_embeddings[group_index] += op_embedding

        return op_embeddings, group_op_map


In [None]:
# Width of each operation-type embedding. ReinforcementModel.__init__ also
# reads this module-level name.
embedding_size = 64
em = EmbeddingModel(unique_list, embedding_size)
# Run the embedding once on the real graph; the result is not stored.
em(op_info,new_G,group_map,ops)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class EncoderRNN(nn.Module):
    """Linear projection followed by dropout and a single batch-first GRU."""

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        """Build the projection, the GRU and the dropout layer.

        Args:
            input_size: Number of features per input step.
            hidden_size: Width of the projection and of the GRU state.
            dropout_p: Dropout probability applied to the projected input.
        """
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Linear(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        """Encode ``input`` and return the GRU's ``(output, hidden)`` pair."""
        projected = self.embedding(input)
        regularised = self.dropout(projected)
        output, hidden = self.gru(regularised)
        return output, hidden
    
class BahdanauAttention(nn.Module):
    """Additive attention: ``score = Va(tanh(Wa(query) + Ua(keys)))``."""

    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        """Return ``(context, weights)`` for one query over ``keys``.

        The scores are softmax-normalised over the key positions and the
        context is the corresponding weighted sum of ``keys``.
        """
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        # Move the key axis last so the softmax runs over key positions.
        scores = scores.squeeze(2).unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)

        return context, weights

class AttnDecoderRNN(nn.Module):
    """GRU decoder with additive attention that emits one choice per encoder step.

    The embedding table has ``output_size + 1`` rows; the extra row (index
    ``output_size``) is the start-of-sequence token fed at the first step.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.embedding = nn.Embedding(self.output_size+1, self.hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, st, target_tensor=None, random_sampling=True, temperature=1.5):
        """Decode one output per encoder position.

        Args:
            encoder_outputs: Encoder states; one decoding step is run per
                entry along dimension 1.
            encoder_hidden: Initial decoder hidden state.
            st: Training step counter. While ``st < 20`` the logits are divided
                by 5 before sampling, which flattens the distribution.
            target_tensor: Optional targets used as next inputs (teacher
                forcing) when ``random_sampling`` is False.
            random_sampling: Sample each choice from the softmax distribution
                (True) or follow the targets / the arg-max (False).
            temperature: Accepted for interface compatibility; not used.

        Returns:
            ``(decoder_outputs, decoder_hidden, attentions, displacement_log,
            displacement)``. With sampling, ``displacement`` holds the sampled
            choices and ``displacement_log`` their log-probabilities under the
            unscaled logits. Without sampling they are the arg-max choices and
            the corresponding maximum raw logits.

        Note:
            The sampling path squeezes the batch dimension, so it presumably
            expects a batch of one - verify before using larger batches.
        """
        batch_size = encoder_outputs.size(0)
        # Start token = the extra embedding row. Taking it from the module
        # avoids depending on a notebook-level SOS_token defined in a later cell.
        decoder_input = torch.full(
            (batch_size, 1),
            self.output_size,
            dtype=torch.long,
            device=encoder_outputs.device,
        )

        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []
        sampled = []

        for i in range(encoder_outputs.shape[1]):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # The unscaled logits are what is returned and scored below.
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if random_sampling:
                sampling_logits = decoder_output / 5 if st < 20 else decoder_output
                probs = F.softmax(sampling_logits, dim=-1).squeeze(0)
                decoder_input = torch.multinomial(probs, num_samples=1).detach()
                sampled.append(decoder_input.squeeze(0))
            elif target_tensor is not None:
                # Teacher forcing: feed the target as the next input.
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Greedy decoding: feed back the most likely choice.
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        if random_sampling:
            # Only concatenated here: without sampling the list is empty and
            # torch.cat would raise.
            displacement = torch.cat(sampled).unsqueeze(0)
            batch_indices = torch.arange(outputs.size(0)).unsqueeze(1).expand_as(displacement)
            time_indices = torch.arange(outputs.size(1)).unsqueeze(0).expand_as(displacement)
            displacement_log = outputs[batch_indices, time_indices, displacement]
        else:
            displacement_log, displacement = torch.max(decoder_outputs, dim=2)

        return decoder_outputs, decoder_hidden, attentions, displacement_log, displacement


    def forward_step(self, input, hidden, encoder_outputs):
        """Run one decoding step and return ``(logits, hidden, attention weights)``."""
        embedded = self.dropout(self.embedding(input))
        # The hidden state is the attention query; move its batch axis first.
        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights

In [None]:
class Seq2Seq(nn.Module):
    """Encoder/decoder pair that turns group features into a placement."""

    def __init__(self, encoder, decoder, device,input_size,hidden_size,output_size):
        """Instantiate the encoder and decoder from the given classes.

        Args:
            encoder: Class called as ``encoder(input_size, hidden_size)``.
            decoder: Class called as ``decoder(hidden_size, output_size)``.
            device: Stored on the instance as ``self.device``.
            input_size: Feature count of one input row.
            hidden_size: Hidden width shared by encoder and decoder.
            output_size: Number of choices the decoder can emit.
        """
        super().__init__()
        self.encoder = encoder(input_size, hidden_size)
        self.decoder = decoder(hidden_size,output_size)
        self.device = device

    def forward(self, op_embeddings, st,sample=True):
        """Encode the feature rows and decode one choice per row.

        Args:
            op_embeddings: Feature matrix; a leading batch axis is added.
            st: Step counter forwarded to the decoder.
            sample: Accepted but not used.

        Returns:
            The decoder's fourth and fifth outputs:
            ``(displacement_log_prob, displacement)``.
        """
        batched_input = op_embeddings.unsqueeze(0)
        encoder_outputs, encoder_hidden = self.encoder(batched_input)

        _logits, _hidden, _attention, log_prob, placement = self.decoder(
            encoder_outputs, encoder_hidden, st
        )
        return log_prob, placement




In [None]:
import math

def measure_device_placement(displacement,devices,runtime_model,group_op_map,Computation_G):
    """Apply a per-group device placement to a model and time its inference.

    Each group's device index is expanded to the operations of that group, the
    chosen device name is written to every operation's ``affinity`` runtime
    info, and the model is compiled for ``"HETERO:GPU,CPU"``. Inference is run
    10 times on a random ``(1, 3, 224, 224)`` input; the first 5 runs are
    discarded and the remaining latencies averaged.

    Args:
        displacement: Integer tensor with one device index per group, in
            ``Group_0, Group_1, ...`` order (any shape; it is flattened).
        devices: Sequence of device names indexed by the values above.
        runtime_model: Model offering ``get_ordered_ops()``; it is modified in
            place (affinities) and compiled.
        group_op_map: Dict mapping ``"Group_<index>"`` to node ids of
            ``Computation_G``.
        Computation_G: Graph whose ``nodes[node_id]['name']`` is the friendly
            name of the matching operation.

    Returns:
        ``(latency, Error)``: the square root of the mean latency as reported
        by ``infer_request.latency``, and ``False`` on success. If setting the
        affinities, compiling or running fails, the latency before the square
        root is the penalty ``10000.0`` and ``Error`` is the exception text.

    Note:
        The input name is read from the module-level ``input_layer``.
    """
    ordered_ops = runtime_model.get_ordered_ops()

    # Friendly name -> position of the first operation carrying that name.
    position_by_name = {}
    for position, op in enumerate(ordered_ops):
        position_by_name.setdefault(op.get_friendly_name(), position)

    # reshape(-1) instead of squeeze(): a single-group placement stays a 1-d
    # tensor that can be iterated.
    group_displacement = displacement.reshape(-1)

    # Expand the per-group choice to one device index per operation.
    op_displacement = torch.zeros(len(ordered_ops), dtype=int)
    for index, device_index in enumerate(group_displacement):
        for node_id in group_op_map['Group_' + str(index)]:
            position = position_by_name.get(Computation_G.nodes[node_id]['name'])
            if position is not None:
                op_displacement[position] += device_index

    Error = False
    try:
        for position, op in enumerate(ordered_ops):
            rt_info = op.get_rt_info()
            rt_info["affinity"] = devices[op_displacement[position]]

        input_data = torch.rand(1, 3, 224, 224)
        compiled_model = ov.compile_model(runtime_model,"HETERO:GPU,CPU")
        infer_request = compiled_model.create_infer_request()
        latencies = []
        for _ in range(10):
            infer_request.wait()
            infer_request.infer(inputs={input_layer.any_name: input_data})
            infer_request.wait()
            latencies.append(infer_request.latency)
        # Discard the first five runs as warm-up.
        measured = latencies[5:]
        latency = sum(measured) / len(measured)
    except Exception as e:
        # Deliberate best effort: a placement that cannot be compiled or run
        # is reported with a large penalty instead of aborting training.
        latency = 10000.0
        Error = str(e)
    latency = math.sqrt(latency)
    return latency, Error


In [None]:
import torch
import numpy as np
import random

def set_seed(seed):
    """Seed Python's, NumPy's and PyTorch's random number generators.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random`` and
            ``torch`` (including all GPUs when CUDA is available).
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    # Any other library with its own RNG would need seeding here as well.
    torch.cuda.manual_seed_all(seed)  # For all GPUs

# Re-seed all random number generators covered by set_seed.
set_seed(42)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import json
import datetime

def set_seed(seed):
    """Seed Python's, NumPy's and PyTorch's random number generators.

    This definition replaces the earlier ``set_seed`` in the notebook, so it
    seeds the same generators as that one; otherwise the later
    ``set_seed(42)`` call would stop seeding ``random`` and NumPy.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random`` and
            ``torch`` (including all GPUs when CUDA is available).
    """
    # Imported locally so this cell does not depend on earlier cells' imports.
    import random

    import numpy as np

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)  # For all GPUs


class ReinforcementModel(nn.Module):
    """Placement policy: group embedding followed by a sequence-to-sequence model."""

    def __init__(self, encoder_class, decoder_class, embedding_class, device, input_size, hidden_size, output_size, unique_list, embedding_dim=None):
        """Build the embedding module and the encoder/decoder pair.

        Args:
            encoder_class: Encoder class handed to ``Seq2Seq``.
            decoder_class: Decoder class handed to ``Seq2Seq``.
            embedding_class: Class called as
                ``embedding_class(unique_list, embedding_dim)``.
            device: Stored on the instance and passed to ``Seq2Seq``.
            input_size: Feature count of one embedded group row.
            hidden_size: Hidden width of encoder and decoder.
            output_size: Number of devices the decoder chooses between.
            unique_list: Operation type names for the embedding module.
            embedding_dim: Width of the type embeddings. When omitted, the
                module-level ``embedding_size`` is used, as before.
        """
        super().__init__()
        if embedding_dim is None:
            # Backward-compatible default: the notebook-level setting.
            embedding_dim = embedding_size
        self.embedding_class = embedding_class(unique_list, embedding_dim)
        self.seq2seq = Seq2Seq(encoder_class, decoder_class, device, input_size, hidden_size, output_size)
        self.device = device

    def forward(self, op_info,new_G,group_map,ops,st):
        """Embed the groups and decode a placement.

        Returns:
            ``(displacement_log_prob, displacement, group_op_map)``: the two
            outputs of the sequence-to-sequence model plus the group-to-nodes
            mapping produced by the embedding module.
        """
        op_embeddings, group_op_map = self.embedding_class(op_info,new_G,group_map,ops)
        displacement_log_prob, displacement = self.seq2seq(op_embeddings,st)
        return displacement_log_prob, displacement, group_op_map

def reinforce_train(model, optimizer, n_episodes, op_info, new_G, group_map, ops, ov_devices, ov_model, Computation_G):
    """Train the placement policy with a REINFORCE-style update.

    Every episode samples one placement from ``model``, measures it with
    ``measure_device_placement`` and takes one optimizer step. After each
    episode the best squared cost so far is printed and the full history is
    rewritten to ``training_results.json``.

    Args:
        model: Policy called as ``model(op_info, new_G, group_map, ops, episode)``.
        optimizer: Optimizer over the model's parameters.
        n_episodes: Number of placements to sample.
        op_info, new_G, group_map, ops: Forwarded to ``model`` unchanged.
        ov_devices, ov_model, Computation_G: Forwarded to
            ``measure_device_placement``.

    Returns:
        List with one dict per episode holding ``episode``, ``loss``,
        ``reward`` (the squared value returned by the measurement),
        ``displacement`` and ``Error``.
    """
    results = []
    best_reward = 10000
    for episode in tqdm(range(n_episodes), desc="Training Episodes"):
        model.zero_grad()
        displacement_log_prob_out,displacement,group_op_map = model(op_info,new_G,group_map,ops,episode)

        # `reward` is sqrt(latency): a cost, where lower is better.
        reward, Error = measure_device_placement(displacement,ov_devices,ov_model,group_op_map, Computation_G)

        # Minimising cost * log-probability lowers the probability of a
        # placement in proportion to its cost. The previous negated form
        # raised the probability of slow placements instead.
        # NOTE(review): no baseline is subtracted; a moving-average baseline
        # would reduce the variance of this estimate.
        loss = (displacement_log_prob_out.squeeze() * reward).sum()
        loss.backward()
        optimizer.step()
        results.append({'episode': episode, 'loss': loss.item(), 'reward': reward**2, 'displacement': displacement.squeeze().tolist(), 'Error' : Error})

        # Costs of 99 and above are excluded from the best-so-far tracking
        # (a failed measurement returns 100).
        if reward < 99 and best_reward > reward**2:
            best_reward = reward**2

        print(best_reward)
        # Rewritten every episode so partial results survive an interruption.
        with open('training_results.json', 'w') as f:
            json.dump(results, f)

    return results
 
# Parameters for the model
# Number of features in one group row. EmbeddingModel produces rows of length
# embedding_size + number of groups + 4, so this value has to match that sum.
input_size = 314
hidden_size = 5*input_size  # Number of features in the hidden state
output_size = 2  # Number of output classes (devices to choose between)
# NOTE(review): not used in the code visible here; the encoder and decoder use
# single-layer GRUs, not LSTMs.
num_layers = 1
# Start-of-sequence index: one past the last output class.
SOS_token = output_size
# Set a seed
set_seed(42)
num_operation = Computation_G.number_of_nodes()
model = ReinforcementModel(EncoderRNN, AttnDecoderRNN, EmbeddingModel, "cpu", input_size, hidden_size, output_size, unique_list)
optimizer = optim.Adam(model.parameters(), lr=0.01)

n_episodes = 100
print(op_info)
reinforce_train(model, optimizer, n_episodes, op_info, new_G, group_map ,ops, devices, ov_model, Computation_G)

# Tag the checkpoint with today's date (YYYYMMDD).
today = datetime.datetime.now()
date_string = today.strftime('%Y%m%d')

# NOTE(review): the ./baseline_DPO/ directory is not created here; presumably
# it must already exist for torch.save to succeed.
model_parameter_path = f"./baseline_DPO/model_parameters_{date_string}.pth"
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': n_episodes
}, model_parameter_path)
