# Functions

In [11]:
import math
import numpy as np

# Data-transfer bandwidth that every "(elements)/bandwidth" term below divides by,
# expressed in bytes per millisecond.
bandwidth=64 * 10**6  #B/ms
# Clock rate used to turn cycle counts into time, expressed in cycles per millisecond.
freq=250 * 10**3 #cycles/ms
# P1 and P2 are the two tiling divisors that appear as ceil(x/P1) and ceil(c/P2) in the
# latency formulas; presumably the two dimensions of the compute array -- TODO confirm.
P1=138
P2=32

def im2col_OS_latency(a,b,c,P1,P2): #a: H^2, b:K^2*C_in, c:C_out
    """Return the modelled latency of an im2col GEMM under the "OS" dataflow.

    The (a x b) by (b x c) product is covered by ceil(a/P1) * ceil(c/P2) passes of
    ``b`` cycles each, converted to time with the module-level ``freq``; moving the
    a*b + b*c operand elements is charged against the module-level ``bandwidth``.
    """
    passes = math.ceil(a / P1) * math.ceil(c / P2)
    compute_time = passes * b / freq
    transfer_time = (a * b + b * c) / bandwidth
    return compute_time + transfer_time

def im2col_WS_latency(a,b,c,P1,P2):
    """Return the modelled latency of an im2col GEMM under the "WS" dataflow.

    Same arguments as ``im2col_OS_latency`` (a: H^2, b: K^2*C_in, c: C_out), but here
    the tiling is over ``b`` and ``c`` (ceil(b/P1) * ceil(c/P2) passes) and each pass
    costs ``a`` cycles. The transfer term is identical: (a*b + b*c) / bandwidth.
    """
    passes = math.ceil(b / P1) * math.ceil(c / P2)
    compute_time = passes * a / freq
    transfer_time = (a * b + b * c) / bandwidth
    return compute_time + transfer_time

def kn2row_OS_latency(a,b,c,k,P1,P2): #a: H^2, b:C_in, c:C_out
    """Return the modelled latency of a kn2row convolution under the "OS" dataflow.

    The per-kernel-position GEMM needs ceil(a/P1) * ceil(c/P2) passes of ``b`` cycles
    and is repeated k**2 times; the result is converted to time with ``freq``.
    The transfer term charges a*b + b*c elements against ``bandwidth``.
    """
    passes = math.ceil(a / P1) * math.ceil(c / P2)
    compute_time = passes * b * k**2 / freq
    transfer_time = (a * b + b * c) / bandwidth
    return compute_time + transfer_time

def kn2row_WS_latency(a,b,c,k,P1,P2):
    """Return the modelled latency of a kn2row convolution under the "WS" dataflow.

    Same arguments as ``kn2row_OS_latency`` (a: H^2, b: C_in, c: C_out, k: kernel size).
    Tiling is over ``b`` and ``c``; each of the ceil(b/P1) * ceil(c/P2) passes costs
    ``a`` cycles and the whole GEMM is repeated k**2 times.
    """
    passes = math.ceil(b / P1) * math.ceil(c / P2)
    compute_time = passes * a * k**2 / freq
    transfer_time = (a * b + b * c) / bandwidth
    return compute_time + transfer_time

def wino_OS_latency(a,b,c,m,r,P1,P2): #a: H^2, b:C_in, c:C_out
    """Return the compute CYCLE count of a Winograd convolution under the "OS" dataflow.

    The a/m**2 output tiles are covered by ceil((a/m**2)/P1) * ceil(c/P2) passes, each
    costing b * (m+r-1)**2 cycles.

    The value is deliberately NOT divided by ``freq``: every call site in this file
    adds further cycle terms to the result and then divides the total by ``freq``
    itself. Dividing here as well scaled the compute term by 1/freq twice.
    """
    tile_passes = math.ceil((a / m**2) / P1) * math.ceil(c / P2)
    return tile_passes * b * (m + r - 1)**2

def wino_WS_latency(a,b,c,m,r,P1,P2):
    """Return the compute CYCLE count of a Winograd convolution under the "WS" dataflow.

    Arguments are a: H^2, b: C_in, c: C_out, m/r: Winograd tile parameters.
    Tiling is over ``b`` and ``c`` (ceil(b/P1) * ceil(c/P2) passes); each pass costs
    (a/m**2) * (m+r-1)**2 cycles.

    The value is deliberately NOT divided by ``freq``: every call site in this file
    adds further cycle terms to the result and then divides the total by ``freq``
    itself. Dividing here as well scaled the compute term by 1/freq twice.
    """
    passes = math.ceil(b / P1) * math.ceil(c / P2)
    return passes * (a / m**2) * (m + r - 1)**2

def inception_latency_im2col(C_in,o,C_out,num_1by1,num_3by3,num_5by5,pool_proj,reduce_3by3,reduce_5by5,P1,P2,df):
    """Return the im2col latency of each of the six CONV blocks of an inception module.

    The blocks are listed left to right: 1x1, 1x1 reduce before 3x3, 3x3,
    1x1 reduce before 5x5, 5x5, and the 1x1 projection after the pool.

    ``df`` selects the dataflow: "OS" uses ``im2col_OS_latency`` and "WS" uses
    ``im2col_WS_latency``. Any other value yields an empty list.
    ``C_out`` is only used to check that the four branch widths add up.
    """
    assert(C_out==num_1by1+num_3by3+num_5by5+pool_proj)
    if df == "OS":
        latency = im2col_OS_latency
    elif df == "WS":
        latency = im2col_WS_latency
    else:
        return []

    spatial = o * o
    # (b, c) of each block's GEMM: b = K^2 * input channels, c = output channels.
    gemm_shapes = [
        (C_in, num_1by1),
        (C_in, reduce_3by3),
        (3 * 3 * reduce_3by3, num_3by3),
        (C_in, reduce_5by5),
        (5 * 5 * reduce_5by5, num_5by5),
        (C_in, pool_proj),
    ]
    return [latency(spatial, depth, width, P1, P2) for depth, width in gemm_shapes]


def inception_latency_kn2row(C_in,o,C_out,num_1by1,num_3by3,num_5by5,pool_proj,reduce_3by3,reduce_5by5,P1,P2,df):
    """Return the kn2row latency of each of the six CONV blocks of an inception module.

    The blocks are listed left to right: 1x1, 1x1 reduce before 3x3, 3x3,
    1x1 reduce before 5x5, 5x5, and the 1x1 projection after the pool.

    ``df`` selects the dataflow: "OS" uses ``kn2row_OS_latency`` and "WS" uses
    ``kn2row_WS_latency``. Any other value yields an empty list.
    ``C_out`` is only used to check that the four branch widths add up.
    """
    assert(C_out==num_1by1+num_3by3+num_5by5+pool_proj)
    if df == "OS":
        latency = kn2row_OS_latency
    elif df == "WS":
        latency = kn2row_WS_latency
    else:
        return []

    spatial = o * o
    # (input channels, output channels, kernel size) of each block.
    conv_shapes = [
        (C_in, num_1by1, 1),
        (C_in, reduce_3by3, 1),
        (reduce_3by3, num_3by3, 3),
        (C_in, reduce_5by5, 1),
        (reduce_5by5, num_5by5, 5),
        (C_in, pool_proj, 1),
    ]
    return [
        latency(spatial, channels_in, channels_out, kernel, P1, P2)
        for channels_in, channels_out, kernel in conv_shapes
    ]


def inception_latency_wino(C_in,o,C_out,num_1by1,num_3by3,num_5by5,pool_proj,reduce_3by3,reduce_5by5,P1,P2,df,m,r):
    """Return the Winograd-based latency of each of the six CONV blocks of an inception module.

    The blocks are listed left to right: 1x1, 1x1 reduce before 3x3, 3x3,
    1x1 reduce before 5x5, 5x5, and the 1x1 projection after the pool.

    ``df`` selects the dataflow: "OS" uses ``wino_OS_latency`` and "WS" uses
    ``wino_WS_latency``. Any other value yields an empty list.
    ``C_out`` is only used to check that the four branch widths add up.

    The 1x1 blocks cannot use Winograd; they are modelled with m = r = 1, which
    has the same form as the im2col cost. A KxK block is treated as (K/r)**2
    rounds of F(m, r): both the compute term and the transfer term are scaled by
    that factor. The 3x3 transfer term used to omit it, which is harmless only
    while r == 3.

    NOTE(review): the value returned by the wino_*_latency helper is divided by
    ``freq`` here, which is only right if the helper returns a cycle count rather
    than a time. Confirm the helper does not divide by ``freq`` itself.
    """
    assert(C_out==num_1by1+num_3by3+num_5by5+pool_proj)
    if df == "OS":
        tile_cost = wino_OS_latency
    elif df == "WS":
        tile_cost = wino_WS_latency
    else:
        return []

    a = o * o

    def pointwise(b, c):
        # 1x1 CONV: degenerate F(1, 1) plus the plain GEMM operand traffic.
        return tile_cost(a, b, c, 1, 1, P1, P2) / freq + (a * b + b * c) / bandwidth

    def winograd(b, c, k):
        # KxK CONV decomposed into (K/r)^2 rounds of F(m, r).
        rounds = (k / r)**2
        compute = (tile_cost(a, b, c, m, r, P1, P2) * rounds
                   + rounds * (m + r - 1)**2 * max(m, r)) / freq
        transfer = (a * b / m**2 + b * c) * (m + r - 1)**2 * rounds / bandwidth
        return compute + transfer

    return [
        pointwise(C_in, num_1by1),
        pointwise(C_in, reduce_3by3),
        winograd(reduce_3by3, num_3by3, 3),
        pointwise(C_in, reduce_5by5),
        winograd(reduce_5by5, num_5by5, 5),
        pointwise(C_in, pool_proj),
    ]


def transition_cost(Cout_last,H_next,Cin_next,K_next,S_next,Cout_next, algo_last, algo_next, m,r):
    """Return the modelled cost of handing a feature map from one layer to the next.

    The cost is (number of elements in the layout the next algorithm expects) divided
    by an effective bandwidth, which depends on the algorithm pair
    ``algo_last`` -> ``algo_next`` (each one of "im2col", "kn2row", "wino").

    * -> im2col : ((H-K)/S + 1)^2 * K^2 * Cout_last elements (H^2 * Cout_last when
      coming from wino).
    * -> kn2row : H^2 * Cout_last elements.
    * -> wino   : (H*(m+r-1)/m)^2 * Cout_last elements.

    Transitions that involve wino use a reduced bandwidth of floor(64/m)/64 (or
    ceil(64/K^2)/64 for wino -> kn2row) times ``bandwidth``; the rationale for the
    constant 64 is not visible in this file.

    ``Cin_next`` and ``Cout_next`` are accepted but do not enter the formula.

    Raises:
        ValueError: if either algorithm name is not one of the three known ones
            (previously this fell through and silently returned None).
    """
    known_algos = ("im2col", "kn2row", "wino")
    if algo_last not in known_algos or algo_next not in known_algos:
        raise ValueError(
            "unknown algorithm pair: %r -> %r (expected one of %r)"
            % (algo_last, algo_next, known_algos)
        )

    if algo_last in ("im2col", "kn2row"):
        # Both GEMM-style producers hand over the same layout, so the cost only
        # depends on what the consumer needs.
        if algo_next == "im2col":
            return ((H_next-K_next)/S_next+1)**2 * K_next**2 * Cout_last / bandwidth
        if algo_next == "kn2row":
            return H_next**2 * Cout_last / bandwidth
        return (H_next*(m+r-1)/m)**2 * Cout_last / (math.floor(64/m)*bandwidth/64)

    # algo_last == "wino"
    if algo_next == "im2col":
        return H_next**2 * Cout_last / (math.floor(64/m)*bandwidth/64)
    if algo_next == "kn2row":
        return H_next**2 * Cout_last / (math.ceil(64/K_next**2)*bandwidth/64)
    # wino -> wino: the tiled map has (H*(m+r-1)/m)^2 elements per channel, as in the
    # other "-> wino" cases; the square was missing here.
    return (H_next*(m+r-1)/m)**2 * Cout_last / (math.floor(64/m)*bandwidth/64)

def inception_T_3a(C_in,o,C_out,num_1by1,num_3by3,num_5by5,pool_proj,reduce_3by3,reduce_5by5,m,r):
    """Build the ten per-edge transition cost lists of an inception module.

    Edges 0, 2, 3, 5, 6 and 8 (those entering a CONV block) get a flattened 3x3
    matrix: nine costs ordered by source algorithm, then destination algorithm.
    Edges 1, 4, 7 and 9 (those leaving the last CONV of a branch) get three costs,
    one per source algorithm, always with "im2col" as the destination.

    Every cost is ``transition_cost(ch, o, ch, K, 1, 0, src, dst, m, r)`` where ``ch``
    is the output-channel count of the CONV block the edge is attached to.
    NOTE(review): ``C_in`` and ``C_out`` are accepted but never used, so edges that
    enter a block are sized by that block's own output channels -- confirm intended.
    """
    algos = ["im2col", "kn2row", "wino"]

    def cost(channels, kernel, src, dst):
        return transition_cost(channels, o, channels, kernel, 1, 0, src, dst, m, r)

    def full_matrix(channels, kernel):
        # Row-major over (source, destination).
        return [cost(channels, kernel, src, dst) for src in algos for dst in algos]

    def into_im2col(channels):
        return [cost(channels, 1, src, "im2col") for src in algos]

    return [
        full_matrix(num_1by1, 1),     # edge 0: before 1x1 CONV
        into_im2col(num_1by1),        # edge 1: after 1x1 CONV
        full_matrix(reduce_3by3, 1),  # edge 2: before 1x1 CONV for 3x3
        full_matrix(num_3by3, 3),     # edge 3: before 3x3 CONV
        into_im2col(num_3by3),        # edge 4: after 3x3 CONV
        full_matrix(reduce_5by5, 1),  # edge 5: before 1x1 CONV for 5x5
        full_matrix(num_5by5, 5),     # edge 6: before 5x5 CONV
        into_im2col(num_5by5),        # edge 7: after 5x5 CONV
        full_matrix(pool_proj, 1),    # edge 8: before 1x1 CONV(+Pool)
        into_im2col(pool_proj),       # edge 9: after 1x1 CONV(Pool)
    ]


def inception_T_others(C_in,o,C_out,num_1by1,num_3by3,num_5by5,pool_proj,reduce_3by3,reduce_5by5,m,r):
    """Build the ten per-edge transition cost lists of an inception module.

    Unlike ``inception_T_3a``, only edges 3 and 6 (before the 3x3 and 5x5 CONV)
    carry a flattened 3x3 matrix, ordered by source then destination algorithm.
    All other edges carry three costs, one per source algorithm, with "im2col"
    fixed as the destination.

    Every cost is ``transition_cost(ch, o, ch, K, 1, 0, src, dst, m, r)`` where ``ch``
    is the output-channel count of the CONV block the edge is attached to.
    NOTE(review): ``C_in`` and ``C_out`` are accepted but never used -- confirm intended.
    """
    algos = ["im2col", "kn2row", "wino"]

    def cost(channels, kernel, src, dst):
        return transition_cost(channels, o, channels, kernel, 1, 0, src, dst, m, r)

    def full_matrix(channels, kernel):
        # Row-major over (source, destination).
        return [cost(channels, kernel, src, dst) for src in algos for dst in algos]

    def into_im2col(channels):
        return [cost(channels, 1, src, "im2col") for src in algos]

    return [
        into_im2col(num_1by1),      # edge 0: before 1x1 CONV
        into_im2col(num_1by1),      # edge 1: after 1x1 CONV
        into_im2col(reduce_3by3),   # edge 2: before 1x1 CONV for 3x3
        full_matrix(num_3by3, 3),   # edge 3: before 3x3 CONV
        into_im2col(num_3by3),      # edge 4: after 3x3 CONV
        into_im2col(reduce_5by5),   # edge 5: before 1x1 CONV for 5x5
        full_matrix(num_5by5, 5),   # edge 6: before 5x5 CONV
        into_im2col(num_5by5),      # edge 7: after 5x5 CONV
        into_im2col(pool_proj),     # edge 8: before 1x1 CONV(+Pool)
        into_im2col(pool_proj),     # edge 9: after 1x1 CONV(Pool)
    ]


# Writes one node record: a line with the number of costs, then the costs themselves.
# num_layer is meant to label the node but is currently not written out.
def single_conv_node_cost_write(file_x,cost_vectors_per_conv,num_layer):
    """Write the cost vector of a single CONV node to ``file_x``.

    Output is two lines: "<count> " and then every cost formatted with "%f "
    (each followed by one space).
    """
    file_x.write("%d \n" % len(cost_vectors_per_conv))
    cost_line = "".join('%f ' % cost for cost in cost_vectors_per_conv)
    file_x.write(cost_line + "\n")

# cost_vectors_per_inception: list of lists indexed [x][y], x: algorithm (3), y: CONV block (6)
# 1st inception layer: num_layer=0
# num_offset=3 because 3 CONV layers precede the inception module
# (num_layer and num_offset are currently not written out)
def inception_node_cost_write(file_x,cost_vectors_per_inception,num_layer,num_offset):
    """Write one node record per CONV block of an inception module (six in total).

    For block ``y`` the record is "<number of algorithms> " on one line, followed by
    the cost of that block under each algorithm, formatted with "%f ".
    """
    for block in range(6):
        file_x.write("%d \n" % len(cost_vectors_per_inception))
        cost_line = "".join(
            '%f ' % per_algo[block] for per_algo in cost_vectors_per_inception
        )
        file_x.write(cost_line + "\n")

# Named "3a" after the edge length vector used here; for other modules some of the
# (3,3) entries would become (1,3) (start node).
def inception_3a_T_cost_write(file_x,cost_vectors_per_inception,num_layer,num_offset):
    """Write the ten edge records of an inception module to ``file_x``.

    Node ids are derived from ``start = num_layer*7 + num_offset`` and
    ``end = (num_layer+1)*7 + num_offset``: the six CONV nodes are start+1 .. start+6.
    Each record is three lines: "<src> <dst> ", "<len src> <len dst> ", and the
    edge's costs formatted with "%f ".
    """
    start=num_layer*7+num_offset
    end=(num_layer+1)*7+num_offset
    # (source node, destination node, cost-vector length of the destination node);
    # the source node always has three alternatives.
    edges = [
        (start, start+1, 3), (start+1, end, 1),
        (start, start+2, 3), (start+2, start+3, 3), (start+3, end, 1),
        (start, start+4, 3), (start+4, start+5, 3), (start+5, end, 1),
        (start, start+6, 3), (start+6, end, 1),
    ]
    for edge_idx, (src, dst, dst_len) in enumerate(edges):
        file_x.write("%d %d \n" % (src, dst))
        file_x.write("%d %d \n" % (3, dst_len))
        cost_line = "".join('%f ' % cost for cost in cost_vectors_per_inception[edge_idx])
        file_x.write(cost_line + "\n")





# Cost and Transition Vectors Construction

In [12]:

# Builds, for every layer, the latency of each algorithm (best of the OS and WS
# dataflows), prints per-layer summaries, and builds the transition cost vectors.
# Names consumed by later cells: cost_vectors_np, transition_vector, m, r, algos, sum_2.

# Parameter and operation counts (operations in millions) of the stem convolutions.
print("CONV1 params:", 1*7*7*64)
print("CONV1 operations:", 112*112* 1*7*7 *64/1e6)

print("CONV2 params:", 1*1*64*64 + 3*3*64*192)
print("CONV2 operations:", (56*56* 1*1*64 *64 + 56*56* 64*3*3 *192)/1e6)

# m and r are forwarded to every wino and transition_cost call below.
m=2
r=3
algos=["im2col","kn2row","wino"]

# Stem convolutions as (H^2, C_in, C_out, K), with the labels used when printing.
stem_convs=[
    (112*112, 1, 64, 7),
    (56*56, 64, 64, 1),
    (56*56, 64, 192, 3),
]
stem_labels=["1","2_1","2_2"]

# Inception modules 3a, 3b, 4a-4e, 5a, 5b as
# (C_in,o,C_out,num_1by1,num_3by3,num_5by5,pool_proj,reduce_3by3,reduce_5by5).
inception_modules=[
    (192,28,256,64,128,32,32,96,16),
    (256,28,480,128,192,96,64,128,32),
    (480,14,512,192,208,48,64,96,16),
    (512,14,512,160,224,64,64,112,24),
    (512,14,512,128,256,64,64,128,24),
    (512,14,528,112,288,64,64,144,32),
    (528,14,832,256,320,128,128,160,32),
    (832,7,832,256,320,128,128,160,32),
    (832,7,1024,384,384,128,128,192,48),
]


def _stem_wino_latency(tile_cost,a,b,c,k):
    """Winograd-based latency of one stem convolution (tile_cost: wino_OS/WS_latency).

    A 1x1 layer is modelled as F(1,1) plus plain GEMM traffic; a KxK layer as
    (K/r)^2 rounds of F(m,r).
    NOTE(review): tile_cost's result is divided by freq here, which assumes it is a
    cycle count -- confirm the helper does not divide by freq itself.
    """
    if k == 1:
        return tile_cost(a,b,c,1,1,P1,P2)/freq + (a*b+b*c)/bandwidth
    rounds = (k/r)**2
    compute = (tile_cost(a,b,c,m,r,P1,P2)*rounds + rounds*(m+r-1)**2*max(m,r))/freq
    transfer = (a*b/m**2+b*c)*(m+r-1)**2*rounds/bandwidth
    return compute + transfer


# cost_vector[layer] = [im2col, kn2row, wino]; a scalar per algorithm for the stem
# layers, an array of 6 per-block latencies per algorithm for the inception modules.
cost_vector=[]
for a, c_in, c_out, k in stem_convs:
    cost_vector.append([
        np.minimum(im2col_OS_latency(a,k*k*c_in,c_out,P1,P2),
                   im2col_WS_latency(a,k*k*c_in,c_out,P1,P2)),
        np.minimum(kn2row_OS_latency(a,c_in,c_out,k,P1,P2),
                   kn2row_WS_latency(a,c_in,c_out,k,P1,P2)),
        np.minimum(_stem_wino_latency(wino_OS_latency,a,c_in,c_out,k),
                   _stem_wino_latency(wino_WS_latency,a,c_in,c_out,k)),
    ])
for module in inception_modules:
    cost_vector.append([
        np.minimum(inception_latency_im2col(*module,P1,P2,"OS"),
                   inception_latency_im2col(*module,P1,P2,"WS")),
        np.minimum(inception_latency_kn2row(*module,P1,P2,"OS"),
                   inception_latency_kn2row(*module,P1,P2,"WS")),
        np.minimum(inception_latency_wino(*module,P1,P2,"OS",m,r),
                   inception_latency_wino(*module,P1,P2,"WS",m,r)),
    ])
cost_vectors_np = np.array(cost_vector,dtype=object)

print("inception_latency_wino(832,7,832,256,320,128,128,160,32,P1,P2,OS,m,r):", inception_latency_wino(832,7,832,256,320,128,128,160,32,P1,P2,"OS",m,r))
print(cost_vectors_np.shape)
print(cost_vectors_np[0].shape)

for layer, label in enumerate(stem_labels):
    for algo_idx, algo in enumerate(algos):
        print("layer %s: %s time:" % (label, algo), cost_vectors_np[layer][algo_idx])
# [0][0]->[0][5]: 6 CONV blocks in inception; [0][0]->[2][0]: one CONV block under the 3 algos
print("layer 3 cost vector shape:",cost_vectors_np[3].shape)

# sum_2 accumulates the wino time of the stem layers plus, for every inception
# module, the "mixed" total (cheapest algorithm chosen independently per block).
sum_2=0
for layer in range(len(stem_convs)):
    sum_2+=cost_vectors_np[layer][2]

for layer in range(len(stem_convs), len(cost_vectors_np)):
    per_algo = cost_vectors_np[layer]
    for algo_idx, algo in enumerate(algos):
        print("layer %d total: %s time:" % (layer, algo), sum(per_algo[algo_idx]))
    sum_1 = sum(
        min(per_algo[i][j] for i in range(len(per_algo)))
        for j in range(6)  # number of CONV blocks
    )
    print("layer %d mixed total" % layer, sum_1)
    sum_2+=sum_1

# transition_vector[0], [1]: flattened 3x3 matrices for the edges CONV0->1 and CONV1->2.
# transition_vector[2]: the 10 edge vectors of inception 3a.
transition_vector=[
    [transition_cost(64,56,64,1,1,0, src, dst, m,r) for src in algos for dst in algos],
    [transition_cost(64,56,64,3,1,0, src, dst, m,r) for src in algos for dst in algos],
    inception_T_3a(192,28,256,64,128,32,32,96,16,m,r),
]

print(len(transition_vector[0]),len(transition_vector[1]),len(transition_vector[2]),len(transition_vector[2][0]))
print(transition_vector[0])
print(transition_vector[1],"\n")
print(transition_vector[2][0])
print(transition_vector[2][1],"\n")
print(transition_vector[2][2])
print(transition_vector[2][3])
print(transition_vector[2][4],"\n")
print("sum2:", sum_2)

CONV1 params: 3136
CONV1 operations: 39.337984
CONV2 params: 114688
CONV2 operations: 359.661568
inception_latency_wino(832,7,832,256,320,128,128,160,32,P1,P2,OS,m,r): [0.030588999999999998, 0.019357, 0.115882, 0.0043809999999999995, 0.026405555555555557, 0.015612999999999998]
(12, 3)
(3,)
layer 1: im2col time: 0.045325000000000004
layer 1: kn2row time: 0.035869000000000005
layer 1: wino time: 0.02142933333333334
layer 2_1: im2col time: 0.014976
layer 2_1: kn2row time: 0.014976
layer 2_1: wino time: 0.014976
layer 2_2: im2col time: 0.347904
layer 2_2: kn2row time: 0.32128
layer 2_2: wino time: 0.163264
layer 3 cost vector shape: (3,)
layer 3 total: im2col time: 0.152052
layer 3 total: kn2row time: 0.136404
layer 3 total: wino time: 0.10816355555555557
layer 3 mixed total 0.10816355555555557
layer 4 total: im2col time: 0.33243999999999996
layer 4 total: kn2row time: 0.307416
layer 4 total: wino time: 0.21056355555555556
layer 4 mixed total 0.21056355555555556
layer 5 total: im2col time:

# File Writing: .in for PBQP solver

In [10]:
# Write the solver input: node count, edge count, one cost record per node
# (3 stem CONV nodes, 6 inception CONV nodes, 1 converging node), then one record
# per edge. The context manager closes the file even if a writer raises.
with open("GN_incep.in","w+") as f:
    f.write("10\n") #num nodes
    f.write("12\n") #num edges
    single_conv_node_cost_write(f,cost_vectors_np[0],0)
    single_conv_node_cost_write(f,cost_vectors_np[1],1)
    single_conv_node_cost_write(f,cost_vectors_np[2],2)
    inception_node_cost_write(f,cost_vectors_np[3],0,3)
    # This is the last converging node in inception: a single alternative of cost 0
    f.write("1\n")
    f.write("0\n\n")

    # Edge CONV0 -> CONV1
    f.write("0 1\n")
    f.write("3 3\n")
    for cost in transition_vector[0]:
        f.write("%f " %cost)
    # Edge CONV1 -> CONV2
    f.write("\n1 2\n")
    f.write("3 3\n")
    for cost in transition_vector[1]:
        f.write("%f " %cost)
    f.write("\n")
    # The ten edges of inception 3a; node ids start at 2 (the last stem CONV)
    inception_3a_T_cost_write(f,transition_vector[2],0,2)

