In [1]:
import gurobipy as gu
from gurobipy import GRB
import pandas as pd
from datetime import datetime
import numpy as np
import os.path as op
import os

directory = '/data/bionets/datasets/galaxy/'
configuration_dir = op.join(directory, 'new_configurations')
os.makedirs(configuration_dir, exist_ok = True)


In [2]:

jan2021 = pd.read_csv('/data/bionets/ib27yhof/galaxy-temp-data/ci_test_2020_01.csv')
jan2021['run_min'] = jan2021.tool_runtime_in_seconds/60
jan2021['cpu_percent'] = jan2021.tool_cpu_usage_in_nanoseconds/(jan2021.tool_runtime_in_seconds*10**9)


In [3]:

sub = jan2021[jan2021.job_created_at<='2020-01-02']
sub = sub[sub.job_ended_at<='2020-01-02']

sub['job_started_to_run_at'] =pd.to_datetime(sub['job_started_to_run_at'])
sub['job_ended_at'] =pd.to_datetime(sub['job_ended_at'])


In [4]:
computers = pd.read_csv('/data/bionets/datasets/galaxy/hostname-config-full.csv')
ul = np.where(['upload' not in x for x in computers.hostname])[0]
computers = computers.iloc[ul,:]
computers = computers[computers.hostname.isin(sub.hostname)]
computers = computers.reset_index()
computers['host_number'] = range(computers.shape[0])
max_cpu_per_machine = {}
max_ram_per_machine = {}

for c in range(len(computers.hostname)):
    max_ram_per_machine[computers.hostname[c]] = int(computers.approx_memory[c])
    max_cpu_per_machine[computers.hostname[c]] = int(computers.approx_cores[c])

In [5]:
sub = sub[sub.hostname.isin(computers.hostname)]
sub = sub.merge(computers.loc[:,['hostname', 'host_number']], on = 'hostname')

In [11]:
ntimeperiods = 24*60
job_delay = 12*60
# the job data is in seconds.
min_timeslots = 1
sec_timslots = 60
timeslot_factor = min_timeslots*sec_timslots
#carbon intensity is reported in 15 minute intervals
##CAUTION. Not 
ci_repeat_factor = 15/(min_timeslots*(sec_timslots/60))

In [None]:
ci_repeat_factor

In [8]:
len(dd)

NameError: name 'dd' is not defined

In [12]:


dd = [int(np.floor(pd.Timedelta(pd.to_datetime(x)- pd.to_datetime(sub.job_started_to_run_at[0])).total_seconds()/timeslot_factor)) for x in sub.job_started_to_run_at]
sub["arrival_time"] = dd

# list of machines
nmachines = len(pd.unique(sub.hostname))
machines = computers.hostname.tolist()
njobs = len(pd.unique(sub.job_id))

# variables indexed by job
average_load = list(sub.cpu_percent)
requested_cpus = list(sub.tool_requested_cores)
requested_memory = list(sub.tool_requested_memory)

# variables indexed by timeslots
job_durations = list(sub.tool_runtime_in_seconds)
job_duration_time_slots = [int(np.ceil(j/timeslot_factor)) for j in job_durations]
sub['job_duration'] = job_duration_time_slots
job_arrival_times = list(sub.arrival_time)
allowed_starting_times = [[int(job_arrival_times[j]+i) for i in range(job_delay)] for j in range(njobs)]
allowed_running_times = [[int(job_arrival_times[j]+i) for i in range(job_delay)] + [allowed_starting_times[j][-1]+d for  d in range(1,job_duration_time_slots[j])] for j in range(njobs)]

carbon_intensity = pd.read_csv('/data/bionets/og86asub/galaxy_optimization/DE-actual-15.csv')
carbon_intensity = carbon_intensity.loc[:, ['startTime', 'ci1', 'ci2', 'ci3', 'ci4', 'ci5']]
carbon_intensity['start_timestamp'] = carbon_intensity['startTime'].apply(lambda x: datetime.strptime(str(x), '%Y%m%d%H%M'))
carbon_intensity['start_timestamp'] = pd.to_datetime(carbon_intensity['start_timestamp'])
carbon_intensity = carbon_intensity.loc[np.repeat(carbon_intensity.index, ci_repeat_factor)].reset_index(drop=True)
carbon_intensity = carbon_intensity[carbon_intensity.start_timestamp>=(pd.to_datetime(sub.job_started_to_run_at.min())-pd.Timedelta(minutes = 15))]
carbon_intensity_per_timeslot = list(carbon_intensity.ci3)
carbon_intensity = carbon_intensity[carbon_intensity.start_timestamp<=sub.job_ended_at.max()]


In [14]:
sub = sub.reset_index()

In [13]:
# Models
m = gu.Model('galaxy')
m.setParam('TimeLimit', 120*60)
# add a binary variable which tells us if job has started on machine M
x = m.addVars(
    [
    (j, nm, a)
    for j in range(njobs)
    for nm in range(nmachines)
    for a in allowed_starting_times[j]
    ],
    vtype=GRB.BINARY, name="x"
)

y = m.addVars(
    [
    (j, nm, a)
    for j in range(njobs)
    for nm in range(nmachines)
    for a in allowed_running_times[j]
    ],
    vtype=GRB.BINARY, name="y",
)

Set parameter TimeLimit to value 7200


In [None]:
y[(0,0,t)]

In [15]:
print('Adding CPU and RAM constraints')
## Do not exceed max capacity of servers
## (5)
yk = gu.tuplelist(y.keys())
max_cpu = m.addConstrs(gu.quicksum(requested_cpus[j]*y[j,m,t]  for j,m, t in yk.select('*', m, t)) <= max_cpu_per_machine[machines[m]] for m in range(nmachines) for t in range(ntimeperiods))

## max ram per machine cannot be exceeded
# (6)
max_ram = m.addConstrs(gu.quicksum(requested_memory[j]*y[j,m,t]  for j,m, t in yk.select('*', m, t) ) <= max_ram_per_machine[machines[m]] for m in range(nmachines) for  t in range(ntimeperiods))

Adding CPU and RAM constraints


KeyError: -1

In [16]:
## (2)
for j in range(njobs):
    jobstart = m.addConstr(gu.quicksum(x.select(j, '*', '*')) == 1)

# (3)
for j, i, tp in x.keys():
    for t in range(job_duration_time_slots[j]):
        tt = tp+t
        if (j,i,tt) in y and (j, i, tp) in x:
            timedelay = m.addConstr(y[j,i,tt]>=x[j,i,tp])

print('Defining objective')
#(1)
carbon_emission = gu.quicksum(carbon_intensity_per_timeslot[t]*average_load[j]*y[j,i, t] for j,i,t in y.keys() )

m.setObjective(carbon_emission, GRB.MINIMIZE)

Defining objective


In [17]:
for j in range(njobs):
    for nm in range(nmachines):
        for a in allowed_starting_times[j]:
            x[(j, nm, a)].Start = 0

In [None]:
for j in range(njobs):
    for nm in range(nmachines):
        for a in allowed_running_times[j]:
            y[(j, nm, a)].Start = 0

In [22]:
sub['job_number'] = range(sub.shape[0])
sub = sub.reset_index()


In [42]:

for ix, row in sub.iterrows():

    x[(row['job_number'], row['host_number'], row['arrival_time'])].Start = 1.0
    #x[(row['job_number'], row['host_number'], row['arrival_time'])].lb = 1.0
    #x[(row['job_number'], row['host_number'], row['arrival_time'])].ub = 1.0
    #print(x[(row['job_number'], row['host_number'], row['arrival_time'])].Start)
    for i in range(row['job_duration']):
        at = int(row['arrival_time']+i)
        
        y[(row['job_number'], row['host_number'],at)].Start = 1.0
        #y[(row['job_number'], row['host_number'],at)].ub = 1.0
        #y[(row['job_number'], row['host_number'],at)].lb = 1.0

#for (j, nm, a), value in initial_y.items():
#    y[j, nm, a].start = value

In [46]:
m.update()

In [60]:
sub.loc[sub['host_number']==40, ['tool_requested_cores', 'job_number', 'host_number', 'job_duration', 'arrival_time']]

Unnamed: 0,tool_requested_cores,job_number,host_number,job_duration,arrival_time
394,10.0,394,40,4,1417
402,10.0,402,40,1,1422
498,1.0,498,40,1,1425
501,10.0,501,40,4,1419
502,10.0,502,40,4,1419
506,10.0,506,40,3,1426
508,10.0,508,40,1,1428
510,10.0,510,40,1,1422
512,10.0,512,40,1,1423
514,10.0,514,40,1,1428


In [55]:
con = m.getConstrByName('R59022')
print(f"{m.getRow(con)} {con.Sense} {con.RHS}")

y[71,40,1422] + y[72,40,1422] + 10.0 y[205,40,1422] + y[252,40,1422] + y[269,40,1422] + y[294,40,1422] + y[306,40,1422] + y[351,40,1422] + y[352,40,1422] + 8.0 y[357,40,1422] + y[367,40,1422] + y[374,40,1422] + y[375,40,1422] + y[376,40,1422] + y[377,40,1422] + y[378,40,1422] + y[386,40,1422] + 10.0 y[394,40,1422] + 10.0 y[402,40,1422] + 10.0 y[406,40,1422] + 10.0 y[407,40,1422] + 10.0 y[408,40,1422] + 10.0 y[409,40,1422] + y[433,40,1422] + y[440,40,1422] + y[442,40,1422] + y[443,40,1422] + y[445,40,1422] + 10.0 y[448,40,1422] + 10.0 y[450,40,1422] + 10.0 y[451,40,1422] + 10.0 y[452,40,1422] + 10.0 y[453,40,1422] + 10.0 y[454,40,1422] + 10.0 y[455,40,1422] + 10.0 y[456,40,1422] + 10.0 y[458,40,1422] + 10.0 y[459,40,1422] + 10.0 y[460,40,1422] + 10.0 y[462,40,1422] + 10.0 y[463,40,1422] + y[472,40,1422] + y[473,40,1422] + y[474,40,1422] + y[475,40,1422] + y[476,40,1422] + y[477,40,1422] + y[478,40,1422] + y[479,40,1422] + y[480,40,1422] + 8.0 y[481,40,1422] + y[482,40,1422] + y[483,40,1

In [49]:
import re

# Example string
input_str = "y[71,40,1422] + y[72,40,1422] + 10.0 y[205,40,1422] + y[252,40,1422] + y[269,40,1422] + y[294,40,1422] + y[306,40,1422] + y[351,40,1422] + y[352,40,1422] + 8.0 y[357,40,1422] + y[367,40,1422] + y[374,40,1422] + y[375,40,1422] + y[376,40,1422] + y[377,40,1422] + y[378,40,1422] + y[386,40,1422] + 10.0 y[394,40,1422] + 10.0 y[402,40,1422] + 10.0 y[406,40,1422] + 10.0 y[407,40,1422] + 10.0 y[408,40,1422] + 10.0 y[409,40,1422] + y[433,40,1422] + y[440,40,1422] + y[442,40,1422] + y[443,40,1422] + y[445,40,1422] + 10.0 y[448,40,1422] + 10.0 y[450,40,1422] + 10.0 y[451,40,1422] + 10.0 y[452,40,1422] + 10.0 y[453,40,1422] + 10.0 y[454,40,1422] + 10.0 y[455,40,1422] + 10.0 y[456,40,1422] + 10.0 y[458,40,1422] + 10.0 y[459,40,1422] + 10.0 y[460,40,1422] + 10.0 y[462,40,1422] + 10.0 y[463,40,1422] + y[472,40,1422] + y[473,40,1422] + y[474,40,1422] + y[475,40,1422] + y[476,40,1422] + y[477,40,1422] + y[478,40,1422] + y[479,40,1422] + y[480,40,1422] + 8.0 y[481,40,1422] + y[482,40,1422] + y[483,40,1422] + y[484,40,1422] + y[485,40,1422] + y[486,40,1422] + y[487,40,1422] + y[488,40,1422] + y[489,40,1422] + y[490,40,1422] + y[491,40,1422] + y[492,40,1422] + y[493,40,1422] + y[494,40,1422] + y[495,40,1422] + y[496,40,1422] + y[497,40,1422] + y[499,40,1422] + 10.0 y[501,40,1422] + 10.0 y[502,40,1422] + 10.0 y[503,40,1422] + 10.0 y[504,40,1422] + 10.0 y[505,40,1422] + 10.0 y[507,40,1422] + 10.0 y[510,40,1422] + 10.0 y[511,40,1422] + 10.0 y[513,40,1422] + 10.0 y[515,40,1422] + y[516,40,1422] + y[517,40,1422] + y[518,40,1422] + y[519,40,1422] + y[520,40,1422] + y[521,40,1422] + y[522,40,1422] + y[523,40,1422] + y[524,40,1422] + y[525,40,1422] + y[527,40,1422] + y[528,40,1422] + y[529,40,1422] + y[531,40,1422] + 10.0 y[532,40,1422] + 10.0 y[535,40,1422] + 10.0 y[536,40,1422] + 10.0 y[537,40,1422] + 10.0 y[539,40,1422] + 10.0 y[544,40,1422] + 10.0 y[545,40,1422] + 10.0 y[547,40,1422] + y[549,40,1422] + 4.0 y[550,40,1422] + y[551,40,1422] + y[552,40,1422] + y[553,40,1422] + y[554,40,1422] + y[555,40,1422] + y[567,40,1422] + y[568,40,1422] + y[569,40,1422] + y[570,40,1422] + y[571,40,1422] + y[572,40,1422] + y[573,40,1422] + y[574,40,1422] + y[575,40,1422] + y[576,40,1422] + y[577,40,1422] + y[578,40,1422] + y[579,40,1422] + y[580,40,1422] + y[581,40,1422] + y[582,40,1422] + y[583,40,1422] + y[584,40,1422] + y[585,40,1422] + y[586,40,1422] + y[587,40,1422] + y[588,40,1422] + y[589,40,1422] + y[590,40,1422] + y[591,40,1422] + y[592,40,1422] + y[593,40,1422] + y[594,40,1422] + y[595,40,1422]"# Regex pattern to match the variable and its optional coefficient
pattern = re.compile(r'(?:(\d*\.\d+|\d+)\s*)?y\[(\d+),(\d+),(\d+)\]')

# Find matches
matches = pattern.findall(input_str)

# Extract and convert matches to tuples with coefficient
result = [(float(match[0]) if match[0] else 1.0, (int(match[1]), int(match[2]), int(match[3]))) for match in matches]

print(result)


[(1.0, (71, 40, 1422)), (1.0, (72, 40, 1422)), (10.0, (205, 40, 1422)), (1.0, (252, 40, 1422)), (1.0, (269, 40, 1422)), (1.0, (294, 40, 1422)), (1.0, (306, 40, 1422)), (1.0, (351, 40, 1422)), (1.0, (352, 40, 1422)), (8.0, (357, 40, 1422)), (1.0, (367, 40, 1422)), (1.0, (374, 40, 1422)), (1.0, (375, 40, 1422)), (1.0, (376, 40, 1422)), (1.0, (377, 40, 1422)), (1.0, (378, 40, 1422)), (1.0, (386, 40, 1422)), (10.0, (394, 40, 1422)), (10.0, (402, 40, 1422)), (10.0, (406, 40, 1422)), (10.0, (407, 40, 1422)), (10.0, (408, 40, 1422)), (10.0, (409, 40, 1422)), (1.0, (433, 40, 1422)), (1.0, (440, 40, 1422)), (1.0, (442, 40, 1422)), (1.0, (443, 40, 1422)), (1.0, (445, 40, 1422)), (10.0, (448, 40, 1422)), (10.0, (450, 40, 1422)), (10.0, (451, 40, 1422)), (10.0, (452, 40, 1422)), (10.0, (453, 40, 1422)), (10.0, (454, 40, 1422)), (10.0, (455, 40, 1422)), (10.0, (456, 40, 1422)), (10.0, (458, 40, 1422)), (10.0, (459, 40, 1422)), (10.0, (460, 40, 1422)), (10.0, (462, 40, 1422)), (10.0, (463, 40, 1422)

In [52]:
summi = 0
for r in result:
    summi += r[0]* y[r[1]].Start

In [53]:
summi

61.0

In [137]:
m.write("myfile.lp")

In [54]:
m.optimize()

Gurobi Optimizer version 11.0.2 build v11.0.2rc0 (linux64 - "Ubuntu 22.04.4 LTS")

CPU model: AMD EPYC 7402 24-Core Processor, instruction set [SSE2|AVX|AVX2]
Thread count: 48 physical cores, 96 logical processors, using up to 32 threads

Optimize a model with 5787222 rows, 1112326 columns and 13191972 nonzeros
Model fingerprint: 0xef8847f4
Variable types: 0 continuous, 1112326 integer (1112326 binary)
Coefficient statistics:
  Matrix range     [1e+00, 1e+05]
  Objective range  [4e-01, 2e+07]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 2e+06]

User MIP start did not produce a new incumbent solution
User MIP start violates constraint R59022 by 9.000000000

Found heuristic solution: objective 7.389099e+07
Presolve removed 116730 rows and 753 columns
Presolve time: 1.44s

Explored 0 nodes (0 simplex iterations) in 3.08 seconds (3.77 work units)
Thread count was 1 (of 96 available processors)

Solution count 1: 7.3891e+07 

Solve interrupted
Best objective 7.389099395705e+

In [None]:
#def create_new_configuration_df(m, carbon_intensity_per_timeslot):
xl = []
yl = []
for v in m.getVars():
    if v.VarName.startswith('x'):
        if v.x > 0:
            xl.append(split_line(v.VarName))
    if v.VarName.startswith('y'):
        if v.x > 0:
            yl.append(split_line(v.VarName))
data = pd.DataFrame(xl)
data.columns = ['job', 'machine', 'time_slot']
data['carbon_intensity_new'] = data['time_slot'].apply(lambda x: carbon_intensity_per_timeslot[int(x)])
#return data
    

In [None]:
def reverse_operation(dd, timeslot_factor, start_time):
    timedelta = pd.Timedelta(seconds=dd * timeslot_factor)
    new_date = start_time + timedelta
    return new_date
    
def split_line(line):
    parts = line.replace('[', ',').replace(']', '').split(',')
    name = parts[0].strip()
    numbers = [int(part.strip()) for part in parts[1:]]
    return  numbers

In [None]:
def calculateEnergyConsumption1(runtime_seconds, requested_cores):
    # result in KWh
    # Assumptions
    PUE = 1.6 # Power Usage Efficiency of the data center
    P_m = 0.3725 # Memory power draw,Watts/GB  
    P_c =  15.8  # In Watts Assuming Core-i5-10600K  ; value taken from http://calculator.green-algorithms.org/
    u_c = 0.50 # core usage factor 

    n_c = 'average load over runtime'
    t_hour = runtime_seconds / 3600
    def ceil_to_power_of_2(num):
        return int(2 ** np.ceil(np.log2(num)))
    #n_m = ceil_to_power_of_2(row["tool_requested_memory"])/1024    # size of memory avaialbe in GB 
    power = t_hour * (n_c * P_c ) * PUE * 0.001
    return power

In [None]:
configuration = create_new_configuration_df(m, carbon_intensity_per_timeslot)
configuration.to_csv(op.join(configuration_dir, 'jan_2021.tsv'), sep='\t')
data = pd.concat([configuration, sub], axis = 1)
data = data.dropna()
data = data.reset_index()
data['power_draw'] = [calculateEnergyConsumption1(data['tool_runtime_in_seconds'][i], data['tool_requested_cores'][i]) for i in range(data.shape[0])]
data['carbon_intensity_old'] = data['arrival_time'].apply(lambda x: carbon_intensity_per_timeslot[int(x)])



In [None]:
configuration

In [None]:
data['carbon_emission_orig'] = data['power_draw']*data['carbon_intensity_old']
data['carbon_emission'] = data['power_draw']*data['carbon_intensity']


In [None]:
np.sum(data['carbon_emission'])/1000

In [None]:
np.sum(data['carbon_emission_orig'])/1000