In [47]:
#!/usr/bin/env python3

import pandas as pd
import errno
import os, sys


'''
Author: Enrico Ceccolini
    Given the raw data of Eurora (CORE_MEASUREMENT_FULL)
    and a time interval, it returns only the measurements
    of that period, sampled at 1 minute intervals
'''

interval_comment = "April"
start_time = pd.to_datetime('2014-04-01')
end_time = pd.to_datetime('2014-05-01')

datadir = "/datasets/eurora_data/db/"

cpu = '01'
infile = datadir + "CPUs/cpu" + cpu + ".csv"

# read dataset
whole_node_power_data = pd.read_csv(infile) # Read a comma-separated values (csv) file into DataFrame
print("there are {} jobs in the whole dataset".format(whole_node_power_data.shape[0]))

there are 7158716 jobs in the whole dataset


In [48]:
whole_node_power_data.head(1)

Unnamed: 0,01,cpu_id,timestamp,pow_cpu,pow_dram,pow_pkg
0,1,0,2014-03-24 13:56:30,2.016754,2.139374,17.738455


In [49]:
whole_node_power_data['01'].sum()

7158716

In [50]:
whole_node_power_data = whole_node_power_data.drop([str(cpu)], axis=1)

In [51]:
# drop useless columns
# whole_node_power_data = whole_node_power_data.drop(['01'], axis=1)

I notice that there are duplicated rows (same timestamp and cpu_id, quite strange).

In [52]:
# drop the duplicates rows (same timestamp, cpu_id, ...)
whole_node_power_data = whole_node_power_data.drop_duplicates()
print("data rows after drop_duplicates {}".format(whole_node_power_data.shape[0]))

data rows after drop_duplicates 7057475


drop the rows before the selected started time

In [75]:
# select data in the specified interval
#interval_data = whole_node_power_data[pd.to_datetime(whole_node_power_data['timestamp']) >= start_time]
#interval_data = interval_data[pd.to_datetime(interval_data['timestamp']) <= end_time]
#print("data row from {} to {}: {}".format(start_time, end_time, interval_data.shape[0]))

data row from 2014-04-01 00:00:00 to 2014-05-01 00:00:00: 1019287


In [76]:
# this is faster
interval_data = whole_node_power_data.loc[(pd.to_datetime(whole_node_power_data['timestamp']) >= start_time) & (pd.to_datetime(whole_node_power_data['timestamp']) <= end_time)]
print("data row from {} to {}: {}".format(start_time, end_time, interval_data.shape[0]))

data row from 2014-04-01 00:00:00 to 2014-05-01 00:00:00: 1019287


create two different dataFrame. One for cpu0 and one for cpu1

In [77]:
# split the data referring to the cpu0 and cpu0
cpu0 = interval_data[interval_data['cpu_id'] == 0]
cpu1 = interval_data[interval_data['cpu_id'] == 1]
print("data row after split: {}".format(cpu0.shape[0]))
print("data row after split: {}".format(cpu1.shape[0]))

data row after split: 509601
data row after split: 509686


In [78]:
# drop the duplicates rows 
cpu0 = cpu0.drop_duplicates(subset=['timestamp'])
cpu1 = cpu1.drop_duplicates(subset=['timestamp'])
print("data row after split: {}".format(cpu0.shape[0]))
print("data row after split: {}".format(cpu1.shape[0]))

data row after split: 509525
data row after split: 509525


convert the timestamp column to the pd datetime format

In [79]:
# convert the timestamp column to the Pandas timestamp format
cpu0.timestamp = pd.to_datetime(cpu0.timestamp)
cpu1.timestamp = pd.to_datetime(cpu1.timestamp)
cpu0.head(1)

Unnamed: 0,cpu_id,timestamp,pow_cpu,pow_dram,pow_pkg
290456,0,2014-04-01 00:00:04,3.400706,2.757682,19.308012


In [80]:
cpu0 = cpu0.set_index(["timestamp"])
cpu1 = cpu1.set_index(["timestamp"])
cpu0.head(5)

Unnamed: 0_level_0,cpu_id,pow_cpu,pow_dram,pow_pkg
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-04-01 00:00:04,0,3.400706,2.757682,19.308012
2014-04-01 00:00:09,0,2.020461,2.535905,17.75341
2014-04-01 00:00:16,0,2.066831,2.577469,17.766197
2014-04-01 00:00:21,0,1.983965,2.496961,17.676901
2014-04-01 00:00:26,0,2.166701,2.550643,17.985893


In [81]:
# resample data from 5 sec to 1 min, using the mean for the new values 
cpu0_1min = cpu0.resample('1Min').mean()
print("data rows {}".format(cpu0_1min.shape[0]))

data rows 43201


In [82]:
cpu1_1min = cpu1.resample('1Min').mean()
print("data rows {}".format(cpu1_1min.shape[0]))

data rows 43201


In [83]:
cpu0_1min.head(5)

Unnamed: 0_level_0,cpu_id,pow_cpu,pow_dram,pow_pkg
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-04-01 00:00:00,0.0,2.155305,2.507122,17.899565
2014-04-01 00:01:00,0.0,2.03338,2.47214,17.762189
2014-04-01 00:02:00,0.0,2.03756,2.526941,17.769899
2014-04-01 00:03:00,0.0,2.030268,2.487323,17.759689
2014-04-01 00:04:00,0.0,1.985083,2.500854,17.683834


In [84]:
cpu1_1min.head(5)

Unnamed: 0_level_0,cpu_id,pow_cpu,pow_dram,pow_pkg
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-04-01 00:00:00,1.0,2.461377,3.495659,17.988706
2014-04-01 00:01:00,1.0,2.43269,3.470205,17.946297
2014-04-01 00:02:00,1.0,2.400015,3.458248,17.913687
2014-04-01 00:03:00,1.0,2.395853,3.480569,17.908572
2014-04-01 00:04:00,1.0,2.357162,3.465722,17.842077


In [85]:
cpu0_1min = cpu0_1min.drop(['cpu_id'], axis=1)
cpu1_1min = cpu1_1min.drop(['cpu_id'], axis=1)

In [86]:
cpu0_1min = cpu0_1min.rename(index=str, columns={"pow_cpu": "pow_cpu_0", "pow_dram": "pow_dram_0", "pow_pkg": "pow_pkg_0"})
cpu1_1min = cpu1_1min.rename(index=str, columns={"pow_cpu": "pow_cpu_1", "pow_dram": "pow_dram_1", "pow_pkg": "pow_pkg_1"})
# cpu0_1min.head(1)

In [87]:
# concat
node_1min = pd.concat([cpu0_1min, cpu1_1min], axis=1)

In [88]:
node_1min.head(1)

Unnamed: 0_level_0,pow_cpu_0,pow_dram_0,pow_pkg_0,pow_cpu_1,pow_dram_1,pow_pkg_1
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-04-01 00:00:00,2.155305,2.507122,17.899565,2.461377,3.495659,17.988706


In [89]:
node_1min['pow_tot_0'] = cpu0_1min['pow_cpu_0'] + cpu0_1min['pow_dram_0'] + cpu0_1min['pow_pkg_0'] 
node_1min['pow_tot_1'] = cpu1_1min['pow_cpu_1'] + cpu1_1min['pow_dram_1'] + cpu1_1min['pow_pkg_1']

In [90]:
node_1min.head(1)

Unnamed: 0_level_0,pow_cpu_0,pow_dram_0,pow_pkg_0,pow_cpu_1,pow_dram_1,pow_pkg_1,pow_tot_0,pow_tot_1
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2014-04-01 00:00:00,2.155305,2.507122,17.899565,2.461377,3.495659,17.988706,22.561993,23.945743


In [91]:
node_1min = node_1min.drop(['pow_cpu_0', 'pow_dram_0', 'pow_pkg_0', 'pow_cpu_1', 'pow_dram_1', 'pow_pkg_1'], axis=1)

In [71]:
node_1min.head(1)

Unnamed: 0_level_0,pow_tot_0,pow_tot_1
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-04-01 00:00:00,22.561993,23.945743


In [19]:
pow_tot_1 = cpu0_1min['pow_cpu_0'] + cpu0_1min['pow_dram_0'] + cpu0_1min['pow_pkg_0']
pow_tot_2 = cpu1_1min['pow_cpu_1'] + cpu1_1min['pow_dram_1'] + cpu1_1min['pow_pkg_1']
print("first: {}, second: {}".format(pow_tot_1.shape, pow_tot_2.shape))

first: (43200,), second: (43200,)


In [41]:
# write
#outfile = datadir + 'CPUs/' + interval_comment + "/node" + cpu + "_1min_" + interval_comment + ".csv"
#node_1min.to_csv(outfile)