In [37]:
#!/usr/bin/env python3

import pandas as pd
import errno    
import os, sys

'''
Author: Enrico Ceccolini
    Given the raw data of Eurora (CORE_MEASUREMENT_FULL)
    and a time interval, it returns only the measurements
    of that period, sampled at 1 minute intervals
'''


# --- configuration -----------------------------------------------------------
# Root folder of the Eurora database export. The trailing slash matters:
# every path below is built by plain string concatenation.
datadir = "/datasets/eurora_data/db/"

jobs_file = datadir + 'jobs.csv'

# Time window to extract, and the label used for it in the output path.
# Alternative window, kept for reference:
#   interval_comment = "April_new"
#   start_time = pd.to_datetime('2014-04-01')
#   end_time = pd.to_datetime('2014-05-01')
interval_comment = "Whole"
start_time, end_time = pd.to_datetime('2014-03-31'), pd.to_datetime('2015-08-11')

# The result is written under datadir + 'CPUs/' + interval_comment
# (a former `mkdir_p(datadir + 'CPUs/' + interval_comment)` call is disabled).

# Two-digit node labels '01'..'64'; label '43' is skipped because it does not
# exist in the db.
cpus = ['{:02d}'.format(node_number) for node_number in range(1, 65) if node_number != 43]
cpus = ['05']  # test

# Node processed by the cells below.
cpu = '05'

print(cpu)
infile = ''.join([datadir, "CPUs/cpu", cpu, ".csv"])

# Load the raw measurements of the selected node.
# Columns relied on below: 'timestamp', 'cpu_id' and one column named after
# the node label (dropped right away).
whole_node_power_data = pd.read_csv(infile)
print("data rows {}".format(whole_node_power_data.shape))

# drop the column named after the node label
whole_node_power_data = whole_node_power_data.drop(columns=[str(cpu)])

# drop rows that are identical in every remaining column (same timestamp, cpu_id, ...)
whole_node_power_data = whole_node_power_data.drop_duplicates()
print("data rows after drop_duplicates {}".format(whole_node_power_data.shape[0]))

# Select the rows inside [start_time, end_time], both ends inclusive.
# The timestamps are parsed ONCE and reused for both bounds (parsing the whole
# column is the expensive part of this filter). The 'timestamp' column itself
# is left exactly as read from the CSV; it is converted in a later cell.
parsed_timestamps = pd.to_datetime(whole_node_power_data['timestamp'])
in_interval = (parsed_timestamps >= start_time) & (parsed_timestamps <= end_time)
interval_data = whole_node_power_data.loc[in_interval]
# release the helper series so they do not linger in the kernel namespace
del parsed_timestamps, in_interval
print("data row from {} to {}: {}".format(start_time, end_time, interval_data.shape[0]))

# split the data referring to cpu0 and cpu1 (the two values of 'cpu_id')
cpu0 = interval_data[interval_data['cpu_id'] == 0]
cpu1 = interval_data[interval_data['cpu_id'] == 1]
print("data row after split: {}".format(cpu0.shape[0]))
print("data row after split: {}".format(cpu1.shape[0]))

# within each cpu keep only the first row found for every timestamp
cpu0 = cpu0.drop_duplicates(subset=['timestamp'])
cpu1 = cpu1.drop_duplicates(subset=['timestamp'])
# NOTE: the label below still reads "after split", but these two counts are
# taken after the per-timestamp de-duplication (cpu0 first, then cpu1).
print("data row after split: {}".format(cpu0.shape[0]))
print("data row after split: {}".format(cpu1.shape[0]))




05
data rows (1825813, 6)
data rows after drop_duplicates 1738076
data row from 2014-03-31 00:00:00 to 2015-08-11 00:00:00: 1506577
data row after split: 755931
data row after split: 750646
data row after split: 753059
data row after split: 750527


In [43]:
# Align the start of the two series: when cpu0 holds fewer rows than cpu1,
# discard the cpu1 rows that come before the first cpu0 timestamp.
# 'timestamp' is compared here in the form it was read from the CSV.
if len(cpu0) < len(cpu1):
    first_cpu0_timestamp = cpu0['timestamp'].iloc[0]
    cpu1 = cpu1.loc[cpu1['timestamp'] >= first_cpu0_timestamp]

# row counts: cpu0 first, then cpu1
print("data row after split: {}".format(len(cpu0)))
print("data row after split: {}".format(len(cpu1)))

data row after split: 753059
data row after split: 750527


In [44]:
# Mirror of the previous alignment step: when cpu1 holds fewer rows than cpu0,
# discard the cpu0 rows that come before the first cpu1 timestamp.
# 'timestamp' is compared here in the form it was read from the CSV.
if len(cpu1) < len(cpu0):
    first_cpu1_timestamp = cpu1['timestamp'].iloc[0]
    cpu0 = cpu0.loc[cpu0['timestamp'] >= first_cpu1_timestamp]

# row counts: cpu0 first, then cpu1
print("data row after split: {}".format(len(cpu0)))
print("data row after split: {}".format(len(cpu1)))

data row after split: 750527
data row after split: 750527


In [47]:
# Convert the 'timestamp' column to pandas datetimes and make it the index of
# each per-cpu frame.
# `assign` builds a new frame instead of writing into cpu0/cpu1: both were
# obtained by boolean filtering, and attribute assignment on such frames
# (`cpu0.timestamp = ...`) can raise pandas' SettingWithCopyWarning.
cpu0 = cpu0.assign(timestamp=pd.to_datetime(cpu0['timestamp'])).set_index('timestamp')
cpu1 = cpu1.assign(timestamp=pd.to_datetime(cpu1['timestamp'])).set_index('timestamp')



In [48]:
# Downsample each cpu to one row per minute: every column of a 1-minute bin
# becomes the mean of the samples that fall into it
# (the original note gives the raw sampling period as 5 s).
cpu0_1min = cpu0.resample(rule='1Min').mean()
print("data rows {}".format(len(cpu0_1min)))

cpu1_1min = cpu1.resample(rule='1Min').mean()
print("data rows {}".format(len(cpu1_1min)))


data rows 132878
data rows 132878


In [49]:
# Prepare the two per-minute frames for the column-wise concat:
#   * drop 'cpu_id', which is no longer needed once the cpus are split;
#   * suffix every power column with the cpu number so the names do not clash.
# Only the columns are renamed. The former `index=str` argument applied `str`
# to every index label, turning the DatetimeIndex produced by the resampling
# into plain strings.
cpu0_1min = cpu0_1min.drop(columns=['cpu_id']).rename(
    columns={"pow_cpu": "pow_cpu_0", "pow_dram": "pow_dram_0", "pow_pkg": "pow_pkg_0"}
)
cpu1_1min = cpu1_1min.drop(columns=['cpu_id']).rename(
    columns={"pow_cpu": "pow_cpu_1", "pow_dram": "pow_dram_1", "pow_pkg": "pow_pkg_1"}
)



In [9]:
# preview the first two per-minute rows of cpu0
cpu0_1min.iloc[:2]

Unnamed: 0_level_0,pow_cpu_0,pow_dram_0,pow_pkg_0
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-03-31 00:00:00,37.854714,7.200114,58.686825
2014-03-31 00:01:00,38.793317,7.16525,59.658575


In [10]:
# number of per-minute rows for cpu0
len(cpu0_1min)

133089

In [11]:
# number of per-minute rows for cpu1
len(cpu1_1min)

132878

In [None]:
# Put the two per-cpu frames side by side, aligned on their index.
node_1min = pd.concat([cpu0_1min, cpu1_1min], axis=1)

### merge the power consumption to have only two values (one for each node's cpu)
# Per-cpu total = cpu + dram + pkg. Plain `+` is used on purpose: a minute with
# a missing component yields NaN rather than a partial sum.
# NOTE(review): if pow_pkg already includes pow_cpu (as package-level power
# counters often do), this sum counts the cores twice - confirm with the data source.
node_1min['pow_tot_0'] = node_1min['pow_cpu_0'] + node_1min['pow_dram_0'] + node_1min['pow_pkg_0']
node_1min['pow_tot_1'] = node_1min['pow_cpu_1'] + node_1min['pow_dram_1'] + node_1min['pow_pkg_1']
# keep only the two totals
node_1min = node_1min.drop(
    columns=['pow_cpu_0', 'pow_dram_0', 'pow_pkg_0', 'pow_cpu_1', 'pow_dram_1', 'pow_pkg_1']
)

# Write the result. `to_csv` does not create missing folders, so make sure the
# per-interval output directory exists first (no error if it already does).
outdir = datadir + 'CPUs/' + interval_comment
os.makedirs(outdir, exist_ok=True)
outfile = outdir + "/node" + cpu + "_1min_" + interval_comment + ".csv"
node_1min.to_csv(outfile)

print("-------------------")