In [75]:
#!/usr/bin/env python3

import pandas as pd
import errno    
import os
import numpy as np
import math

from sklearn.metrics import r2_score

'''
Author: Enrico Ceccolini
    TODO write the description
'''

#datadir = "/datasets/eurora_data/db/"
datadir = "/datasets/eurora_data/db1/"

# settings
interval_comment = "WholeData"
suffix = "_1min_"
# settings Andrea
andrea_start_time = pd.to_datetime('2014-03-31')
andrea_end_time = pd.to_datetime('2014-05-01')
# settings Alina
alina_start_time = pd.to_datetime('2014-09-30')
alina_end_time = pd.to_datetime('2014-11-01')

infile_large_jobs = datadir + "CPUs/" + interval_comment + "/large_jobs_real_pow_final.csv"
interval_large_jobs = pd.read_csv(infile_large_jobs, index_col=0)

nodes 1-16 and 25-32 have a maximum frequency of 2.1GHz (CPUs-2100), nodes 17-24 have a maximum frequency of 2.2GHz (CPUs-2200), and nodes 33-64 have a maximum frequency of 3.1GHz (CPUs-3100). For the accelerators, nodes from 0 to 32 embed Intel Xeon Phi accelerators, whereas nodes from 33 to 64 embed Nvidia Kepler GPUs.

In [76]:
interval_large_jobs.head(1)

Unnamed: 0,job_id_string,queue,start_time,run_start_time,end_time,user,node_req,cpu_req,mem_req,time_req,exit_status,real_pow,runned_alone,real_pow_quality,n_2_1,n_2_2,n_3_1,gpu_req,mic_req,job_name
0,498458.node129,parallel,2014-03-31 00:30:02,2014-03-31 00:30:02,2014-03-31 01:22:40,alupi000@node129.eurora.cineca.it,1,16,14,04:00,COMPLETED,0.0,True,0.0,0,1,0,,,rth2e6t0


In [77]:
interval_large_jobs.shape

(64000, 20)

### Compute the mean core power consumption for the first type of cpu (2.1 GHz) 

In [78]:
interval_large_jobs_2_1 = interval_large_jobs[interval_large_jobs['node_req'] == interval_large_jobs['n_2_1']]
print("There are {} jobs completely running on this type of cpu".format(interval_large_jobs_2_1.shape[0]))
mean_core_2_1 = interval_large_jobs_2_1['real_pow'].sum() / interval_large_jobs_2_1['cpu_req'].sum()
print("The average per core is {} W".format(mean_core_2_1))
interval_large_jobs_2_1['predicted'] = mean_core_2_1 * interval_large_jobs_2_1['cpu_req']
print("Accuracy: {}".format(r2_score(interval_large_jobs_2_1['real_pow'], interval_large_jobs_2_1['predicted'])))

### Esclude the jobs with low quality of approximation
quality_large_jobs_2_1 = interval_large_jobs_2_1[interval_large_jobs_2_1['real_pow'] != 0]
quality_large_jobs_2_1 = quality_large_jobs_2_1[quality_large_jobs_2_1['real_pow_quality'] != 0]
print("There are {} jobs completely running on this type of cpu with that quality".format(quality_large_jobs_2_1.shape[0]))
mean_core_2_1 = quality_large_jobs_2_1['real_pow'].sum() / quality_large_jobs_2_1['cpu_req'].sum()
print("The average per core is {} W".format(mean_core_2_1))
quality_large_jobs_2_1['predicted'] = mean_core_2_1 * quality_large_jobs_2_1['cpu_req']
print("Accuracy: {}".format(r2_score(quality_large_jobs_2_1['real_pow'], quality_large_jobs_2_1['predicted'])))

### Esclude jobs that ran concurrently
quality_large_jobs_2_1 = interval_large_jobs_2_1[interval_large_jobs_2_1['real_pow'] != 0]
quality_large_jobs_2_1 = quality_large_jobs_2_1[quality_large_jobs_2_1['real_pow_quality'] != 0]
quality_large_jobs_2_1 = quality_large_jobs_2_1[quality_large_jobs_2_1['runned_alone'] == True]
print("There are {} jobs completely running on this type of cpu with that quality".format(quality_large_jobs_2_1.shape[0]))
mean_core_2_1 = quality_large_jobs_2_1['real_pow'].sum() / quality_large_jobs_2_1['cpu_req'].sum()
print("The average per core is {} W".format(mean_core_2_1))
quality_large_jobs_2_1['predicted'] = mean_core_2_1 * quality_large_jobs_2_1['cpu_req']
print("Accuracy: {}".format(r2_score(quality_large_jobs_2_1['real_pow'], quality_large_jobs_2_1['predicted'])))

There are 13265 jobs completely running on this type of cpu
The average per core is 11.614662777744632 W
Accuracy: 0.604018857666945
There are 12680 jobs completely running on this type of cpu with that quality
The average per core is 12.188997397054441 W
Accuracy: 0.6961530048060831
There are 10089 jobs completely running on this type of cpu with that quality
The average per core is 11.928331731218778 W
Accuracy: 0.6596372349292074


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


### Andrea's interval

In [80]:
andrea_interval_large_jobs =interval_large_jobs[pd.to_datetime(interval_large_jobs['run_start_time']) >= andrea_start_time]
andrea_interval_large_jobs =andrea_interval_large_jobs[pd.to_datetime(andrea_interval_large_jobs['run_start_time']) <= andrea_end_time]

In [81]:
interval_large_jobs_2_1 = andrea_interval_large_jobs[andrea_interval_large_jobs['node_req'] == andrea_interval_large_jobs['n_2_1']]
print("There are {} jobs completely running on this type of cpu".format(interval_large_jobs_2_1.shape[0]))
mean_core_2_1 = interval_large_jobs_2_1['real_pow'].sum() / interval_large_jobs_2_1['cpu_req'].sum()
print("The average per core is {} W".format(mean_core_2_1))
interval_large_jobs_2_1['predicted'] = mean_core_2_1 * interval_large_jobs_2_1['cpu_req']
print("Accuracy: {}".format(r2_score(interval_large_jobs_2_1['real_pow'], interval_large_jobs_2_1['predicted'])))

### Esclude the jobs with low quality of approximation
quality_large_jobs_2_1 = interval_large_jobs_2_1[interval_large_jobs_2_1['real_pow'] != 0]
quality_large_jobs_2_1 = quality_large_jobs_2_1[quality_large_jobs_2_1['real_pow_quality'] != 0]
print("There are {} jobs completely running on this type of cpu with that quality".format(quality_large_jobs_2_1.shape[0]))
mean_core_2_1 = quality_large_jobs_2_1['real_pow'].sum() / quality_large_jobs_2_1['cpu_req'].sum()
print("The average per core is {} W".format(mean_core_2_1))
quality_large_jobs_2_1['predicted'] = mean_core_2_1 * quality_large_jobs_2_1['cpu_req']
print("Accuracy: {}".format(r2_score(quality_large_jobs_2_1['real_pow'], quality_large_jobs_2_1['predicted'])))

### Esclude jobs that ran concurrently
quality_large_jobs_2_1 = interval_large_jobs_2_1[interval_large_jobs_2_1['real_pow'] != 0]
quality_large_jobs_2_1 = quality_large_jobs_2_1[quality_large_jobs_2_1['real_pow_quality'] != 0]
quality_large_jobs_2_1 = quality_large_jobs_2_1[quality_large_jobs_2_1['runned_alone'] == True]
print("There are {} jobs completely running on this type of cpu with that quality".format(quality_large_jobs_2_1.shape[0]))
mean_core_2_1 = quality_large_jobs_2_1['real_pow'].sum() / quality_large_jobs_2_1['cpu_req'].sum()
print("The average per core is {} W".format(mean_core_2_1))
quality_large_jobs_2_1['predicted'] = mean_core_2_1 * quality_large_jobs_2_1['cpu_req']
print("Accuracy: {}".format(r2_score(quality_large_jobs_2_1['real_pow'], quality_large_jobs_2_1['predicted'])))

There are 5064 jobs completely running on this type of cpu
The average per core is 12.143704579416486 W
Accuracy: 0.6683120957773678
There are 4900 jobs completely running on this type of cpu with that quality
The average per core is 12.529485612237833 W
Accuracy: 0.7033161874156395
There are 3450 jobs completely running on this type of cpu with that quality
The average per core is 11.986636720341757 W
Accuracy: 0.6820671373736893


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


### Compute the mean core power consumption for the first type of cpu (2.2 GHz) 

In [85]:
interval_large_jobs_2_2 = interval_large_jobs[interval_large_jobs['node_req'] == interval_large_jobs['n_2_2']]
print("There are {} jobs completely running on this type of cpu".format(interval_large_jobs_2_2.shape[0]))
mean_core_2_2 = interval_large_jobs_2_2['real_pow'].sum() / interval_large_jobs_2_2['cpu_req'].sum()
print("The average per core is {} W".format(mean_core_2_2))
interval_large_jobs_2_2['predicted'] = mean_core_2_2 * interval_large_jobs_2_2['cpu_req']
print("Accuracy: {}".format(r2_score(interval_large_jobs_2_2['real_pow'], interval_large_jobs_2_2['predicted'])))

quality_large_jobs_2_2 = interval_large_jobs_2_2[interval_large_jobs_2_2['real_pow'] != 0]
quality_large_jobs_2_2 = quality_large_jobs_2_2[quality_large_jobs_2_2['real_pow_quality'] != 0]
print("There are {} jobs completely running on this type of cpu with that quality".format(quality_large_jobs_2_2.shape[0]))
mean_core_2_2 = quality_large_jobs_2_2['real_pow'].sum() / quality_large_jobs_2_2['cpu_req'].sum()
print("The average per core is {} W".format(mean_core_2_2))
quality_large_jobs_2_2['predicted'] = mean_core_2_2 * quality_large_jobs_2_2['cpu_req']
print("Accuracy: {}".format(r2_score(quality_large_jobs_2_2['real_pow'], quality_large_jobs_2_2['predicted'])))

There are 4570 jobs completely running on this type of cpu
The average per core is 13.80509054541727 W
Accuracy: 0.44371127080645256
There are 4294 jobs completely running on this type of cpu with that quality
The average per core is 14.664222292746754 W
Accuracy: 0.534215782066189


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


### Andrea's interval

In [86]:
interval_large_jobs_2_2 = andrea_interval_large_jobs[andrea_interval_large_jobs['node_req'] == andrea_interval_large_jobs['n_2_2']]
print("There are {} jobs completely running on this type of cpu".format(interval_large_jobs_2_2.shape[0]))
mean_core_2_2 = interval_large_jobs_2_2['real_pow'].sum() / interval_large_jobs_2_2['cpu_req'].sum()
print("The average per core is {} W".format(mean_core_2_2))
interval_large_jobs_2_2['predicted'] = mean_core_2_2 * interval_large_jobs_2_2['cpu_req']
print("Accuracy: {}".format(r2_score(interval_large_jobs_2_2['real_pow'], interval_large_jobs_2_2['predicted'])))

quality_large_jobs_2_2 = interval_large_jobs_2_2[interval_large_jobs_2_2['real_pow'] != 0]
quality_large_jobs_2_2 = quality_large_jobs_2_2[quality_large_jobs_2_2['real_pow_quality'] != 0]
print("There are {} jobs completely running on this type of cpu with that quality".format(quality_large_jobs_2_2.shape[0]))
mean_core_2_2 = quality_large_jobs_2_2['real_pow'].sum() / quality_large_jobs_2_2['cpu_req'].sum()
print("The average per core is {} W".format(mean_core_2_2))
quality_large_jobs_2_2['predicted'] = mean_core_2_2 * quality_large_jobs_2_2['cpu_req']
print("Accuracy: {}".format(r2_score(quality_large_jobs_2_2['real_pow'], quality_large_jobs_2_2['predicted'])))

There are 1428 jobs completely running on this type of cpu
The average per core is 14.932216042263958 W
Accuracy: 0.5294936382852904
There are 1362 jobs completely running on this type of cpu with that quality
The average per core is 15.45302698203022 W
Accuracy: 0.5504581852733055


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


### Compute the mean core power consumption for the first type of cpu (3.1 GHz) 

In [83]:
interval_large_jobs_3_1 = interval_large_jobs[interval_large_jobs['node_req'] == interval_large_jobs['n_3_1']]
print("There are {} jobs completely running on this type of cpu".format(interval_large_jobs_3_1.shape[0]))
mean_core_3_1 = interval_large_jobs_3_1['real_pow'].sum() / interval_large_jobs_3_1['cpu_req'].sum()
print("The average per core is {} W".format(mean_core_3_1))
interval_large_jobs_3_1['predicted'] = mean_core_3_1 * interval_large_jobs_3_1['cpu_req']
print("Accuracy: {}".format(r2_score(interval_large_jobs_3_1['real_pow'], interval_large_jobs_3_1['predicted'])))

quality_large_jobs_3_1 = interval_large_jobs_3_1[interval_large_jobs_3_1['real_pow'] != 0]
quality_large_jobs_3_1 = quality_large_jobs_3_1[quality_large_jobs_3_1['real_pow_quality'] != 0]
print("There are {} jobs completely running on this type of cpu with that quality".format(quality_large_jobs_3_1.shape[0]))
mean_core_3_1 = quality_large_jobs_3_1['real_pow'].sum() / quality_large_jobs_3_1['cpu_req'].sum()
print("The average per core is {} W".format(mean_core_3_1))
quality_large_jobs_3_1['predicted'] = mean_core_3_1 * quality_large_jobs_3_1['cpu_req']
print("Accuracy: {}".format(r2_score(quality_large_jobs_3_1['real_pow'], quality_large_jobs_3_1['predicted'])))

There are 43708 jobs completely running on this type of cpu
The average per core is 24.842150667100178 W
Accuracy: 0.5806440166370469
There are 38419 jobs completely running on this type of cpu with that quality
The average per core is 27.562140558509466 W
Accuracy: 0.6072288411283724


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


### Andrea's interval

In [87]:
interval_large_jobs_3_1 = andrea_interval_large_jobs[andrea_interval_large_jobs['node_req'] == andrea_interval_large_jobs['n_3_1']]
print("There are {} jobs completely running on this type of cpu".format(interval_large_jobs_3_1.shape[0]))
mean_core_3_1 = interval_large_jobs_3_1['real_pow'].sum() / interval_large_jobs_3_1['cpu_req'].sum()
print("The average per core is {} W".format(mean_core_3_1))
interval_large_jobs_3_1['predicted'] = mean_core_3_1 * interval_large_jobs_3_1['cpu_req']
print("Accuracy: {}".format(r2_score(interval_large_jobs_3_1['real_pow'], interval_large_jobs_3_1['predicted'])))

quality_large_jobs_3_1 = interval_large_jobs_3_1[interval_large_jobs_3_1['real_pow'] != 0]
quality_large_jobs_3_1 = quality_large_jobs_3_1[quality_large_jobs_3_1['real_pow_quality'] != 0]
print("There are {} jobs completely running on this type of cpu with that quality".format(quality_large_jobs_3_1.shape[0]))
mean_core_3_1 = quality_large_jobs_3_1['real_pow'].sum() / quality_large_jobs_3_1['cpu_req'].sum()
print("The average per core is {} W".format(mean_core_3_1))
quality_large_jobs_3_1['predicted'] = mean_core_3_1 * quality_large_jobs_3_1['cpu_req']
print("Accuracy: {}".format(r2_score(quality_large_jobs_3_1['real_pow'], quality_large_jobs_3_1['predicted'])))

There are 15587 jobs completely running on this type of cpu
The average per core is 22.25328132443491 W
Accuracy: 0.5531975794006317
There are 13793 jobs completely running on this type of cpu with that quality
The average per core is 25.079102763361274 W
Accuracy: 0.6538735694817891


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


### Debug

In [32]:
interval_large_jobs = interval_large_jobs[interval_large_jobs['real_pow_quality'] != 0]
interval_large_jobs.shape

(59542, 19)

In [33]:
interval_large_jobs = interval_large_jobs[interval_large_jobs['real_pow'] != 0]
interval_large_jobs.shape

(57658, 19)

In [35]:
interval_large_jobs_2_1 = interval_large_jobs[interval_large_jobs['node_req'] == interval_large_jobs['n_2_1']]

In [36]:
interval_large_jobs_2_1.shape

(12680, 19)

In [37]:
mean_core_2_1 = interval_large_jobs_2_1['real_pow'].sum() / interval_large_jobs_2_1['cpu_req'].sum()
print(mean_core_2_1)

In [39]:
interval_large_jobs_2_1['predicted'] = mean_core_2_1 * interval_large_jobs_2_1['cpu_req']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [40]:
#interval_large_jobs_2_1

In [41]:
print("approximation accuracy: {}".format(r2_score(interval_large_jobs_2_1['real_pow'], interval_large_jobs_2_1['predicted'])))

approximation accuracy: 0.6961530048060831


(12327, 20)

In [44]:
interval_large_jobs_2_1_top_quality = interval_large_jobs_2_1[interval_large_jobs_2_1['real_pow_quality'] == 1]
interval_large_jobs_2_1_top_quality.shape
mean_core_2_1_top_quality = interval_large_jobs_2_1['real_pow'].sum() / interval_large_jobs_2_1['cpu_req'].sum()
print(mean_core_2_1_top_quality)
interval_large_jobs_2_1_top_quality['predicted'] = mean_core_2_1_top_quality * interval_large_jobs_2_1_top_quality['cpu_req']
print("approximation accuracy: {}".format(r2_score(interval_large_jobs_2_1_top_quality['real_pow'], interval_large_jobs_2_1_top_quality['predicted'])))

12.188997397054441
approximation accuracy: 0.6984105718416773


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
