In [12]:
#!/usr/bin/env python3

import pandas as pd
import errno    
import os
import numpy as np
import math

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from scipy import stats

pd.set_option('display.max_rows', 4000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

'''
Author: Enrico Ceccolini
    TODO write the description
'''

#datadir = "/datasets/eurora_data/db/"
datadir = "/datasets/eurora_data/db1/"

# settings
interval_comment = "WholeData"
suffix = "_1min_"
# settings Andrea
andrea_start_time = pd.to_datetime('2014-03-31')
andrea_end_time = pd.to_datetime('2014-05-01')
# settings Alina
alina_start_time = pd.to_datetime('2014-09-30')
alina_end_time = pd.to_datetime('2014-11-01')

infile_large_jobs = datadir + "CPUs/" + interval_comment + "/large_jobs_real_pow_final.csv"
interval_large_jobs = pd.read_csv(infile_large_jobs, index_col=0)

nodes 1-16 and 25-32 have a maximum frequency of 2.1GHz (CPUs-2100), nodes 17-24 have a maximum frequency of 2.2GHz (CPUs-2200), and nodes 33-64 have a maximum frequency of 3.1GHz (CPUs-3100). For the accelerators, nodes from 0 to 32 embed Intel Xeon Phi accelerators, whereas nodes from 33 to 64 embed Nvidia Kepler GPUs.

In [13]:
interval_large_jobs.head(1)

Unnamed: 0,job_id_string,queue,start_time,run_start_time,end_time,user,node_req,cpu_req,mem_req,time_req,exit_status,real_pow,runned_alone,real_pow_quality,n_2_1,n_2_2,n_3_1,gpu_req,mic_req,job_name
0,498458.node129,parallel,2014-03-31 00:30:02,2014-03-31 00:30:02,2014-03-31 01:22:40,alupi000@node129.eurora.cineca.it,1,16,14,04:00,COMPLETED,0.0,True,0.0,0,1,0,,,rth2e6t0


In [14]:
interval_large_jobs.shape

(64000, 20)

In [15]:
interval_large_jobs = interval_large_jobs[interval_large_jobs['real_pow'] != 0]
interval_large_jobs = interval_large_jobs[interval_large_jobs['real_pow_quality'] != 0]

### Compute the mean core power consumption for the first type of cpu (2.1 GHz) 

In [16]:
grouped_user_data = interval_large_jobs.groupby('user')['job_id_string'].nunique().reset_index(name='counts')
print("{} distinct users".format(grouped_user_data.shape[0]))
grouped_user_data['count_2_1'] = 0
grouped_user_data['R-squared_2_1'] = 0.0
grouped_user_data['count_2_2'] = 0
grouped_user_data['R-squared_2_2'] = 0.0
grouped_user_data['count_3_1'] = 0
grouped_user_data['R-squared_3_1'] = 0.0

498 distinct users


In [17]:
grouped_user_data['counts'].sum()

57658

In [18]:
i = 0
for node_index, node_row in grouped_user_data.iterrows():
    user_jobs = interval_large_jobs[interval_large_jobs['user'] == node_row['user']]
    user_jobs_2_1 = user_jobs[user_jobs['node_req'] == user_jobs['n_2_1']]
    user_jobs_2_2 = user_jobs[user_jobs['node_req'] == user_jobs['n_2_2']]
    user_jobs_3_1 = user_jobs[user_jobs['node_req'] == user_jobs['n_3_1']]
    
    mean_core_2_1 = user_jobs_2_1['real_pow'].sum() / user_jobs_2_1['cpu_req'].sum()
    mean_core_2_2 = user_jobs_2_2['real_pow'].sum() / user_jobs_2_2['cpu_req'].sum()
    mean_core_3_1 = user_jobs_3_1['real_pow'].sum() / user_jobs_3_1['cpu_req'].sum()
    
    print(mean_core_2_1)
    print(mean_core_2_2)
    print(mean_core_3_1)
    
    user_jobs_2_1['predicted'] = mean_core_2_1 * user_jobs_2_1['cpu_req']
    user_jobs_2_2['predicted'] = mean_core_2_2 * user_jobs_2_2['cpu_req']
    user_jobs_3_1['predicted'] = mean_core_3_1 * user_jobs_3_1['cpu_req']
    
    if(user_jobs_2_1.shape[0] != 0):
        slope, intercept, r_value, p_value, std_err = stats.linregress(user_jobs_2_1['real_pow'], user_jobs_2_1['predicted'])
        grouped_user_data.loc[node_index, 'R-squared_2_1'] = r_value**2
        grouped_user_data.loc[node_index, 'count_2_1'] = user_jobs_2_1.shape[0]
    if(user_jobs_2_2.shape[0] != 0):
        slope, intercept, r_value, p_value, std_err = stats.linregress(user_jobs_2_2['real_pow'], user_jobs_2_2['predicted'])
        grouped_user_data.loc[node_index, 'R-squared_2_2'] = r_value**2
        grouped_user_data.loc[node_index, 'count_2_2'] = user_jobs_2_2.shape[0]
    if(user_jobs_3_1.shape[0] != 0):
        slope, intercept, r_value, p_value, std_err = stats.linregress(user_jobs_3_1['real_pow'], user_jobs_3_1['predicted'])
        grouped_user_data.loc[node_index, 'R-squared_3_1'] = r_value**2
        grouped_user_data.loc[node_index, 'count_3_1'] = user_jobs_3_1.shape[0]
        
    #if(user_jobs_2_1.shape[0] != 0):
    #    grouped_user_data.loc[node_index, 'accuracy_2_1'] = mean_squared_error(user_jobs_2_1['real_pow'], user_jobs_2_1['predicted'])
    #    grouped_user_data.loc[node_index, 'count_2_1'] = user_jobs_2_1.shape[0]
    #if(user_jobs_2_2.shape[0] != 0):
    #    grouped_user_data.loc[node_index, 'accuracy_2_2'] = mean_squared_error(user_jobs_2_2['real_pow'], user_jobs_2_2['predicted'])
    #    grouped_user_data.loc[node_index, 'count_2_2'] = user_jobs_2_2.shape[0]
    #if(user_jobs_3_1.shape[0] != 0):
    #    grouped_user_data.loc[node_index, 'accuracy_3_1'] = mean_squared_error(user_jobs_3_1['real_pow'], user_jobs_3_1['predicted'])
    #    grouped_user_data.loc[node_index, 'count_3_1'] = user_jobs_3_1.shape[0]
    
    i += 1
    print(i)

  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


12.1391586392346
nan
21.35730472198571
1
nan
nan
18.762206307725965
2
nan
nan
22.672278992955565
3
nan
nan
19.295763794626996
4
nan
nan
19.714580592222948
5
nan
nan
23.086447424342815
6
nan
nan
22.106415512487956
7
nan
nan
16.81077481978646
8
nan
nan
2.7399368043400316
9
nan
nan
1.1692782958009857
10
nan
nan
17.53319991821972
11
nan
nan
31.804247398153898
12
3.49728105148881
nan
nan
13
nan
nan
32.812139089677736
14
nan
nan
226.679564342583
15
nan
nan
16.499551426490687
16
nan
nan
25.316263040585888
17
nan
nan
73.31563696656676
18
9.628979280624115
nan
94.11905111038314


  # Remove the CWD from sys.path while we load stuff.


19
13.011600851697075
nan
62.196063900808
20
nan
nan
103.63228539507901
21
nan
nan
21.3221592125992
22
11.6518318342818
11.204335355922801
3.55494463928772


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


23
5.006923737648547
14.730855685115
4.447590178870636
24
6.413953276050124
7.051378693286225
20.951784161398933
25
nan
nan
3.42957680976483
26
nan
nan
17.1983828417855
27
7.528987090492753
12.051424608806201
13.508431399280434
28
nan
nan
28.1338520241861
29
34.99201324346707
6.32822914154515
13.665402779979374
30
nan
nan
3.7802207875609453
31
3.4640428525157336
nan
3.6254539125127874
32
nan
nan
2.662088259437707
33
5.864689564960794
nan
5.397815722425651
34
4.44238560393763
nan
5.373342142435122
35
3.516500573272045
nan
2.7748149850334034
36
nan
nan
3.26762948447349
37
nan
nan
4.6609724266695105
38
4.708194826359016
nan
nan
39
nan
nan
2.58541899611579
40
nan
nan
2.97434147659124
41
3.1967123327435303
nan
5.836925122189625
42
3.38571766927041
nan
3.6988781412103777
43
nan
nan
4.058498805392915
44
nan
nan
3.5333349491197112
45
nan
nan
4.11412139301008
46
nan
nan
4.79375519805541
47
nan
nan
50.50071499202934
48
2.8516712049353004
nan
3.379099216432235
49
4.263926025398532
nan
4.518394556

236
nan
15.26684141925165
39.883496174672075
237
12.308453035568574
12.24133555847737
93.00609547920642
238
nan
23.540383227365627
21.283057892069987
239
57.53514251336395
59.45715745612736
nan
240
nan
nan
28.058689737554047
241
nan
nan
7.0068486899228795
242
15.056076707156892
19.54415553808879
32.158950736133846
243
13.81538392419186
13.728442051530969
27.1120398879058
244
nan
nan
18.806997323209423
245
nan
nan
15.77710743035677
246
nan
nan
12.949236483055937
247
nan
nan
11.064696517288425
248
nan
nan
25.505569459202444
249
nan
nan
27.24808402216466
250
nan
nan
7.139281501421437
251
nan
nan
17.13275409178684
252
nan
nan
12.122435739825209
253
nan
nan
13.726270502169635
254
nan
nan
22.48784127580087
255
nan
nan
7.03650160380873
256
nan
nan
21.9223691472388
257
nan
nan
17.31006828100422
258
nan
nan
24.16862294937048
259
nan
nan
7.366487601243954
260
nan
nan
5.151189727284041
261
nan
nan
13.047680057276896
262
nan
nan
15.500389056295408
263
nan
nan
9.80187294281843
264
nan
nan
7.7010373

451
14.962669206074514
18.970868600984286
33.93575073136452
452
nan
28.131141490637496
28.105946191434388
453
14.464608288928893
nan
5.292653366427417
454
nan
nan
6.964853297213031
455
4.979165811477319
7.214702917737322
8.760001776885398
456
15.092109502421533
15.775617787833516
34.94164154352653
457
nan
nan
3.635731156175731
458
nan
nan
26.316588756646443
459
nan
nan
30.28040658078648
460
nan
nan
92.73873185507826
461
nan
nan
95.75808619548641
462
nan
nan
98.81554770850485
463
nan
nan
99.50911262424044
464
nan
nan
132.62919272943378
465
nan
nan
100.9222781091195
466
nan
nan
93.94570580181276
467
nan
nan
89.16341805244583
468
nan
nan
87.91923888878918
469
nan
nan
102.09650787301827
470
nan
nan
82.58122154780557
471
nan
nan
105.49271866354304
472
nan
nan
93.8307198235339
473
nan
nan
96.9484775908073
474
nan
nan
41.761403019845
475
nan
nan
117.10283171303644
476
nan
nan
90.63049326249217
477
nan
nan
106.01515115187999
478
nan
nan
92.9838735582586
479
nan
nan
107.35294429119703
480
nan
n

In [19]:
grouped_user_data['R-squared_2_1'] = grouped_user_data['R-squared_2_1'].round(6)
grouped_user_data['R-squared_2_2'] = grouped_user_data['R-squared_2_2'].round(6)
grouped_user_data['R-squared_3_1'] = grouped_user_data['R-squared_3_1'].round(6)

In [20]:
grouped_user_data[grouped_user_data['count_3_1'] != 1].sort_values('counts', ascending=False)

Unnamed: 0,user,counts,count_2_1,R-squared_2_1,count_2_2,R-squared_2_2,count_3_1,R-squared_3_1
129,asaetti0@node129.eurora.cineca.it,9216,0,0.0,337,0.0,8879,0.0
220,fpetracc@node129.eurora.cineca.it,2684,1595,0.0,683,0.0,406,0.0
218,fnicolus@node129.eurora.cineca.it,1763,10,0.0,2,0.0,1751,5e-06
217,fnegreir@node129.eurora.cineca.it,1735,413,0.934622,171,0.639407,1148,0.874153
319,mdistefa@node129.eurora.cineca.it,1689,506,0.994646,129,0.585681,1054,0.993419
488,tbusatta@node129.eurora.cineca.it,1674,646,0.0,204,0.0,824,0.0
320,mdotti00@node129.eurora.cineca.it,1575,979,0.731976,327,0.099297,269,0.005865
348,mpalummo@node129.eurora.cineca.it,1349,483,0.830202,273,0.706958,479,0.473446
191,emarcucc@node129.eurora.cineca.it,1286,378,0.0,28,0.0,880,0.0
234,gbertain@node129.eurora.cineca.it,1262,580,0.848754,131,0.192991,520,0.547021


### Andrea's interval

In [21]:
andrea_interval_large_jobs =interval_large_jobs[pd.to_datetime(interval_large_jobs['run_start_time']) >= andrea_start_time]
andrea_interval_large_jobs =andrea_interval_large_jobs[pd.to_datetime(andrea_interval_large_jobs['run_start_time']) <= andrea_end_time]

In [22]:
grouped_user_data = andrea_interval_large_jobs.groupby('user')['job_id_string'].nunique().reset_index(name='counts')
print("{} distinct users".format(grouped_user_data.shape[0]))
grouped_user_data['count_2_1'] = 0
grouped_user_data['R-squared_2_1'] = 0.0
grouped_user_data['count_2_2'] = 0
grouped_user_data['R-squared_2_2'] = 0.0
grouped_user_data['count_3_1'] = 0
grouped_user_data['R-squared_3_1'] = 0.0

222 distinct users


In [23]:
grouped_user_data['counts'].sum()

20356

In [24]:
i = 0
for node_index, node_row in grouped_user_data.iterrows():
    user_jobs = interval_large_jobs[interval_large_jobs['user'] == node_row['user']]
    user_jobs_2_1 = user_jobs[user_jobs['node_req'] == user_jobs['n_2_1']]
    user_jobs_2_2 = user_jobs[user_jobs['node_req'] == user_jobs['n_2_2']]
    user_jobs_3_1 = user_jobs[user_jobs['node_req'] == user_jobs['n_3_1']]
    
    mean_core_2_1 = user_jobs_2_1['real_pow'].sum() / user_jobs_2_1['cpu_req'].sum()
    mean_core_2_2 = user_jobs_2_2['real_pow'].sum() / user_jobs_2_2['cpu_req'].sum()
    mean_core_3_1 = user_jobs_3_1['real_pow'].sum() / user_jobs_3_1['cpu_req'].sum()
    
    print(mean_core_2_1)
    print(mean_core_2_2)
    print(mean_core_3_1)
    
    user_jobs_2_1['predicted'] = mean_core_2_1 * user_jobs_2_1['cpu_req']
    user_jobs_2_2['predicted'] = mean_core_2_2 * user_jobs_2_2['cpu_req']
    user_jobs_3_1['predicted'] = mean_core_3_1 * user_jobs_3_1['cpu_req']
    
    if(user_jobs_2_1.shape[0] != 0):
        slope, intercept, r_value, p_value, std_err = stats.linregress(user_jobs_2_1['real_pow'], user_jobs_2_1['predicted'])
        grouped_user_data.loc[node_index, 'R-squared_2_1'] = r_value**2
        grouped_user_data.loc[node_index, 'count_2_1'] = user_jobs_2_1.shape[0]
    if(user_jobs_2_2.shape[0] != 0):
        slope, intercept, r_value, p_value, std_err = stats.linregress(user_jobs_2_2['real_pow'], user_jobs_2_2['predicted'])
        grouped_user_data.loc[node_index, 'R-squared_2_2'] = r_value**2
        grouped_user_data.loc[node_index, 'count_2_2'] = user_jobs_2_2.shape[0]
    if(user_jobs_3_1.shape[0] != 0):
        slope, intercept, r_value, p_value, std_err = stats.linregress(user_jobs_3_1['real_pow'], user_jobs_3_1['predicted'])
        grouped_user_data.loc[node_index, 'R-squared_3_1'] = r_value**2
        grouped_user_data.loc[node_index, 'count_3_1'] = user_jobs_3_1.shape[0]
        
    #if(user_jobs_2_1.shape[0] != 0):
    #    grouped_user_data.loc[node_index, 'accuracy_2_1'] = mean_squared_error(user_jobs_2_1['real_pow'], user_jobs_2_1['predicted'])
    #    grouped_user_data.loc[node_index, 'count_2_1'] = user_jobs_2_1.shape[0]
    #if(user_jobs_2_2.shape[0] != 0):
    #    grouped_user_data.loc[node_index, 'accuracy_2_2'] = mean_squared_error(user_jobs_2_2['real_pow'], user_jobs_2_2['predicted'])
    #    grouped_user_data.loc[node_index, 'count_2_2'] = user_jobs_2_2.shape[0]
    #if(user_jobs_3_1.shape[0] != 0):
    #    grouped_user_data.loc[node_index, 'accuracy_3_1'] = mean_squared_error(user_jobs_3_1['real_pow'], user_jobs_3_1['predicted'])
    #    grouped_user_data.loc[node_index, 'count_3_1'] = user_jobs_3_1.shape[0]
    
    i += 1
    print(i)

  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


12.1391586392346
nan
21.35730472198571
1
nan
nan
18.762206307725965
2
nan
nan
22.672278992955565
3
nan
nan
19.295763794626996
4
nan
nan
19.714580592222948
5
nan
nan
22.106415512487956
6
nan
nan
16.81077481978646
7
nan
nan
17.53319991821972
8
11.6518318342818
11.204335355922801
3.55494463928772


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


9
5.006923737648547
14.730855685115
4.447590178870636
10
6.413953276050124
7.051378693286225
20.951784161398933
11
nan
nan
3.42957680976483
12
nan
nan
17.1983828417855
13
7.528987090492753
12.051424608806201
13.508431399280434
14
nan
nan
28.1338520241861
15
34.99201324346707
6.32822914154515
13.665402779979374
16
nan
28.39008734695413
33.528822958881385
17
14.31855553262878
14.961477194091904
36.473324550941136
18
15.262193839044393
nan
25.00217860178559
19
nan
nan
28.862114814824967
20
5.380433408457281
5.011106244062281
9.583990962388881
21
nan
28.104569162150284
27.962262185106113
22
nan
nan
7.013150958500999
23
nan
nan
3.562976038689833
24
nan
nan
28.327738780900912
25
14.1912060247574
nan
37.5425977388221
26
12.200634654815813
15.565029078557414
26.392016793725748
27
nan
nan
11.071226545395204
28
nan
28.250836028315124
29.359807190598886
29
15.024782571158624
18.61921110335305
33.29858713248382
30
nan
nan
nan
31
13.331451662945208
nan
29.748612214825002
32
nan
nan
36.0244947258980

  # Remove the CWD from sys.path while we load stuff.


nan
58.73447192188741
42.80016205661354
35
15.254464832005013
22.13000090547486
32.571991144779616
36
nan
nan
39.74970236930366
37
13.504747916058912
nan
27.37183164095953
38
nan
nan
10.614723826290449
39
nan
nan
55.31441026172266
40
nan
nan
27.93335768225084
41
7.67847298980835
7.781678197663406
21.7253457892458
42
11.5541863843591
12.335922302136765
25.988901450390635
43
nan
nan
10.189002742851438
44
12.865106218913635
nan
12.896407923252635
45
nan
nan
26.24817650603805
46
13.069697101501312
13.510246508208564
33.46543386460839
47
12.691259800079004
14.406470815253826
23.402995287658133
48
nan
nan
nan
49
4.74695402918904
4.411807259244391
7.684173531939172
50
10.602628659179295
15.296603046867386
45.09527184773964
51
13.95624704217677
14.653873227535541
28.75070340095564
52
34.56261999747681
32.020093279089835
60.86591073037505
53
17.93943550024586
35.352717144638284
36.85846885585652
54
nan
27.53984617527831
27.719500424292693
55
nan
nan
42.80786079192688
56
15.71739064202959
16.102

In [26]:
grouped_user_data['R-squared_2_1'] = grouped_user_data['R-squared_2_1'].round(6)
grouped_user_data['R-squared_2_2'] = grouped_user_data['R-squared_2_2'].round(6)
grouped_user_data['R-squared_3_1'] = grouped_user_data['R-squared_3_1'].round(6)

In [27]:
grouped_user_data[grouped_user_data['count_3_1'] != 1].sort_values('counts', ascending=False)

Unnamed: 0,user,counts,count_2_1,R-squared_2_1,count_2_2,R-squared_2_2,count_3_1,R-squared_3_1
34,asaetti0@node129.eurora.cineca.it,5064,0,0.0,337,0.0,8879,0.0
215,tbusatta@node129.eurora.cineca.it,1674,646,0.0,204,0.0,824,0.0
72,fpetracc@node129.eurora.cineca.it,1532,1595,0.0,683,0.0,406,0.0
57,emarcucc@node129.eurora.cineca.it,1286,378,0.0,28,0.0,880,0.0
52,dmelazzi@node129.eurora.cineca.it,636,291,0.0,75,0.0,270,0.0
95,jfranz00@node129.eurora.cineca.it,563,179,0.923375,82,0.774245,303,0.555453
70,fnegreir@node129.eurora.cineca.it,516,413,0.934622,171,0.639407,1148,0.874153
120,mmapelli@node129.eurora.cineca.it,483,48,0.307395,17,0.205654,804,0.521604
142,nspalla1@node129.eurora.cineca.it,461,222,0.0,63,0.0,188,0.956664
115,mdistefa@node129.eurora.cineca.it,442,506,0.994646,129,0.585681,1054,0.993419
