In [45]:
#!/usr/bin/env python3

import pandas as pd
import errno    
import os

# Raise pandas' display limits so the tables shown in this notebook are not
# cut off: up to 1000 rows, 500 columns and 1000 characters of width.
for option_name, option_value in (
    ('display.max_rows', 1000),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

'''
Author: Enrico Ceccolini
    Show some dataset problems
'''

# Directory holding the CSV inputs, and the two files read by this notebook.
datadir = "/datasets/eurora_data/db/"
infile_jobs_to_nodes = os.path.join(datadir, "job_nodes.csv")
infile_jobs = os.path.join(datadir, "jobs.csv")

In [46]:
# Load the two full tables used by the rest of the notebook: the job-to-node
# assignments and the per-job records, printing the row count of each.
#
# low_memory=False makes pandas infer every column's dtype from the whole
# file instead of chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the saved output of this cell ends with the tail of a warning
# that looks like pandas' DtypeWarning -- confirm, and consider replacing
# low_memory=False with an explicit dtype= mapping once the affected columns
# are identified.
jobs_to_nodes_whole_data = pd.read_csv(infile_jobs_to_nodes, low_memory=False)
print("jobs_to_nodes_whole_data contains {} records".format(jobs_to_nodes_whole_data.shape[0]))

jobs_whole_data = pd.read_csv(infile_jobs, low_memory=False)
print("jobs_whole_data contains {} records".format(jobs_whole_data.shape[0]))

jobs_to_nodes_whole_data contains 469095 records
jobs_whole_data contains 405771 records


  interactivity=interactivity, compiler=compiler, result=result)


In [47]:
# Peek at the first two rows of the job-to-node table.
jobs_to_nodes_whole_data.iloc[:2]

Unnamed: 0,job2node_id,node_id,job_id_string,ncpus,ngpus,nmics,mem_requested
0,1,17,498458.node129,16,0,0,14680064
1,2,9,498459.node129,16,0,0,14680064


### What is the node_id 129?

Looking at the jobs_to_nodes table, there are 75 records for the node with ID 129.
What is node 129? Can these rows simply be dropped?

In [48]:
# Select every job-to-node row whose node_id is 129, then display the result.
jobs_node_129 = jobs_to_nodes_whole_data.loc[lambda rows: rows['node_id'] == 129]
jobs_node_129

Unnamed: 0,job2node_id,node_id,job_id_string,ncpus,ngpus,nmics,mem_requested
205653,205654,129,935758.node129,1,0,0,2097152
205853,205854,129,935931.node129,1,0,0,2097152
205855,205856,129,935933.node129,1,0,0,2097152
206363,206364,129,936221.node129,1,0,0,2097152
207615,207616,129,936616.node129,1,0,0,2097152
207874,207875,129,936828.node129,1,0,0,2097152
208459,208460,129,937418.node129,1,0,0,2097152
208707,208708,129,937609.node129,1,0,0,2097152
213289,213290,129,942184.node129,1,0,0,2097152
221919,221920,129,955125.node129,1,0,0,2097152


In [49]:
# Report how many rows were selected for node 129.
print("jobs_node_129 contains {} records".format(len(jobs_node_129)))

jobs_node_129 contains 75 records


#### Different entries for the same job? Look at `mem_requested`

In [50]:
# Count how many of the node-129 rows share each mem_requested value.
(
    jobs_node_129
    .groupby(['mem_requested'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,mem_requested,counts
0,524288,24
1,2097152,51


### Why are there multiple entries for the same job ID on the same node?

Looking at the jobs_to_nodes table, some (job_id_string, node_id) pairs appear more than once; the cell below lists those found on node 1.

In [51]:
# Count the rows per (job_id_string, node_id) pair, order the pairs by that
# count, and keep only the pairs on node 1 that occur more than once.
df = (
    jobs_to_nodes_whole_data
    .groupby(['job_id_string', 'node_id'])
    .size()
    .reset_index(name='counts')
    .sort_values('counts')
    .loc[lambda pairs: (pairs['counts'] > 1) & (pairs['node_id'] == 1)]
)
df

Unnamed: 0,job_id_string,node_id,counts
28614,1200006.node129,1,2
117042,1317035.node129,1,2
117045,1317037.node129,1,2
60188,1234429.node129,1,2
60175,1234423.node129,1,2
334647,804561.node129,1,2
286054,586808.node129,1,2
286062,586816.node129,1,2
35594,1209879.node129,1,2
35578,1209857.node129,1,2


#### For example, job 1008449.node129 has multiple entries on the same node

In [52]:
# All job-to-node rows recorded for job 1008449.node129.
jobs_to_nodes_whole_data.loc[jobs_to_nodes_whole_data['job_id_string'].eq('1008449.node129')]

Unnamed: 0,job2node_id,node_id,job_id_string,ncpus,ngpus,nmics,mem_requested
227636,227637,33,1008449.node129,1,0,0,2097152
227637,227638,33,1008449.node129,1,0,0,2097152
227638,227639,33,1008449.node129,1,0,0,2097152
227639,227640,33,1008449.node129,1,0,0,2097152
227640,227641,33,1008449.node129,1,0,0,2097152
227641,227642,33,1008449.node129,1,0,0,2097152
227642,227643,33,1008449.node129,1,0,0,2097152
227643,227644,34,1008449.node129,1,0,0,2097152
227644,227645,34,1008449.node129,1,0,0,2097152
227645,227646,34,1008449.node129,1,0,0,2097152


In [53]:
# The matching record(s) for job 1008449.node129 in the jobs table.
jobs_whole_data.loc[jobs_whole_data['job_id_string'].eq('1008449.node129')]

Unnamed: 0,job_id,job_id_string,job_name,queue,start_time,run_start_time,end_time,user,node_req,cpu_req,mem_req,time_req,deleted,dependency,exit_status,pbs_exit_code
205160,207183,1008449.node129,xuGa2O3_PH,debug,2014-08-16 16:16:57,2014-08-16 16:16:58,2014-08-16 16:17:02,gcocco00@node129.eurora.cineca.it,0,0,0,,,,COMPLETED,


In [54]:
# Dimensions of the jobs table as a (rows, columns) tuple.
jobs_whole_data.shape

(405771, 16)

In [55]:
# Jobs whose request shows zero nodes and zero CPUs.
# The frame is built in this cell so that the name exists when the notebook
# is run top to bottom; otherwise it is only assigned in a later cell and
# this display raises NameError on a fresh kernel.
node_req_0 = jobs_whole_data[
    (jobs_whole_data['node_req'] == 0) & (jobs_whole_data['cpu_req'] == 0)
]
node_req_0

Unnamed: 0,job_id,job_id_string,job_name,queue,start_time,run_start_time,end_time,user,node_req,cpu_req,mem_req,time_req,deleted,dependency,exit_status,pbs_exit_code
2227,2228,502660.node129,abq_test,debug,2014-04-01 17:23:21,2014-04-01 21:58:20,2014-04-01 21:58:30,amessina@node129.eurora.cineca.it,0,0,0,00:10,,,COMPLETED,
2228,2229,502661.node129,abq_test,debug,2014-04-01 17:26:04,2014-04-01 21:58:20,2014-04-01 21:58:41,amessina@node129.eurora.cineca.it,0,0,0,00:10,,,COMPLETED,
2229,2230,502667.node129,abq_test,debug,2014-04-01 17:29:49,2014-04-01 21:58:20,2014-04-01 21:58:30,amessina@node129.eurora.cineca.it,0,0,0,00:10,,,COMPLETED,
2230,2231,502668.node129,abq_test,debug,2014-04-01 17:31:32,2014-04-01 21:58:20,2014-04-01 21:58:28,amessina@node129.eurora.cineca.it,0,0,0,00:10,,,COMPLETED,
2231,2232,502669.node129,abq_test,debug,2014-04-01 17:43:30,2014-04-01 21:58:20,2014-04-01 21:59:12,amessina@node129.eurora.cineca.it,0,0,0,00:10,,,COMPLETED,
2232,2233,502670.node129,abq_test,debug,2014-04-01 17:45:33,2014-04-01 21:58:21,2014-04-01 21:58:28,amessina@node129.eurora.cineca.it,0,0,0,00:10,,,COMPLETED,
2234,2235,502672.node129,abq_test,debug,2014-04-01 17:49:05,2014-04-01 21:58:22,2014-04-01 21:58:41,amessina@node129.eurora.cineca.it,0,0,0,00:10,,,COMPLETED,
2235,2236,502673.node129,abq_test,debug,2014-04-01 17:54:17,2014-04-01 21:58:22,2014-04-01 21:58:31,amessina@node129.eurora.cineca.it,0,0,0,00:10,,,COMPLETED,
2236,2237,502674.node129,abq_test,debug,2014-04-01 17:54:49,2014-04-01 21:58:23,2014-04-01 21:58:28,amessina@node129.eurora.cineca.it,0,0,0,00:10,,,COMPLETED,
2237,2238,502675.node129,abq_test,debug,2014-04-01 17:55:34,2014-04-01 21:58:24,2014-04-01 21:58:30,amessina@node129.eurora.cineca.it,0,0,0,00:10,,,COMPLETED,


### Incomplete jobs

In [60]:
# A job counts as incomplete here when its end_time is the all-zero
# timestamp '0000-00-00 00:00:00'. (An alternative filter on
# exit_status != 'COMPLETED' was left disabled in this cell.)
never_ended = jobs_whole_data['end_time'].eq('0000-00-00 00:00:00')
incompleted_jobs_data = jobs_whole_data.loc[never_ended]
print("incompleted_jobs_data contains {} records".format(len(incompleted_jobs_data)))

incompleted_jobs_data contains 900 records


In [61]:
# Show the incomplete jobs ordered by end_time.
# NOTE(review): these rows were selected for sharing one end_time value, so
# this key does not distinguish them -- confirm the intended sort column.
incompleted_jobs_data.sort_values(by='end_time')

Unnamed: 0,job_id,job_id_string,job_name,queue,start_time,run_start_time,end_time,user,node_req,cpu_req,mem_req,time_req,deleted,dependency,exit_status,pbs_exit_code
6576,6578,508345.node129,pol5CBn500_LJtu,parallel,2014-04-03 12:42:51,2014-04-06 15:18:01,0000-00-00 00:00:00,lmucciol@node129.eurora.cineca.it,1,8,4,04:00,,,RUN,
359121,361144,1560013.node129,script3.sh,parallel,2015-01-12 12:04:28,2015-01-13 08:16:44,0000-00-00 00:00:00,ccaddeo0@node129.eurora.cineca.it,1,16,14,04:00,,,RUN,
359124,361147,1560017.node129,script3.sh,parallel,2015-01-12 12:04:28,2015-01-13 08:53:20,0000-00-00 00:00:00,ccaddeo0@node129.eurora.cineca.it,1,16,14,04:00,,,RUN,
359127,361150,1560021.node129,script3.sh,parallel,2015-01-12 12:04:29,2015-01-13 09:12:21,0000-00-00 00:00:00,ccaddeo0@node129.eurora.cineca.it,1,16,14,04:00,,,RUN,
359130,361153,1560026.node129,he_h2o_031,parallel,2015-01-12 12:10:10,2015-01-13 09:23:52,0000-00-00 00:00:00,gtiana00@node129.eurora.cineca.it,5,80,70,04:00,,,RUN,
359167,361190,1560086.node129,e1nuba_119.csh,parallel,2015-01-12 15:43:01,2015-01-13 15:07:09,0000-00-00 00:00:00,cmorgill@node129.eurora.cineca.it,1,16,1,04:00,,,RUN,
359448,361471,1560370.node129,run_test_restart.sh,parallel,2015-01-12 17:07:18,2015-01-13 13:38:33,0000-00-00 00:00:00,gcostan1@node129.eurora.cineca.it,1,16,4,04:00,,,RUN,
359461,361484,1560400.node129,test.1njq,parallel,2015-01-12 17:35:50,2015-01-13 17:21:39,0000-00-00 00:00:00,ldegioia@node129.eurora.cineca.it,1,8,2,04:00,,,RUN,
359468,361491,1560410.node129,test.1njq,parallel,2015-01-12 17:36:13,2015-01-13 17:29:49,0000-00-00 00:00:00,ldegioia@node129.eurora.cineca.it,1,8,2,04:00,,,RUN,
359475,361498,1560420.node129,test.1njq,parallel,2015-01-12 17:36:50,2015-01-13 17:29:50,0000-00-00 00:00:00,ldegioia@node129.eurora.cineca.it,1,8,2,04:00,,,RUN,


## Jobs with 0 resources requested

In [44]:
# Keep the jobs whose request shows zero nodes and zero CPUs, then report
# how many there are out of the whole jobs table.
requested_nothing = (jobs_whole_data['node_req'] == 0) & (jobs_whole_data['cpu_req'] == 0)
node_req_0 = jobs_whole_data[requested_nothing]

print("{} / {} jobs without resources request".format(len(node_req_0), len(jobs_whole_data)))

326060 / 405771 jobs without resources request


In [42]:
# Display the jobs selected into node_req_0 (node_req == 0 and cpu_req == 0).
node_req_0

Unnamed: 0,job_id,job_id_string,job_name,queue,start_time,run_start_time,end_time,user,node_req,cpu_req,mem_req,time_req,deleted,dependency,exit_status,pbs_exit_code
2227,2228,502660.node129,abq_test,debug,2014-04-01 17:23:21,2014-04-01 21:58:20,2014-04-01 21:58:30,amessina@node129.eurora.cineca.it,0,0,0,00:10,,,COMPLETED,
2228,2229,502661.node129,abq_test,debug,2014-04-01 17:26:04,2014-04-01 21:58:20,2014-04-01 21:58:41,amessina@node129.eurora.cineca.it,0,0,0,00:10,,,COMPLETED,
2229,2230,502667.node129,abq_test,debug,2014-04-01 17:29:49,2014-04-01 21:58:20,2014-04-01 21:58:30,amessina@node129.eurora.cineca.it,0,0,0,00:10,,,COMPLETED,
2230,2231,502668.node129,abq_test,debug,2014-04-01 17:31:32,2014-04-01 21:58:20,2014-04-01 21:58:28,amessina@node129.eurora.cineca.it,0,0,0,00:10,,,COMPLETED,
2231,2232,502669.node129,abq_test,debug,2014-04-01 17:43:30,2014-04-01 21:58:20,2014-04-01 21:59:12,amessina@node129.eurora.cineca.it,0,0,0,00:10,,,COMPLETED,
2232,2233,502670.node129,abq_test,debug,2014-04-01 17:45:33,2014-04-01 21:58:21,2014-04-01 21:58:28,amessina@node129.eurora.cineca.it,0,0,0,00:10,,,COMPLETED,
2234,2235,502672.node129,abq_test,debug,2014-04-01 17:49:05,2014-04-01 21:58:22,2014-04-01 21:58:41,amessina@node129.eurora.cineca.it,0,0,0,00:10,,,COMPLETED,
2235,2236,502673.node129,abq_test,debug,2014-04-01 17:54:17,2014-04-01 21:58:22,2014-04-01 21:58:31,amessina@node129.eurora.cineca.it,0,0,0,00:10,,,COMPLETED,
2236,2237,502674.node129,abq_test,debug,2014-04-01 17:54:49,2014-04-01 21:58:23,2014-04-01 21:58:28,amessina@node129.eurora.cineca.it,0,0,0,00:10,,,COMPLETED,
2237,2238,502675.node129,abq_test,debug,2014-04-01 17:55:34,2014-04-01 21:58:24,2014-04-01 21:58:30,amessina@node129.eurora.cineca.it,0,0,0,00:10,,,COMPLETED,
