In [19]:
#!/usr/bin/env python3

import pandas as pd
import numpy as np
import errno    
import os

# Widen the pandas display limits so the tables in this notebook are not truncated.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 500
pd.options.display.width = 1000

'''
Author: Enrico Ceccolini
    Show some dataset problems
'''

# Location of the input CSV files read by the cells below.
datadir = "/datasets/eurora_data/db/"
infile_jobs_to_nodes = os.path.join(datadir, "job_nodes.csv")
infile_jobs = os.path.join(datadir, "jobs.csv")

In [2]:
# Load the two input tables: the job-to-node allocation records and the jobs.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding mixed types.
# NOTE(review): the truncated warning text in this cell's recorded output looks
# like pandas' mixed-dtype warning - confirm it disappears after a re-run.
jobs_to_nodes_whole_data = pd.read_csv(infile_jobs_to_nodes, low_memory=False)
print("jobs_to_nodes_whole_data contains {} records".format(jobs_to_nodes_whole_data.shape[0]))

jobs_whole_data = pd.read_csv(infile_jobs, low_memory=False)
print("jobs_whole_data contains {} records".format(jobs_whole_data.shape[0]))

jobs_to_nodes_whole_data contains 469095 records
jobs_whole_data contains 405771 records


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Preview the first two rows of the job-to-node table.
jobs_to_nodes_whole_data.iloc[:2]

Unnamed: 0,job2node_id,node_id,job_id_string,ncpus,ngpus,nmics,mem_requested
0,1,17,498458.node129,16,0,0,14680064
1,2,9,498459.node129,16,0,0,14680064


### What is the node_id 129?

Looking at the jobs_to_nodes table, 75 jobs ran on the node with ID 129.
What is node 129? Can I simply drop these rows?

In [4]:
# Keep only the job-to-node rows whose node_id is 129, then display them.
jobs_node_129 = jobs_to_nodes_whole_data.loc[jobs_to_nodes_whole_data['node_id'].eq(129)]
jobs_node_129

Unnamed: 0,job2node_id,node_id,job_id_string,ncpus,ngpus,nmics,mem_requested
205653,205654,129,935758.node129,1,0,0,2097152
205853,205854,129,935931.node129,1,0,0,2097152
205855,205856,129,935933.node129,1,0,0,2097152
206363,206364,129,936221.node129,1,0,0,2097152
207615,207616,129,936616.node129,1,0,0,2097152
207874,207875,129,936828.node129,1,0,0,2097152
208459,208460,129,937418.node129,1,0,0,2097152
208707,208708,129,937609.node129,1,0,0,2097152
213289,213290,129,942184.node129,1,0,0,2097152
221919,221920,129,955125.node129,1,0,0,2097152


In [6]:
# Report how many job-to-node rows were found for node_id 129.
print("The node with node_id = 129 runned {} jobs".format(len(jobs_node_129)))

The node with node_id = 129 runned 75 jobs


#### Different entries for the same job? Look at `mem_requested`

In [7]:
# Number of node-129 rows for each distinct mem_requested value.
(
    jobs_node_129
    .groupby(['mem_requested'])
    .size()
    .rename('counts')
    .reset_index()
)

Unnamed: 0,mem_requested,counts
0,524288,24
1,2097152,51


### Why are there jobs (with the same ID) that ran more than once on the same node?

Looking at the jobs_to_nodes table, we have (job_id_string, node_id) pairs that appear more than once.

In [8]:
# Count the rows for every (job_id_string, node_id) pair and order them by count.
df = (
    jobs_to_nodes_whole_data
    .groupby(['job_id_string', 'node_id'])
    .size()
    .reset_index(name='counts')
    .sort_values('counts')
)
# Keep the pairs listed more than once, restricted to node_id 1 for display.
df = df[(df['counts'] > 1) & (df['node_id'] == 1)]
df

Unnamed: 0,job_id_string,node_id,counts
28614,1200006.node129,1,2
117042,1317035.node129,1,2
117045,1317037.node129,1,2
60188,1234429.node129,1,2
60175,1234423.node129,1,2
334647,804561.node129,1,2
286054,586808.node129,1,2
286062,586816.node129,1,2
35594,1209879.node129,1,2
35578,1209857.node129,1,2


#### For example, job 1008449.node129 has multiple entries on the same node

In [9]:
# All job-to-node rows recorded for job 1008449.node129.
jobs_to_nodes_whole_data.loc[jobs_to_nodes_whole_data['job_id_string'].eq('1008449.node129')]

Unnamed: 0,job2node_id,node_id,job_id_string,ncpus,ngpus,nmics,mem_requested
227636,227637,33,1008449.node129,1,0,0,2097152
227637,227638,33,1008449.node129,1,0,0,2097152
227638,227639,33,1008449.node129,1,0,0,2097152
227639,227640,33,1008449.node129,1,0,0,2097152
227640,227641,33,1008449.node129,1,0,0,2097152
227641,227642,33,1008449.node129,1,0,0,2097152
227642,227643,33,1008449.node129,1,0,0,2097152
227643,227644,34,1008449.node129,1,0,0,2097152
227644,227645,34,1008449.node129,1,0,0,2097152
227645,227646,34,1008449.node129,1,0,0,2097152


In [10]:
# The matching entry for job 1008449.node129 in the jobs table.
jobs_whole_data.loc[jobs_whole_data['job_id_string'].eq('1008449.node129')]

Unnamed: 0,job_id,job_id_string,job_name,queue,start_time,run_start_time,end_time,user,node_req,cpu_req,mem_req,time_req,deleted,dependency,exit_status,pbs_exit_code
205160,207183,1008449.node129,xuGa2O3_PH,debug,2014-08-16 16:16:57,2014-08-16 16:16:58,2014-08-16 16:17:02,gcocco00@node129.eurora.cineca.it,0,0,0,,,,COMPLETED,


In [11]:
# (rows, columns) of the jobs table.
jobs_whole_data.shape

(405771, 16)

### Incomplete jobs

In [12]:
# A job is treated as incomplete when its end_time holds the
# '0000-00-00 00:00:00' placeholder. (Filtering on exit_status != 'COMPLETED'
# was an earlier alternative that is no longer used.)
incompleted_jobs_data = jobs_whole_data.loc[jobs_whole_data['end_time'] == '0000-00-00 00:00:00']
print("incompleted_jobs_data contains {} records".format(len(incompleted_jobs_data)))

incompleted_jobs_data contains 900 records


In [13]:
# Every row in incompleted_jobs_data carries the same placeholder end_time
# (that is how the subset is selected), so sorting by end_time cannot order
# anything. Sort by start_time instead so the rows are listed chronologically;
# the 'YYYY-MM-DD HH:MM:SS' text format sorts correctly as plain strings.
incompleted_jobs_data.sort_values('start_time')

Unnamed: 0,job_id,job_id_string,job_name,queue,start_time,run_start_time,end_time,user,node_req,cpu_req,mem_req,time_req,deleted,dependency,exit_status,pbs_exit_code
6576,6578,508345.node129,pol5CBn500_LJtu,parallel,2014-04-03 12:42:51,2014-04-06 15:18:01,0000-00-00 00:00:00,lmucciol@node129.eurora.cineca.it,1,8,4,04:00,,,RUN,
359121,361144,1560013.node129,script3.sh,parallel,2015-01-12 12:04:28,2015-01-13 08:16:44,0000-00-00 00:00:00,ccaddeo0@node129.eurora.cineca.it,1,16,14,04:00,,,RUN,
359124,361147,1560017.node129,script3.sh,parallel,2015-01-12 12:04:28,2015-01-13 08:53:20,0000-00-00 00:00:00,ccaddeo0@node129.eurora.cineca.it,1,16,14,04:00,,,RUN,
359127,361150,1560021.node129,script3.sh,parallel,2015-01-12 12:04:29,2015-01-13 09:12:21,0000-00-00 00:00:00,ccaddeo0@node129.eurora.cineca.it,1,16,14,04:00,,,RUN,
359130,361153,1560026.node129,he_h2o_031,parallel,2015-01-12 12:10:10,2015-01-13 09:23:52,0000-00-00 00:00:00,gtiana00@node129.eurora.cineca.it,5,80,70,04:00,,,RUN,
359167,361190,1560086.node129,e1nuba_119.csh,parallel,2015-01-12 15:43:01,2015-01-13 15:07:09,0000-00-00 00:00:00,cmorgill@node129.eurora.cineca.it,1,16,1,04:00,,,RUN,
359448,361471,1560370.node129,run_test_restart.sh,parallel,2015-01-12 17:07:18,2015-01-13 13:38:33,0000-00-00 00:00:00,gcostan1@node129.eurora.cineca.it,1,16,4,04:00,,,RUN,
359461,361484,1560400.node129,test.1njq,parallel,2015-01-12 17:35:50,2015-01-13 17:21:39,0000-00-00 00:00:00,ldegioia@node129.eurora.cineca.it,1,8,2,04:00,,,RUN,
359468,361491,1560410.node129,test.1njq,parallel,2015-01-12 17:36:13,2015-01-13 17:29:49,0000-00-00 00:00:00,ldegioia@node129.eurora.cineca.it,1,8,2,04:00,,,RUN,
359475,361498,1560420.node129,test.1njq,parallel,2015-01-12 17:36:50,2015-01-13 17:29:50,0000-00-00 00:00:00,ldegioia@node129.eurora.cineca.it,1,8,2,04:00,,,RUN,


## Jobs with 0 resources requested

In [14]:
# Jobs whose node_req and cpu_req are both 0.
node_req_0 = jobs_whole_data[
    (jobs_whole_data['node_req'] == 0) & (jobs_whole_data['cpu_req'] == 0)
]

print("{} / {} jobs without resources request".format(len(node_req_0), len(jobs_whole_data)))

326060 / 405771 jobs without resources request


In [15]:
# Show the zero-resource jobs ordered by their time_req value.
node_req_0.sort_values(by='time_req')

Unnamed: 0,job_id,job_id_string,job_name,queue,start_time,run_start_time,end_time,user,node_req,cpu_req,mem_req,time_req,deleted,dependency,exit_status,pbs_exit_code
57491,57613,588117.node129,STDIN,system,2014-04-14 11:48:41,2014-04-14 11:48:42,2014-04-14 12:49:37,gamati01@node129.eurora.cineca.it,0,0,0,--,,,COMPLETED,
57580,57702,588230.node129,STDIN,system,2014-04-14 13:24:06,2014-04-14 13:24:07,2014-04-14 14:24:54,gamati01@node129.eurora.cineca.it,0,0,0,--,,,COMPLETED,
58936,59177,589926.node129,STDIN,p_devel,2014-04-15 14:36:59,2014-04-15 14:36:59,2014-04-15 14:40:14,faffinit@node129.eurora.cineca.it,0,0,0,--,,,COMPLETED,
57887,58009,588596.node129,STDIN,system,2014-04-14 17:26:21,2014-04-14 17:26:21,2014-04-14 18:10:21,gamati01@node129.eurora.cineca.it,0,0,0,--,,,COMPLETED,
57459,57581,588013.node129,STDIN,system,2014-04-14 10:47:44,2014-04-14 10:47:44,2014-04-14 11:48:31,gamati01@node129.eurora.cineca.it,0,0,0,--,,,COMPLETED,
58855,59096,589706.node129,STDIN,system,2014-04-15 10:47:28,2014-04-15 10:47:29,2014-04-15 11:48:10,gamati01@node129.eurora.cineca.it,0,0,0,--,,,COMPLETED,
59208,59449,590236.node129,STDIN,system,2014-04-15 17:07:18,2014-04-15 17:07:19,2014-04-15 18:08:08,gamati01@node129.eurora.cineca.it,0,0,0,--,,,COMPLETED,
53056,53063,582427.node129,STDIN,p_devel,2014-04-10 17:37:51,2014-04-10 17:39:56,2014-04-10 18:13:17,ibaccare@node129.eurora.cineca.it,0,0,0,--,,,COMPLETED,
56158,56165,586629.node129,STDIN,system,2014-04-12 13:54:42,2014-04-12 13:54:43,2014-04-12 14:04:38,gamati01@node129.eurora.cineca.it,0,0,0,--,,,COMPLETED,
59286,59527,590367.node129,STDIN,system,2014-04-15 18:12:29,2014-04-15 18:12:30,2014-04-15 18:12:47,gamati01@node129.eurora.cineca.it,0,0,0,--,,,COMPLETED,


In [22]:
# Jobs whose time_req holds the "--" placeholder instead of a time.
node_req_time_NaN = jobs_whole_data.loc[jobs_whole_data['time_req'].eq("--")]
node_req_time_NaN

Unnamed: 0,job_id,job_id_string,job_name,queue,start_time,run_start_time,end_time,user,node_req,cpu_req,mem_req,time_req,deleted,dependency,exit_status,pbs_exit_code
53056,53063,582427.node129,STDIN,p_devel,2014-04-10 17:37:51,2014-04-10 17:39:56,2014-04-10 18:13:17,ibaccare@node129.eurora.cineca.it,0,0,0,--,,,COMPLETED,
56158,56165,586629.node129,STDIN,system,2014-04-12 13:54:42,2014-04-12 13:54:43,2014-04-12 14:04:38,gamati01@node129.eurora.cineca.it,0,0,0,--,,,COMPLETED,
57459,57581,588013.node129,STDIN,system,2014-04-14 10:47:44,2014-04-14 10:47:44,2014-04-14 11:48:31,gamati01@node129.eurora.cineca.it,0,0,0,--,,,COMPLETED,
57491,57613,588117.node129,STDIN,system,2014-04-14 11:48:41,2014-04-14 11:48:42,2014-04-14 12:49:37,gamati01@node129.eurora.cineca.it,0,0,0,--,,,COMPLETED,
57580,57702,588230.node129,STDIN,system,2014-04-14 13:24:06,2014-04-14 13:24:07,2014-04-14 14:24:54,gamati01@node129.eurora.cineca.it,0,0,0,--,,,COMPLETED,
57599,57721,588247.node129,STDIN,system,2014-04-14 14:24:57,2014-04-14 14:31:07,2014-04-14 15:31:50,gamati01@node129.eurora.cineca.it,0,0,0,--,,,COMPLETED,
57887,58009,588596.node129,STDIN,system,2014-04-14 17:26:21,2014-04-14 17:26:21,2014-04-14 18:10:21,gamati01@node129.eurora.cineca.it,0,0,0,--,,,COMPLETED,
58855,59096,589706.node129,STDIN,system,2014-04-15 10:47:28,2014-04-15 10:47:29,2014-04-15 11:48:10,gamati01@node129.eurora.cineca.it,0,0,0,--,,,COMPLETED,
58936,59177,589926.node129,STDIN,p_devel,2014-04-15 14:36:59,2014-04-15 14:36:59,2014-04-15 14:40:14,faffinit@node129.eurora.cineca.it,0,0,0,--,,,COMPLETED,
59208,59449,590236.node129,STDIN,system,2014-04-15 17:07:18,2014-04-15 17:07:19,2014-04-15 18:08:08,gamati01@node129.eurora.cineca.it,0,0,0,--,,,COMPLETED,


In [23]:
# time_req is compared as text, so this relies on zero-padded 'NN:NN' values.
# In a string comparison the '--' placeholder also sorts before '01:00', so it
# is excluded explicitly; otherwise those rows would be counted as short
# requests whenever they are still present in jobs_whole_data.
# NOTE(review): the message says "1 min" - confirm whether time_req is HH:MM or MM:SS.
# NOTE(review): the recorded output reports 405759 jobs while earlier cells report
# 405771, so jobs_whole_data was presumably changed by a cell not shown - confirm.
node_time_req_0 = jobs_whole_data[
    (jobs_whole_data['time_req'] != '--') & (jobs_whole_data['time_req'] < '01:00')
]
print("{} / {} jobs with time_req smaller than 1 min".format(node_time_req_0.shape[0], jobs_whole_data.shape[0]))

19854 / 405759 jobs with time_req smaller than 1 min


In [24]:
# Show the short-time_req jobs ordered by their time_req value.
node_time_req_0.sort_values(by='time_req')

Unnamed: 0,job_id,job_id_string,job_name,queue,start_time,run_start_time,end_time,user,node_req,cpu_req,mem_req,time_req,deleted,dependency,exit_status,pbs_exit_code
398632,400655,1643680.node129,job10,debug,2015-06-11 14:29:09,2015-06-11 14:29:09,2015-06-11 14:29:11,a08trb14@node129.eurora.cineca.it,1,10,1,00:00,,,COMPLETED,
398786,400809,1643834.node129,job10,debug,2015-06-11 15:14:00,2015-06-11 15:14:01,2015-06-11 15:14:02,a08trb14@node129.eurora.cineca.it,1,10,1,00:00,,,COMPLETED,
401615,403638,1647522.node129,job.sh,debug,2015-06-18 14:44:03,2015-06-18 14:44:03,2015-06-18 14:44:03,a08trb14@node129.eurora.cineca.it,1,1,1,00:00,,,COMPLETED,
399667,401690,1645014.node129,exercise04.sh,parallel,2015-06-12 12:07:52,2015-06-12 12:07:52,2015-06-12 12:07:54,a08trb11@node129.eurora.cineca.it,1,4,1,00:00,,,COMPLETED,
398296,400319,1643342.node129,exercise02.sh,parallel,2015-06-10 17:06:17,2015-06-10 17:06:17,2015-06-10 17:07:08,a08trb11@node129.eurora.cineca.it,1,2,1,00:00,,,COMPLETED,
404456,406479,1651552.node129,STDIN,parallel,2015-07-16 16:54:03,2015-07-16 16:54:04,2015-07-16 16:57:12,mcestari@node129.eurora.cineca.it,1,16,1,00:00,,,COMPLETED,
398783,400806,1643831.node129,job10,debug,2015-06-11 15:13:28,2015-06-11 15:13:28,2015-06-11 15:13:30,a08trb14@node129.eurora.cineca.it,1,10,1,00:00,,,COMPLETED,
397899,399922,1642928.node129,job6,debug,2015-06-10 15:18:00,2015-06-10 15:18:00,2015-06-10 15:18:02,a08trb14@node129.eurora.cineca.it,1,16,1,00:00,,,COMPLETED,
398100,400123,1643130.node129,job8,debug,2015-06-10 15:58:49,2015-06-10 15:58:50,2015-06-10 15:58:51,a08trb14@node129.eurora.cineca.it,1,4,1,00:00,,,COMPLETED,
6095,6096,507746.node129,STDIN,visual,2014-04-03 09:33:08,2014-04-03 09:33:08,2014-04-03 09:36:17,rmucci00@node129.eurora.cineca.it,0,0,0,00:00,,,COMPLETED,


### Jobs run during intervals of measurement failure