In [9]:
import pandas as pd
import csv
import os
import numpy as np
import datetime
from xlsxwriter.workbook import Workbook
from xlsxwriter import Workbook
import time
import pytz


In [2]:
Working_dir = os.getcwd()
Data_dir = os.path.join(Working_dir, "Raw data")

In [3]:
# lsf_df = pd.read_csv( os.path.join( Data_dir, "LSF_Cresco6_v2.csv" ), header=0, delimiter=";" )
lsf_df = pd.read_csv( os.path.join( Data_dir, 'LSF_Cresco6.csv' ), header=0, delimiter=';' )

In [4]:
lsf_df.head()

Unnamed: 0,id,jobid,numcores,user,queue,directory,executable,jobstatus,start,stop,numhost
0,1,51659,1,lsf,cresco6_h1,/afs/.enea.it/software/lsf/9.1.3/portici/conf,hostname,EXIT,1528886340,1528886343,1
1,2,51936,192,guarnier,cresco6_test,MPIHELLO,./submit.sh,EXIT,1528896981,1528896985,4
2,3,51939,192,guarnier,cresco6_test,MPIHELLO,./submit.sh,EXIT,1528897444,1528897447,4
3,4,51940,192,guarnier,cresco6_test,MPIHELLO,./submit.sh,EXIT,1528897682,1528897683,4
4,5,51942,192,guarnier,cresco6_test,MPIHELLO,./submit.sh,EXIT,1528897832,1528897837,4


-----
## Minimal preprocessing
- Convert unix timestamps to datetime format
- Sort jobs by finish time 

In [55]:
# # tz - makes it a localized timestamp
# tz = pytz.timezone('Europe/Rome')
# lsf_df['start_timestamp_py'] = lsf_df['start'].apply(lambda x: datetime.datetime.fromtimestamp(x, tz))
# lsf_df['stop_timestamp_py'] = lsf_df['stop'].apply(lambda x: datetime.datetime.fromtimestamp(x, tz))

In [57]:
# fromtimestamp - local timestamp
lsf_df['start_timestamp_py'] = lsf_df['start'].apply(lambda x: datetime.datetime.fromtimestamp(x))
lsf_df['stop_timestamp_py'] = lsf_df['stop'].apply(lambda x: datetime.datetime.fromtimestamp(x))

In [58]:
lsf_df = lsf_df.sort_values('start_timestamp_py')

In [59]:
lsf_df.head()

Unnamed: 0,id,jobid,numcores,user,queue,directory,executable,jobstatus,start,stop,numhost,start_timestamp_py,stop_timestamp_py
0,1,51659,1,lsf,cresco6_h1,/afs/.enea.it/software/lsf/9.1.3/portici/conf,hostname,EXIT,1528886340,1528886343,1,2018-06-13 12:39:00,2018-06-13 12:39:03
1,2,51936,192,guarnier,cresco6_test,MPIHELLO,./submit.sh,EXIT,1528896981,1528896985,4,2018-06-13 15:36:21,2018-06-13 15:36:25
2,3,51939,192,guarnier,cresco6_test,MPIHELLO,./submit.sh,EXIT,1528897444,1528897447,4,2018-06-13 15:44:04,2018-06-13 15:44:07
3,4,51940,192,guarnier,cresco6_test,MPIHELLO,./submit.sh,EXIT,1528897682,1528897683,4,2018-06-13 15:48:02,2018-06-13 15:48:03
4,5,51942,192,guarnier,cresco6_test,MPIHELLO,./submit.sh,EXIT,1528897832,1528897837,4,2018-06-13 15:50:32,2018-06-13 15:50:37


### Zero stop time observed in the following number of cases

In [60]:
print(lsf_df[lsf_df.stop == 0].shape[0], "\nMissing data percentage", \
      round(lsf_df[lsf_df.stop == 0].shape[0]/lsf_df.shape[0]*100, 2))

10543 
Missing data percentage 39.09


### Missing data is observed in the folowing period

In [61]:
lsf_df[lsf_df.stop == 0]['start_timestamp_py'].min(), lsf_df[lsf_df.stop == 0]['start_timestamp_py'].max()

(Timestamp('2018-06-14 18:54:44'), Timestamp('2018-12-12 11:23:06'))

### While the dataset covers the period

In [62]:
print( 'Start:\t', lsf_df['start_timestamp_py'].min(), '\nStop:\t', lsf_df['stop_timestamp_py'].max() )

Start:	 2018-06-13 12:39:00 
Stop:	 2018-12-13 09:20:57


### Job id of jobs with 0 stop time submitted by different users

In [63]:
jobids = lsf_df[(lsf_df.stop == 0)]['jobid']

### Any other stop/start time errors

In [64]:
np.argwhere((lsf_df[lsf_df.stop == 0].index == lsf_df[lsf_df.stop < lsf_df.start].index)==False)

array([], shape=(0, 1), dtype=int64)

No

### Users who submitted jobs with zero reported stop time

In [21]:
lsf_df[lsf_df.stop == 0]["user"].unique(), '\n', len(lsf_df[lsf_df.stop == 0]["user"].unique()))

['iannone' 'guarnier' 'vlad' 'adani' 'anavf' 'sergio' 'disidoro' 'gusso'
 'eugenio' 'sannino2' 'briganti' 'ambrosin' 'vfain' 'cappelle' 'romanelg'
 'giuseps' 'calchett' 'aprea' 'dinardo' 'pergreff' 'gutierre' 'denicola'
 'gianninl' 'dcecere' 'zhao' 'buonocor' 'kb' 'pconsole' 'polidori'
 'palombi' 'crescenz' 'acolange' 'cannunz' 'amchiar' 'meineri' 'battista'] 
 36


In [65]:
unique_users_jobids = lsf_df[lsf_df.stop == 0].drop_duplicates('user')[['jobid', 'user', 'start', 'stop']]

excel_writer_unique_users_jobids = pd.ExcelWriter( os.path.join( Data_dir, "LSF_Cresco6_zero_stop_time.xlsx"))
unique_users_jobids.to_excel(excel_writer_unique_users_jobids)
excel_writer_unique_users_jobids.save()

# unique_users_jobids

### Numhost parameter for jobs with empty finish time

In [66]:
lsf_df[lsf_df.stop == 0]["numhost"].unique()

array([         0, 1535645928, 1535646037, 1542636100, 1543999309,
       1543999348, 1544014268], dtype=int64)

In [67]:
print("Number of cases when numhost parameter == 0: ", lsf_df[(lsf_df.stop == 0) & (lsf_df.numhost == 0)].shape[0])

Number of cases when numhost parameter == 0:  10531


### Application names of jobs with zero finish time

In [68]:
print( "Unique names:", len(lsf_df[lsf_df.stop == 0]["executable"].unique()))

Unique names: 357


In [70]:
# lsf_df[lsf_df.stop == 0]["executable"].unique()

### Directory names of jobs with zero finish time

In [72]:
print( "Unique directory names:", len(lsf_df[lsf_df.stop == 0]["directory"].unique()))

Unique directory names: 470


In [71]:
# lsf_df[lsf_df.stop == 0]["directory"].unique()

### Number of cores used by jobs with error in finish time

In [73]:
lsf_df[lsf_df.stop == 0]["numcores"].min(), lsf_df[lsf_df.stop == 0]["numcores"].max()

(1, 10032)

In [74]:
print("Number of cores variation:", len(lsf_df[lsf_df.stop == 0]["numcores"].unique()), "unique values")

Number of cores variation: 65 unique values


### Unique queue names when stop time != 0

In [77]:
lsf_df[lsf_df.stop != 0].queue.unique()

array(['cresco6_h1', 'cresco6_test', 'hpc_ha_h24', 'system',
       'cresco6_48h24', 'small_10m', 'cresco6_h144', 'cresco4_16h24',
       'cresco6_h4'], dtype=object)

## Data inconsistency note
- 10543 entries with 0 as stop unix timestamp, which is 39% of all entries
- The entries are spread throughout all the months of June - December 2018

# Important
If not agreed otherwise, these entries should be discarded

------

### Data covers the following period

In [75]:
print(lsf_df.start_timestamp_py.min(), " - ",  lsf_df.stop_timestamp_py.max())

2018-06-13 12:39:00  -  2018-12-13 09:20:57


### Data describes processing of the following number of jobs

In [28]:
len(lsf_df.jobid.unique())

26332

### Percentage of unique jobs out of all records

In [31]:
round(len(lsf_df.jobid.unique())/lsf_df.shape[0]*100, 2)

97.63

### Checking for duplicates

In [60]:
lsf_df[lsf_df.duplicated(keep=False)]

Unnamed: 0,id,jobid,numcores,user,queue,directory,executable,jobstatus,start,stop,numhost,start_timestamp_py,stop_timestamp_py


No duplicates found




----------------

# From this time on - analysis is performed on jobs with non-zero stop time

## Add duration of all the jobs processing 

In [53]:
lsf_df_cleaned_v1 = lsf_df[lsf_df.stop != 0].copy()

In [54]:
lsf_df_cleaned_v1 = lsf_df_cleaned_v1.assign(job_duration_sec = lsf_df_cleaned_v1.stop - lsf_df_cleaned_v1.start)
lsf_df_cleaned_v1.head()

Unnamed: 0,id,jobid,numcores,user,queue,directory,executable,jobstatus,start,stop,numhost,start_timestamp_py,stop_timestamp_py,job_duration_sec
0,1,51659,1,lsf,cresco6_h1,/afs/.enea.it/software/lsf/9.1.3/portici/conf,hostname,32,1528886340,1528886343,1,2018-06-13 10:39:00,2018-06-13 10:39:03,3
1,2,51936,192,guarnier,cresco6_test,MPIHELLO,./submit.sh,32,1528896981,1528896985,4,2018-06-13 13:36:21,2018-06-13 13:36:25,4
2,3,51939,192,guarnier,cresco6_test,MPIHELLO,./submit.sh,32,1528897444,1528897447,4,2018-06-13 13:44:04,2018-06-13 13:44:07,3
3,4,51940,192,guarnier,cresco6_test,MPIHELLO,./submit.sh,32,1528897682,1528897683,4,2018-06-13 13:48:02,2018-06-13 13:48:03,1
4,5,51942,192,guarnier,cresco6_test,MPIHELLO,./submit.sh,32,1528897832,1528897837,4,2018-06-13 13:50:32,2018-06-13 13:50:37,5


## Add full name of executable = "directory name" + "/" + "executable name"

In [66]:
lsf_df_cleaned_v1['executable_full_name'] = \
lsf_df_cleaned_v1['directory'].apply(lambda x: \
                                     str(x) if type(x) in (float, int) else x ) + "/" \
+ lsf_df_cleaned_v1["executable"].apply( lambda x: str(x) if type(x) in (float, int) else \
                                    (x.strftime('%H-%M-%S') if type(x)==datetime.datetime else x) )

In [72]:
lsf_df_cleaned_v1.loc[0, "executable_full_name"], \
lsf_df_cleaned_v1.loc[0, "directory"], \
lsf_df_cleaned_v1.loc[0, "executable"]

('/afs/.enea.it/software/lsf/9.1.3/portici/conf/hostname',
 '/afs/.enea.it/software/lsf/9.1.3/portici/conf',
 'hostname')

## Unique jobids correspond to unique names?

In [82]:
len(lsf_df_cleaned_v1.jobid.unique()), len(lsf_df_cleaned_v1.executable_full_name.unique())

(15835, 4387)

No

In [83]:
excel_writer_lsf_cleaned = pd.ExcelWriter( os.path.join( Data_dir, "LSF_Cresco6_cleaned.xlsx"))
lsf_df_cleaned_v1.to_excel(excel_writer_lsf_cleaned)
excel_writer_lsf_cleaned.save()

## Queues names

In [33]:
print( np.unique(lsf_df.queue) )

['cresco4_16h24' 'cresco6_48h24' 'cresco6_h1' 'cresco6_h144' 'cresco6_h4'
 'cresco6_test' 'hpc_ha_h24' 'small_10m' 'system']


-----------

# Info needed
- queue duration - maybe
- discuss a question of jobs with finish time == 0 (39% of all the data entries/rows)
- discuss what to use from the dataset