In [2]:
import pandas as pd
import csv
import os
import numpy as np
import datetime
from xlsxwriter.workbook import Workbook
from xlsxwriter import Workbook
import time

In [3]:
# Directory the notebook process is currently running in; every path below is built from it.
Working_dir = os.getcwd()
# Input and output spreadsheets are read from / written to the "Raw data" sub-folder.
Data_dir = os.path.join(Working_dir, "Raw data")

In [8]:
# Load the raw job-log spreadsheet; the first sheet row holds the column names.
lsf_raw_xlsx_path = os.path.join(Data_dir, "LSF_Cresco6.xlsx")
lsf_df = pd.read_excel(lsf_raw_xlsx_path, header=0)

In [9]:
# Preview the first five raw records.
lsf_df.head(n=5)

Unnamed: 0,id,jobid,numcores,user,queue,directory,executable,jobstatus,start,stop,numhost
0,1,51659,1,lsf,cresco6_h1,/afs/.enea.it/software/lsf/9.1.3/portici/conf,hostname,32,1528886340,1528886343,1
1,2,51936,192,guarnier,cresco6_test,MPIHELLO,./submit.sh,32,1528896981,1528896985,4
2,3,51939,192,guarnier,cresco6_test,MPIHELLO,./submit.sh,32,1528897444,1528897447,4
3,4,51940,192,guarnier,cresco6_test,MPIHELLO,./submit.sh,32,1528897682,1528897683,4
4,5,51942,192,guarnier,cresco6_test,MPIHELLO,./submit.sh,32,1528897832,1528897837,4


-----
## Minimal preprocessing
- Convert unix timestamps to datetime format
- Sort jobs by finish time 

In [10]:
# Convert the unix-epoch second counters to naive UTC datetimes.
# pd.to_datetime(..., unit="s") is vectorized and avoids
# datetime.datetime.utcfromtimestamp, which is deprecated since Python 3.12.
# A value of 0 maps to 1970-01-01 00:00:00.
lsf_df["start_timestamp_py"] = pd.to_datetime(lsf_df["start"], unit="s")
lsf_df["stop_timestamp_py"] = pd.to_datetime(lsf_df["stop"], unit="s")

In [16]:
# Order the records by finish time, earliest first.
lsf_df = lsf_df.sort_values(by='stop_timestamp_py', ascending=True)

In [17]:
# Preview the first five records after sorting by finish time.
lsf_df.head(n=5)

Unnamed: 0,id,jobid,numcores,user,queue,directory,executable,jobstatus,start,stop,numhost,start_timestamp_py,stop_timestamp_py
13503,13504,458347,48,disidoro,cresco6_48h24,minniwrf/run/LESOTHO/namelist_ultima/2015-12-07,./script_lancia_real.exe,32,1540476042,0,0,2018-10-25 14:00:42,1970-01-01
12216,12217,448984,192,disidoro,cresco6_48h24,minniwrf/run/LESOTHO/namelist_ultima/2015-09-07,./script_lancia_OMP.exe,32,1540306821,0,0,2018-10-23 15:00:21,1970-01-01
12213,12214,448981,48,disidoro,cresco6_48h24,minniwrf/run/LESOTHO/namelist_ultima/2015-09-06,./script_lancia_real.exe,32,1540306820,0,0,2018-10-23 15:00:20,1970-01-01
12214,12215,448982,192,disidoro,cresco6_48h24,minniwrf/run/LESOTHO/namelist_ultima/2015-09-06,./script_lancia_OMP.exe,32,1540306820,0,0,2018-10-23 15:00:20,1970-01-01
12215,12216,448983,48,disidoro,cresco6_48h24,minniwrf/run/LESOTHO/namelist_ultima/2015-09-07,./script_lancia_real.exe,32,1540306820,0,0,2018-10-23 15:00:20,1970-01-01


### Zero stop time observed in the following number of cases

In [84]:
# Row masks for the two anomaly definitions.
zero_stop_mask = lsf_df["stop"] == 0
stop_before_start_mask = lsf_df["stop"] < lsf_df["start"]

# Plain Python ints so the displayed tuple stays (count, count, percentage).
n_zero_stop = int(zero_stop_mask.sum())
n_stop_before_start = int(stop_before_start_mask.sum())
n_records = lsf_df.shape[0]

(n_zero_stop, n_stop_before_start, round(n_zero_stop / n_records * 100, 2))

(10543, 10543, 39.09)

### And in the following period

In [24]:
# Start times of the records whose stop timestamp is 0; show earliest and latest.
zero_stop_start_times = lsf_df.loc[lsf_df["stop"] == 0, "start_timestamp_py"]
(zero_stop_start_times.min(), zero_stop_start_times.max())

(Timestamp('2018-06-14 16:54:44'), Timestamp('2018-12-12 10:23:06'))

### Any other stop/start time errors

In [88]:
# Cross-check the two anomaly definitions row by row:
#   * stop timestamp equal to 0
#   * stop timestamp earlier than start timestamp
zero_stop = lsf_df["stop"] == 0
stop_before_start = lsf_df["stop"] < lsf_df["start"]

# XOR marks the rows flagged by exactly one of the two conditions.
# Working on full-length row masks (instead of comparing the indexes of the
# two filtered frames) keeps the check valid when the two selections have
# different sizes; an element-wise index comparison raises ValueError then.
# The result lists row positions within lsf_df; an empty array means the two
# definitions select exactly the same rows.
np.argwhere((zero_stop ^ stop_before_start).values)

array([], shape=(0, 1), dtype=int64)

### Users who submitted jobs with zero reported stop time

In [89]:
# Distinct users among the records whose stop timestamp is 0.
lsf_df.loc[lsf_df["stop"] == 0, "user"].unique()

array(['disidoro', 'briganti', 'eugenio', 'vfain', 'ambrosin', 'guarnier',
       'giuseps', 'pergreff', 'sannino2', 'calchett', 'romanelg',
       'dinardo', 'aprea', 'cappelle', 'iannone', 'gusso', 'polidori',
       'palombi', 'gianninl', 'dcecere', 'crescenz', 'acolange',
       'amchiar', 'cannunz', 'sergio', 'kb', 'pconsole', 'gutierre',
       'meineri', 'battista', 'vlad', 'anavf', 'adani', 'denicola',
       'buonocor', 'zhao'], dtype=object)

### Numhost parameter for jobs with empty finish time

In [90]:
# Distinct numhost values among the records whose stop timestamp is 0.
lsf_df.loc[lsf_df["stop"] == 0, "numhost"].unique()

array([         0, 1543999309, 1543999348, 1544014268, 1535646037,
       1535645928, 1541659532], dtype=int64)

## Data inconsistency note
- 10543 entries with 0 as stop unix timestamp, which is 39% of all entries
- The entries are spread throughout all the months of June - December 2018

# Important
If not agreed otherwise, these entries should be discarded

------

### Data covers the following period

In [34]:
# Earliest start and latest stop over all records.
first_start = lsf_df["start_timestamp_py"].min()
last_stop = lsf_df["stop_timestamp_py"].max()
print(first_start, last_stop, sep="  -  ")

2018-06-13 10:39:00  -  2018-12-13 08:20:57


### Data describes processing of the following number of jobs

In [28]:
# Number of distinct job ids in the log.
lsf_df["jobid"].unique().size

26332

### Percentage of unique jobs out of all records

In [31]:
# Share of distinct job ids among all records, in percent (two decimals).
distinct_jobids = lsf_df["jobid"].unique().size
total_records = len(lsf_df)
round(distinct_jobids / total_records * 100, 2)

97.63

### Checking for duplicates

In [60]:
# Show every row that is an exact duplicate of another row (all occurrences kept).
is_duplicated_row = lsf_df.duplicated(keep=False)
lsf_df.loc[is_duplicated_row]

Unnamed: 0,id,jobid,numcores,user,queue,directory,executable,jobstatus,start,stop,numhost,start_timestamp_py,stop_timestamp_py


# From this point on, the analysis is performed only on jobs with a non-zero stop time

## Add the processing duration of each job

In [53]:
# Keep only the records with a non-zero stop timestamp; copy so later column
# additions do not touch lsf_df.
has_stop_time = lsf_df["stop"] != 0
lsf_df_cleaned_v1 = lsf_df.loc[has_stop_time].copy()

In [54]:
# Job duration in seconds = stop - start (both are unix timestamps in seconds).
lsf_df_cleaned_v1 = lsf_df_cleaned_v1.assign(
    job_duration_sec=lsf_df_cleaned_v1["stop"].sub(lsf_df_cleaned_v1["start"])
)
lsf_df_cleaned_v1.head(n=5)

Unnamed: 0,id,jobid,numcores,user,queue,directory,executable,jobstatus,start,stop,numhost,start_timestamp_py,stop_timestamp_py,job_duration_sec
0,1,51659,1,lsf,cresco6_h1,/afs/.enea.it/software/lsf/9.1.3/portici/conf,hostname,32,1528886340,1528886343,1,2018-06-13 10:39:00,2018-06-13 10:39:03,3
1,2,51936,192,guarnier,cresco6_test,MPIHELLO,./submit.sh,32,1528896981,1528896985,4,2018-06-13 13:36:21,2018-06-13 13:36:25,4
2,3,51939,192,guarnier,cresco6_test,MPIHELLO,./submit.sh,32,1528897444,1528897447,4,2018-06-13 13:44:04,2018-06-13 13:44:07,3
3,4,51940,192,guarnier,cresco6_test,MPIHELLO,./submit.sh,32,1528897682,1528897683,4,2018-06-13 13:48:02,2018-06-13 13:48:03,1
4,5,51942,192,guarnier,cresco6_test,MPIHELLO,./submit.sh,32,1528897832,1528897837,4,2018-06-13 13:50:32,2018-06-13 13:50:37,5


## Add full name of executable = directory/executable

In [66]:
def _path_part_as_text(value):
    """Return a spreadsheet cell value as text so it can be string-concatenated.

    - date/time objects are rendered as 'HH-MM-SS'
      (presumably cells the spreadsheet reader auto-parsed as times - TODO confirm);
    - numbers are rendered with str();
    - anything else (normally already a string) is returned unchanged.

    isinstance() is used instead of an exact type() match so that subclasses
    such as numpy floats and pandas Timestamps are converted as well.
    """
    if isinstance(value, (datetime.datetime, datetime.time)):
        return value.strftime('%H-%M-%S')
    if isinstance(value, (float, int)):
        return str(value)
    return value


# Full executable name = "<directory>/<executable>", built element-wise.
lsf_df_cleaned_v1['executable_full_name'] = (
    lsf_df_cleaned_v1['directory'].apply(_path_part_as_text)
    + "/"
    + lsf_df_cleaned_v1["executable"].apply(_path_part_as_text)
)

In [72]:
# Spot-check the row labelled 0: combined name next to its two components.
tuple(
    lsf_df_cleaned_v1.loc[0, column]
    for column in ("executable_full_name", "directory", "executable")
)

('/afs/.enea.it/software/lsf/9.1.3/portici/conf/hostname',
 '/afs/.enea.it/software/lsf/9.1.3/portici/conf',
 'hostname')

## Do unique jobids correspond to unique full executable names?

In [82]:
# Compare the number of distinct job ids with the number of distinct full executable names.
tuple(
    len(lsf_df_cleaned_v1[column].unique())
    for column in ("jobid", "executable_full_name")
)

(15835, 4387)

No

In [83]:
# Export the cleaned table next to the raw data.
# The context manager saves and closes the workbook on exit, including when
# to_excel raises; ExcelWriter.save() is deprecated in pandas 1.5 and removed
# in pandas 2.0.
with pd.ExcelWriter(os.path.join(Data_dir, "LSF_Cresco6_cleaned.xlsx")) as excel_writer_lsf_cleaned:
    lsf_df_cleaned_v1.to_excel(excel_writer_lsf_cleaned)

-----------

# Info needed
- queue duration - maybe
- discuss how to handle jobs with finish time == 0 (39% of all the data entries/rows)
- discuss what to use from the dataset