<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [42]:
import pandas as pd
import numpy as np
from datetime import datetime

# str: directory the raw CSV paths below are built from
BASE_RAW_DATA_DIR = '../data/raw'

# str: gpu.csv file location
GPU_CSV_FILE = BASE_RAW_DATA_DIR + '/gpu.csv'

# str: application-checkpoints.csv file location
CHECK_CSV_FILE = BASE_RAW_DATA_DIR + '/application-checkpoints.csv'

# str: task-x-y.csv file location
TASK_CSV_FILE = BASE_RAW_DATA_DIR + '/task-x-y.csv'

# str: strptime-style pattern used to parse timestamp strings into datetimes
TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%S.%fZ'

def timestamp_conv(df, timestamp_format=None):
    """Parse a column of timestamp strings into datetimes.

    Parameters
    ----------
    df : pandas.Series
        Column of timestamp strings to convert. Despite the name, the
        callers in this notebook pass a single column, not a whole frame.
    timestamp_format : str, optional
        strptime-style pattern the strings follow. When omitted, the
        module-level ``TIMESTAMP_FORMAT`` is used.

    Returns
    -------
    pandas.Series
        Parsed timestamps on the same index as the input. Missing entries
        come back as ``NaT``.

    Raises
    ------
    ValueError
        If a non-missing entry does not match the pattern.
    """
    if timestamp_format is None:
        timestamp_format = TIMESTAMP_FORMAT
    # One vectorized parse instead of a Python-level strptime call per row.
    return pd.to_datetime(df, format=timestamp_format)

def clean_gpu(gpu_df):
    """Clean the GPU dataframe.

    Drops the unneeded ``gpuSerial`` column and converts the ``timestamp``
    column from strings to datetimes.

    Parameters
    ----------
    gpu_df : pandas.DataFrame
        GPU dataframe to clean. It is not modified.

    Returns
    -------
    pandas.core.frame.DataFrame
        New dataframe without ``gpuSerial`` and with a parsed
        ``timestamp`` column.

    Raises
    ------
    KeyError
        If ``gpuSerial`` or ``timestamp`` is missing from ``gpu_df``.
    """
    # drop() without inplace and assign() both return new frames, so the
    # caller's frame stays intact and the function can be re-run on it.
    return gpu_df.drop(columns='gpuSerial').assign(
        timestamp=timestamp_conv(gpu_df['timestamp'])
    )

def merge_check_task(checkpoints_df, tasks_df):
    """Left-join the tasks onto the application checkpoints.

    Rows are matched on the ``taskId`` and ``jobId`` columns.

    Parameters
    ----------
    checkpoints_df
        application checkpoints dataframe (left side of the join)

    tasks_df
        tasks dataframe (right side of the join)

    Returns
    -------
    pandas.core.frame.DataFrame
        Checkpoint rows with the matching task columns attached

    """
    join_keys = ['taskId', 'jobId']
    merged = pd.merge(checkpoints_df, tasks_df, how='left', on=join_keys)
    return merged

def clean_check_task(check_task_df):
    """Clean the merged application-checkpoints/tasks dataframe.

    Drops the unneeded ``jobId`` and ``taskId`` columns and converts the
    ``timestamp`` column from strings to datetimes.

    Parameters
    ----------
    check_task_df : pandas.DataFrame
        Merged application checkpoints and tasks dataframe to clean. It is
        not modified.

    Returns
    -------
    pandas.core.frame.DataFrame
        New dataframe without the id columns and with a parsed
        ``timestamp`` column.

    Raises
    ------
    KeyError
        If ``jobId``, ``taskId`` or ``timestamp`` is missing from
        ``check_task_df``.
    """
    # drop() without inplace and assign() both return new frames, so the
    # caller's frame stays intact and the function can be re-run on it.
    return check_task_df.drop(columns=['jobId', 'taskId']).assign(
        timestamp=timestamp_conv(check_task_df['timestamp'])
    )

# Load the three raw CSV files.
gpu_df = pd.read_csv(GPU_CSV_FILE)
checkpoints_df = pd.read_csv(CHECK_CSV_FILE)
tasks_df = pd.read_csv(TASK_CSV_FILE)

# Clean the GPU readings; attach task info to the checkpoints, then clean.
gpu_df = clean_gpu(gpu_df)
check_task_df = clean_check_task(merge_check_task(checkpoints_df, tasks_df))

# Index both frames by timestamp and sort them by that index.
gpu_df = gpu_df.set_index('timestamp').sort_index()
check_task_df = check_task_df.set_index('timestamp').sort_index()

# Timestamp-only view of the checkpoints: the listed data columns are
# dropped, which (per the displayed output) leaves just the timestamp index.
timestamp_df = check_task_df.drop(
    columns=['hostname', 'eventName', 'eventType', 'x', 'y', 'level'])

# No merge_asof of gpu_df against timestamp_df is performed: that frame has
# no columns, so an index-on-index asof merge would hand gpu_df back
# unchanged while costing a full pass over the data.

# Join GPU readings to checkpoint events on the `hostname` column and the
# `timestamp` index level; pd.merge defaults to an inner join, so only
# exact (hostname, timestamp) matches are kept.
check_task_gpu_df = pd.merge(gpu_df, check_task_df, on=['hostname', 'timestamp'])

In [34]:
# Preview the first rows of the timestamp-indexed GPU frame.
gpu_df.head()

Unnamed: 0_level_0,hostname,gpuUUID,powerDrawWatt,gpuTempC,gpuUtilPerc,gpuMemUtilPerc
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-11-08 07:41:25.772,d8241877cd994572b46c861e5d144c85000017,GPU-e7c85eef-7253-2919-5f20-4a6325716726,25.2,32,0,0
2018-11-08 07:41:25.772,8b6a0eebc87b4cb2b0539e81075191b9000009,GPU-9477ad99-3f4a-88d1-1b59-6319b7d1e39d,26.22,30,0,0
2018-11-08 07:41:25.776,35bd84d72aca403b8129a7d652cc2750000008,GPU-57c0f325-92d5-41f2-a150-9b29b8317776,25.84,31,0,0
2018-11-08 07:41:25.778,4ad946d4435c42dabb5073531ea4f31500000Z,GPU-a2a14699-64df-bf7a-c5c7-7daa271482ac,26.32,29,0,0
2018-11-08 07:41:25.782,4c72fae95b9147189a0559269a6953ff00000A,GPU-feeb9f98-518f-fa06-12ab-d8df1b597944,25.95,27,0,0


In [35]:
# Preview timestamp_df; only the timestamp index is left after the column drop.
timestamp_df.head()

2018-11-08 07:41:30.957
2018-11-08 07:41:30.957
2018-11-08 07:41:30.960
2018-11-08 07:41:30.960
2018-11-08 07:41:30.962


In [36]:
# Preview the first rows of the merged checkpoint/task frame.
check_task_df.head()

Unnamed: 0_level_0,hostname,eventName,eventType,x,y,level
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-11-08 07:41:30.957,95b4ae6d890e4c46986d91d7ac4bf08200000Q,TotalRender,START,128,5,12
2018-11-08 07:41:30.957,95b4ae6d890e4c46986d91d7ac4bf08200000Q,Saving Config,START,128,5,12
2018-11-08 07:41:30.960,95b4ae6d890e4c46986d91d7ac4bf08200000Q,Render,START,128,5,12
2018-11-08 07:41:30.960,95b4ae6d890e4c46986d91d7ac4bf08200000Q,Saving Config,STOP,128,5,12
2018-11-08 07:41:30.962,95b4ae6d890e4c46986d91d7ac4bf08200000D,TotalRender,START,128,9,12


In [38]:
# Preview the first rows of the GPU readings joined with checkpoint/task events.
check_task_gpu_df.head()

Unnamed: 0_level_0,hostname,gpuUUID,powerDrawWatt,gpuTempC,gpuUtilPerc,gpuMemUtilPerc,eventName,eventType,x,y,level
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-11-08 07:41:31.313,4ad946d4435c42dabb5073531ea4f315000003,GPU-ca5f003a-4b66-f1b9-8cad-7899f6555909,26.6,28,0,0,TotalRender,START,87,3,12
2018-11-08 07:41:31.313,4ad946d4435c42dabb5073531ea4f315000003,GPU-ca5f003a-4b66-f1b9-8cad-7899f6555909,26.6,28,0,0,Saving Config,START,87,3,12
2018-11-08 07:41:31.794,4c72fae95b9147189a0559269a6953ff00001A,GPU-612ee29b-3d29-c9eb-e679-f59934f76713,24.86,30,0,0,TotalRender,START,180,7,12
2018-11-08 07:41:31.794,4c72fae95b9147189a0559269a6953ff00001A,GPU-612ee29b-3d29-c9eb-e679-f59934f76713,24.86,30,0,0,Saving Config,START,180,7,12
2018-11-08 07:41:31.834,4c72fae95b9147189a0559269a6953ff00000A,GPU-feeb9f98-518f-fa06-12ab-d8df1b597944,25.94,27,0,0,Render,START,70,7,12


In [43]:
# Count missing values per column of the joined frame.
check_task_gpu_df.isnull().sum()

hostname          0
gpuUUID           0
powerDrawWatt     0
gpuTempC          0
gpuUtilPerc       0
gpuMemUtilPerc    0
eventName         0
eventType         0
x                 0
y                 0
level             0
dtype: int64