<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [11]:
import pandas as pd
import numpy as np
from datetime import datetime

BASE_RAW_DATA_DIR = '../data/raw'

GPU_CSV_FILE = BASE_RAW_DATA_DIR + '/gpu.csv'
"""
str: gpu.csv file location 
"""

CHECK_CSV_FILE = BASE_RAW_DATA_DIR +  '/application-checkpoints.csv'
"""
str: application-checkpoints.csv filename file location 
"""

TASK_CSV_FILE = BASE_RAW_DATA_DIR + '/task-x-y.csv'
"""
str: task-x-y.csv file location 
"""

TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%S.%fZ'
"""
str: string used to format timestamp for datetime conversion
"""

def timestamp_conv(df):
    """ Converts a timestamp to datetime
    ----------
    df
        dataframe to convert to datetime
    -------
    float
         converted timestamp
    """
    df = df.apply(lambda x: datetime.strptime(x, TIMESTAMP_FORMAT))
    return(df)

def clean_gpu(gpu_df):
    """Clean gpu dataframe by dropping uneeded serial number and
    fixes timestamp format to datetime

    Parameters
    ----------
    gpu_df
        gpu dataframe to clean

    Returns
    -------
    pandas.core.frame.DataFrame
        Cleaned GPU dataframe

    """

    # Drop uneeded serial column

    gpu_df.drop(columns='gpuSerial', inplace=True)
    gpu_df['timestamp'] = timestamp_conv(gpu_df['timestamp'])
    
    return(gpu_df)

def merge_check_task(checkpoints_df, tasks_df):
    """merge (left join) checkpoints with task df through job and task id

    Parameters
    ----------
    checkpoints_df
        application checkpoints dataframe to merge

    tasks_df
        tasks dataframe to merge

    Returns
    -------
    pandas.core.frame.DataFrame
        Cleaned GPU dataframe

    """

    # Use left join on taskId and jobId

    check_task_df = checkpoints_df.merge(tasks_df,
                                     on=['taskId', 'jobId'], how='left')
    return (check_task_df)

def clean_check_task(check_task_df):
    """Removes uneeded ids for merged application checkpoints and tasks df 
    and fixes timestamp format to datetime

    Parameters
    ----------
    check_task_df
         merged application checkpoints and tasks df to clean

    Returns
    -------
    pandas.core.frame.DataFrame
        Cleaned GPU dataframe

    """

    # Drop uneeded ids

    check_task_df['timestamp'] = timestamp_conv(check_task_df['timestamp'])

    return(check_task_df)

gpu_df = pd.read_csv(GPU_CSV_FILE)
checkpoints_df = pd.read_csv(CHECK_CSV_FILE)
tasks_df = pd.read_csv(TASK_CSV_FILE)

gpu_df = clean_gpu(gpu_df)
check_task_df = merge_check_task(checkpoints_df, tasks_df)
check_task_df = clean_check_task(check_task_df)

In [12]:
check_task_df_start = check_task_df[check_task_df['eventType'] == 'START'].copy()
check_task_df_stop = check_task_df[check_task_df['eventType'] == 'STOP'].copy()

check_task_df_start.rename(index=str, columns={"timestamp": "start_time"}, inplace = True)
check_task_df_stop.rename(index=str, columns={"timestamp": "stop_time"}, inplace = True)

check_task_df_stop.drop('eventType', axis = 1, inplace = True)
check_task_df_start.drop('eventType', axis = 1, inplace = True)

In [13]:
check_task_flat_df = pd.merge(
    check_task_df_start, check_task_df_stop,
    on=['hostname', 'eventName', 'taskId'])

In [14]:
check_task_flat_df.head()

Unnamed: 0,start_time,hostname,eventName,jobId_x,taskId,x_x,y_x,level_x,stop_time,jobId_y,x_y,y_y,level_y
0,2018-11-08 07:42:29.842,0d56a730076643d585f77e00d2d8521a00000N,Saving Config,1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705,20fb9fcf-a927-4a4b-a64c-70258b66b42d,238,4,12,2018-11-08 07:42:29.845,1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705,238,4,12
1,2018-11-08 07:42:29.845,0d56a730076643d585f77e00d2d8521a00000N,Render,1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705,20fb9fcf-a927-4a4b-a64c-70258b66b42d,238,4,12,2018-11-08 07:43:10.965,1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705,238,4,12
2,2018-11-08 07:43:56.239,0d56a730076643d585f77e00d2d8521a00000N,Uploading,1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705,3dd4840c-47f2-4dcc-a775-df2ef6498d71,238,5,12,2018-11-08 07:43:57.245,1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705,238,5,12
3,2018-11-08 07:44:47.555,0d56a730076643d585f77e00d2d8521a00000N,Saving Config,1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705,c9e249d8-52ed-40c6-8713-b5cbf02ea87e,151,75,12,2018-11-08 07:44:47.557,1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705,151,75,12
4,2018-11-08 07:47:38.457,0d56a730076643d585f77e00d2d8521a00000N,Uploading,1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705,c8c93ada-22ea-4ca8-aa1b-9f9d4b89bfba,102,109,12,2018-11-08 07:47:39.357,1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705,102,109,12


In [15]:
check_task_flat_df_r = check_task_flat_df[
    (check_task_flat_df['start_time'] >= gpu_df['timestamp'][0]) 
    & (check_task_flat_df['stop_time'] <= gpu_df['timestamp'][len(gpu_df)-1])].copy()

len(check_task_flat_df_r)

17252

In [47]:
import pandasql as ps
import sqlite3

# connection to sql
conn = sqlite3.connect(':memory:')

# move dataframes to sql
check_task_flat_df_r.to_sql('CheckTask', conn, index=False)
gpu_df.to_sql('Gpu', conn, index=False)

# SQL query
query = '''
SELECT *
FROM Gpu
LEFT JOIN CheckTask ON gpu.hostname = CheckTask.hostname
WHERE gpu.timestamp >= CheckTask.start_time AND gpu.timestamp <= CheckTask.stop_time
'''

newdf = pd.read_sql_query(query, conn)
newdf = newdf.loc[:,~newdf.columns.duplicated()]

In [48]:
functions = {
    'powerDrawWatt': 'mean', 'gpuTempC': 'mean',
    'gpuUtilPerc': 'mean', 'gpuMemUtilPerc': 'mean',
    'start_time': '', 'stop_time': ''}

newdf = newdf.groupby(
    ['hostname', 'eventName', 'taskId'], as_index=False, sort=False
).agg(functions)


AttributeError: 'SeriesGroupBy' object has no attribute ''

In [39]:
len(newdf)

10184

In [44]:
newdf.head()

Unnamed: 0,hostname,eventName,taskId,powerDrawWatt,gpuTempC,gpuUtilPerc,gpuMemUtilPerc,start_time,stop_time
0,0d56a730076643d585f77e00d2d8521a00000Q,TotalRender,8180fce2-be69-4c77-89bb-72448abbe75a,96.807273,37.590909,70.318182,37.863636,2018-11-08 08:27:10.606000,2018-11-08 08:27:54.895000
1,0d56a730076643d585f77e00d2d8521a00000Q,Render,8180fce2-be69-4c77-89bb-72448abbe75a,96.807273,37.590909,70.318182,37.863636,2018-11-08 08:27:10.608000,2018-11-08 08:27:53.796000
2,83ea61ac1ef54f27a3bf7bd0f41ecaa700000J,Uploading,f11feb53-bb86-4393-909e-d4cbe942d540,42.44,41.0,0.0,0.0,2018-11-08 08:27:10.839000,2018-11-08 08:27:11.893000
3,83ea61ac1ef54f27a3bf7bd0f41ecaa700000J,Tiling,f11feb53-bb86-4393-909e-d4cbe942d540,42.44,41.0,0.0,0.0,2018-11-08 08:27:10.846000,2018-11-08 08:27:11.882000
4,b9a1fa7ae2f74eb68f25f607980f97d700000H,TotalRender,616720aa-2e66-410b-bb97-f360e8cfa90c,91.566957,38.695652,71.0,39.913043,2018-11-08 08:27:10.612000,2018-11-08 08:27:56.265000


In [46]:
newdf.head()

Unnamed: 0,timestamp,hostname,gpuUUID,powerDrawWatt,gpuTempC,gpuUtilPerc,gpuMemUtilPerc,start_time,eventName,jobId_x,taskId,x_x,y_x,level_x,stop_time,jobId_y,x_y,y_y,level_y
0,2018-11-08 08:27:11.089000,0d56a730076643d585f77e00d2d8521a00000Q,GPU-d84a1024-9381-c725-3b85-dd7143e64c35,27.18,35,0,0,2018-11-08 08:27:10.606000,TotalRender,1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705,8180fce2-be69-4c77-89bb-72448abbe75a,156,186,12,2018-11-08 08:27:54.895000,1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705,156,186,12
1,2018-11-08 08:27:11.089000,0d56a730076643d585f77e00d2d8521a00000Q,GPU-d84a1024-9381-c725-3b85-dd7143e64c35,27.18,35,0,0,2018-11-08 08:27:10.608000,Render,1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705,8180fce2-be69-4c77-89bb-72448abbe75a,156,186,12,2018-11-08 08:27:53.796000,1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705,156,186,12
2,2018-11-08 08:27:10.949000,83ea61ac1ef54f27a3bf7bd0f41ecaa700000J,GPU-8792a29c-529e-1837-1806-c669cd9b1960,42.44,41,0,0,2018-11-08 08:27:10.839000,Uploading,1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705,f11feb53-bb86-4393-909e-d4cbe942d540,200,23,12,2018-11-08 08:27:11.893000,1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705,200,23,12
3,2018-11-08 08:27:10.949000,83ea61ac1ef54f27a3bf7bd0f41ecaa700000J,GPU-8792a29c-529e-1837-1806-c669cd9b1960,42.44,41,0,0,2018-11-08 08:27:10.846000,Tiling,1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705,f11feb53-bb86-4393-909e-d4cbe942d540,200,23,12,2018-11-08 08:27:11.882000,1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705,200,23,12
4,2018-11-08 08:27:10.760000,b9a1fa7ae2f74eb68f25f607980f97d700000H,GPU-38bbf3b6-80fb-7e6f-6678-ee45035507ab,42.21,37,0,0,2018-11-08 08:27:10.612000,TotalRender,1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705,616720aa-2e66-410b-bb97-f360e8cfa90c,160,14,12,2018-11-08 08:27:56.265000,1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705,160,14,12


In [1]:
newdf.head()

NameError: name 'newdf' is not defined

In [None]:
check_task_flat_df_r
gpu_df

df_merge = pd.merge(check_task_flat_df_r, gpu_df, on='hostname')

df_merge = df_merge.query('timestamp >= start_time and timestamp <= stop_time')

df_out = df_A.merge(df_merge, on=['start_date','end_date'], how='left').dropna()

df_out.head()

In [None]:
len(timestamp_df)

In [None]:
# set keys as indexes for join 
    
#gpu_df.set_index('timestamp', inplace=True)
#check_task_df.set_index('timestamp', inplace=True)
    
# sort by index

#gpu_df.sort_index(inplace=True)
#check_task_df.sort_index(inplace=True)

gpu_df.sort_values(by=['timestamp'], inplace=True)
check_task_df.sort_values(by=['timestamp'], inplace=True)

# Make timestamp df

timestamp_df = check_task_df.copy()
timestamp_df.drop(['hostname', 'eventName', 'eventType', 'x', 'y', 'level'], axis=1, inplace= True)

timestamp_df = pd.merge_asof(timestamp_df,gpu_df,
                       left_on = ['timestamp'], right_on = ['timestamp'],
                       tolerance = pd.Timedelta('4ms'), direction = 'nearest')

In [None]:
len(timestamp_df)

In [None]:
timestamp_df.head()

In [None]:
timestamp_df.isnull().sum()

In [None]:
timestamp_df.dropna(inplace = True)

In [None]:
timestamp_df.isnull().sum()

In [None]:
len(timestamp_df)

In [None]:
check_task_gpu_df = pd.merge(timestamp_df, check_task_df, on = ['hostname', 'timestamp'])

In [None]:
check_task_gpu_df.isnull().sum()

In [None]:
len(check_task_gpu_df)

In [None]:
timestamp_df.head()

In [None]:
check_task_df.head()

In [None]:
check_task_gpu_df.head()

In [None]:
check_task_gpu_df.isnull().sum()

In [None]:
len(check_task_gpu_df)