<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [15]:
import pandas as pd
import numpy as np
from datetime import datetime

BASE_RAW_DATA_DIR = '../data/raw'

GPU_CSV_FILE = BASE_RAW_DATA_DIR + '/gpu.csv'
"""
str: gpu.csv file location 
"""

CHECK_CSV_FILE = BASE_RAW_DATA_DIR +  '/application-checkpoints.csv'
"""
str: application-checkpoints.csv filename file location 
"""

TASK_CSV_FILE = BASE_RAW_DATA_DIR + '/task-x-y.csv'
"""
str: task-x-y.csv file location 
"""

TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%S.%fZ'
"""
str: string used to format timestamp for datetime conversion
"""

def timestamp_conv(df):
    """ Converts a timestamp to datetime
    ----------
    df
        dataframe to convert to datetime
    -------
    float
         converted timestamp
    """
    df = df.apply(lambda x: datetime.strptime(x, TIMESTAMP_FORMAT))
    return(df)

def clean_gpu(gpu_df):
    """Clean gpu dataframe by dropping uneeded serial number and
    fixes timestamp format to datetime

    Parameters
    ----------
    gpu_df
        gpu dataframe to clean

    Returns
    -------
    pandas.core.frame.DataFrame
        Cleaned GPU dataframe

    """

    # Drop uneeded serial column

    gpu_df.drop(columns='gpuSerial', inplace=True)
    gpu_df['timestamp'] = timestamp_conv(gpu_df['timestamp'])
    
    return(gpu_df)

def merge_check_task(checkpoints_df, tasks_df):
    """merge (left join) checkpoints with task df through job and task id

    Parameters
    ----------
    checkpoints_df
        application checkpoints dataframe to merge

    tasks_df
        tasks dataframe to merge

    Returns
    -------
    pandas.core.frame.DataFrame
        Cleaned GPU dataframe

    """

    # Use left join on taskId and jobId

    check_task_df = checkpoints_df.merge(tasks_df,
                                     on=['taskId', 'jobId'], how='left')
    return (check_task_df)

def clean_check_task(check_task_df):
    """Removes uneeded ids for merged application checkpoints and tasks df 
    and fixes timestamp format to datetime

    Parameters
    ----------
    check_task_df
         merged application checkpoints and tasks df to clean

    Returns
    -------
    pandas.core.frame.DataFrame
        Cleaned GPU dataframe

    """

    # Drop uneeded ids

    check_task_df.drop(columns= ['jobId', 'taskId'], inplace=True)
    check_task_df['timestamp'] = timestamp_conv(check_task_df['timestamp'])

    return(check_task_df)

gpu_df = pd.read_csv(GPU_CSV_FILE)
checkpoints_df = pd.read_csv(CHECK_CSV_FILE)
tasks_df = pd.read_csv(TASK_CSV_FILE)

gpu_df = clean_gpu(gpu_df)
check_task_df = merge_check_task(checkpoints_df, tasks_df)
check_task_df = clean_check_task(check_task_df)

In [17]:
check_task_df_start = check_task_df[check_task_df['eventType'] == 'START'].copy()
check_task_df_stop = check_task_df[check_task_df['eventType'] == 'STOP'].copy()

check_task_df_start.rename(index=str, columns={"timestamp": "start_time"}, inplace = True)
check_task_df_stop.rename(index=str, columns={"timestamp": "stop_time"}, inplace = True)

check_task_df_stop.drop('eventType', axis = 1, inplace = True)
check_task_df_start.drop('eventType', axis = 1, inplace = True)

In [18]:
check_task_flat_df = pd.merge(
    check_task_df_start, check_task_df_stop,
    on=['hostname', 'eventName', 'x', 'y', 'level'])

In [30]:
df_A = pd.DataFrame({'start_date':['2017-03-27','2017-01-10'],'end_date':['2017-04-20','2017-02-01']})
df_B = pd.DataFrame({'event_date':['2017-01-20','2017-01-27'],'price':[100,200]})

df_A['end_date'] = pd.to_datetime(df_A.end_date)
df_A['start_date'] = pd.to_datetime(df_A.start_date)
df_B['event_date'] = pd.to_datetime(df_B.event_date)

df_A = df_A.assign(key=1)
df_B = df_B.assign(key=1)
df_merge = pd.merge(df_A, df_B, on='key').drop('key',axis=1)

df_merge = df_merge.query('event_date >= start_date and event_date <= end_date')

df_out = df_A.merge(df_merge, on=['start_date','end_date'], how='left').fillna('').drop('key', axis=1)

print(df_out)

  start_date   end_date           event_date price
0 2017-03-27 2017-04-20                           
1 2017-01-10 2017-02-01  2017-01-20 00:00:00   100
2 2017-01-10 2017-02-01  2017-01-27 00:00:00   200


In [19]:
check_task_flat_df_r = check_task_flat_df[
    (check_task_flat_df['start_time'] >= gpu_df['timestamp'][0]) 
    & (check_task_flat_df['stop_time'] <= gpu_df['timestamp'][len(gpu_df)-1])].copy()

len(check_task_flat_df_r)

17252

In [14]:
check_task_flat_df_r.index = pd.IntervalIndex.from_arrays(
    check_task_flat_df_r['start_time'],
    check_task_flat_df_r['stop_time'], 
    closed='both')

gpu_df['eventName'] = gpu_df['timestamp'].apply(
    lambda x : check_task_flat_df_r.iloc[
        check_task_flat_df_r.index.get_loc(x)]['eventName'])

KeyError: ('datetime64[ns]', 'both')

In [25]:
import pandasql as ps
import sqlite3

# connection to sql
conn = sqlite3.connect(':memory:')

# move dataframes to sql
check_task_flat_df_r.to_sql('CheckTask', conn, index=False)
gpu_df.to_sql('Gpu', conn, index=False)

# SQL query
query = '''
SELECT *
FROM Gpu
LEFT JOIN CheckTask ON gpu.hostname = CheckTask.hostname
WHERE gpu.timestamp >= CheckTask.start_time AND gpu.timestamp <= CheckTask.stop_time
'''

newdf = pd.read_sql_query(query, conn)

In [28]:
newdf.head()

Unnamed: 0,timestamp,hostname,gpuUUID,powerDrawWatt,gpuTempC,gpuUtilPerc,gpuMemUtilPerc,start_time,hostname.1,eventName,x,y,level,stop_time
0,2018-11-08 08:27:11.089000,0d56a730076643d585f77e00d2d8521a00000Q,GPU-d84a1024-9381-c725-3b85-dd7143e64c35,27.18,35,0,0,2018-11-08 08:27:10.606000,0d56a730076643d585f77e00d2d8521a00000Q,TotalRender,156,186,12,2018-11-08 08:27:54.895000
1,2018-11-08 08:27:11.089000,0d56a730076643d585f77e00d2d8521a00000Q,GPU-d84a1024-9381-c725-3b85-dd7143e64c35,27.18,35,0,0,2018-11-08 08:27:10.608000,0d56a730076643d585f77e00d2d8521a00000Q,Render,156,186,12,2018-11-08 08:27:53.796000
2,2018-11-08 08:27:10.949000,83ea61ac1ef54f27a3bf7bd0f41ecaa700000J,GPU-8792a29c-529e-1837-1806-c669cd9b1960,42.44,41,0,0,2018-11-08 08:27:10.839000,83ea61ac1ef54f27a3bf7bd0f41ecaa700000J,Uploading,200,23,12,2018-11-08 08:27:11.893000
3,2018-11-08 08:27:10.949000,83ea61ac1ef54f27a3bf7bd0f41ecaa700000J,GPU-8792a29c-529e-1837-1806-c669cd9b1960,42.44,41,0,0,2018-11-08 08:27:10.846000,83ea61ac1ef54f27a3bf7bd0f41ecaa700000J,Tiling,200,23,12,2018-11-08 08:27:11.882000
4,2018-11-08 08:27:10.760000,b9a1fa7ae2f74eb68f25f607980f97d700000H,GPU-38bbf3b6-80fb-7e6f-6678-ee45035507ab,42.21,37,0,0,2018-11-08 08:27:10.612000,b9a1fa7ae2f74eb68f25f607980f97d700000H,TotalRender,160,14,12,2018-11-08 08:27:56.265000


In [29]:
check_task_flat_df_r
gpu_df

df_merge = pd.merge(check_task_flat_df_r, gpu_df, on='hostname')

df_merge = df_merge.query('timestamp >= start_time and timestamp <= stop_time')

df_out = df_A.merge(df_merge, on=['start_date','end_date'], how='left').dropna()

df_out.head()

MemoryError: 

In [75]:
len(timestamp_df)

660400

In [76]:
# set keys as indexes for join 
    
#gpu_df.set_index('timestamp', inplace=True)
#check_task_df.set_index('timestamp', inplace=True)
    
# sort by index

#gpu_df.sort_index(inplace=True)
#check_task_df.sort_index(inplace=True)

gpu_df.sort_values(by=['timestamp'], inplace=True)
check_task_df.sort_values(by=['timestamp'], inplace=True)

# Make timestamp df

timestamp_df = check_task_df.copy()
timestamp_df.drop(['hostname', 'eventName', 'eventType', 'x', 'y', 'level'], axis=1, inplace= True)

timestamp_df = pd.merge_asof(timestamp_df,gpu_df,
                       left_on = ['timestamp'], right_on = ['timestamp'],
                       tolerance = pd.Timedelta('4ms'), direction = 'nearest')

In [77]:
len(timestamp_df)

660400

In [78]:
timestamp_df.head()

Unnamed: 0,timestamp,hostname,gpuUUID,powerDrawWatt,gpuTempC,gpuUtilPerc,gpuMemUtilPerc
0,2018-11-08 07:41:30.957,e7adc42d28814e518e9601ac2329c51300000G,GPU-4a960fa3-73dd-9878-0ccc-b8ce6637c113,26.15,28.0,0.0,0.0
1,2018-11-08 07:41:30.957,e7adc42d28814e518e9601ac2329c51300000G,GPU-4a960fa3-73dd-9878-0ccc-b8ce6637c113,26.15,28.0,0.0,0.0
2,2018-11-08 07:41:30.960,e7adc42d28814e518e9601ac2329c51300000T,GPU-21587c35-7734-7942-6642-c8b1e768d7cf,27.28,29.0,0.0,0.0
3,2018-11-08 07:41:30.960,e7adc42d28814e518e9601ac2329c51300000T,GPU-21587c35-7734-7942-6642-c8b1e768d7cf,27.28,29.0,0.0,0.0
4,2018-11-08 07:41:30.962,e7adc42d28814e518e9601ac2329c51300000T,GPU-21587c35-7734-7942-6642-c8b1e768d7cf,27.28,29.0,0.0,0.0


In [79]:
timestamp_df.isnull().sum()

timestamp             0
hostname          51464
gpuUUID           51464
powerDrawWatt     51464
gpuTempC          51464
gpuUtilPerc       51464
gpuMemUtilPerc    51464
dtype: int64

In [82]:
timestamp_df.dropna(inplace = True)

In [83]:
timestamp_df.isnull().sum()

timestamp         0
hostname          0
gpuUUID           0
powerDrawWatt     0
gpuTempC          0
gpuUtilPerc       0
gpuMemUtilPerc    0
dtype: int64

In [84]:
len(timestamp_df)

608936

In [85]:
check_task_gpu_df = pd.merge(timestamp_df, check_task_df, on = ['hostname', 'timestamp'])

In [88]:
check_task_gpu_df.isnull().sum()

timestamp         0
hostname          0
gpuUUID           0
powerDrawWatt     0
gpuTempC          0
gpuUtilPerc       0
gpuMemUtilPerc    0
eventName         0
eventType         0
x                 0
y                 0
level             0
dtype: int64

In [89]:
len(check_task_gpu_df)

1409

In [59]:
timestamp_df.head()

2018-11-08 07:41:30.957
2018-11-08 07:41:30.957
2018-11-08 07:41:30.960
2018-11-08 07:41:30.960
2018-11-08 07:41:30.962


In [60]:
check_task_df.head()

Unnamed: 0_level_0,hostname,eventName,eventType,x,y,level
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-11-08 07:41:30.957,95b4ae6d890e4c46986d91d7ac4bf08200000Q,TotalRender,START,128,5,12
2018-11-08 07:41:30.957,95b4ae6d890e4c46986d91d7ac4bf08200000Q,Saving Config,START,128,5,12
2018-11-08 07:41:30.960,95b4ae6d890e4c46986d91d7ac4bf08200000Q,Render,START,128,5,12
2018-11-08 07:41:30.960,95b4ae6d890e4c46986d91d7ac4bf08200000Q,Saving Config,STOP,128,5,12
2018-11-08 07:41:30.962,95b4ae6d890e4c46986d91d7ac4bf08200000D,TotalRender,START,128,9,12


In [61]:
check_task_gpu_df.head()

Unnamed: 0_level_0,hostname,gpuUUID,powerDrawWatt,gpuTempC,gpuUtilPerc,gpuMemUtilPerc,eventName,eventType,x,y,level
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-11-08 07:41:31.313,4ad946d4435c42dabb5073531ea4f315000003,GPU-ca5f003a-4b66-f1b9-8cad-7899f6555909,26.6,28,0,0,TotalRender,START,87,3,12
2018-11-08 07:41:31.313,4ad946d4435c42dabb5073531ea4f315000003,GPU-ca5f003a-4b66-f1b9-8cad-7899f6555909,26.6,28,0,0,Saving Config,START,87,3,12
2018-11-08 07:41:31.794,4c72fae95b9147189a0559269a6953ff00001A,GPU-612ee29b-3d29-c9eb-e679-f59934f76713,24.86,30,0,0,TotalRender,START,180,7,12
2018-11-08 07:41:31.794,4c72fae95b9147189a0559269a6953ff00001A,GPU-612ee29b-3d29-c9eb-e679-f59934f76713,24.86,30,0,0,Saving Config,START,180,7,12
2018-11-08 07:41:31.834,4c72fae95b9147189a0559269a6953ff00000A,GPU-feeb9f98-518f-fa06-12ab-d8df1b597944,25.94,27,0,0,Render,START,70,7,12


In [62]:
check_task_gpu_df.isnull().sum()

hostname          0
gpuUUID           0
powerDrawWatt     0
gpuTempC          0
gpuUtilPerc       0
gpuMemUtilPerc    0
eventName         0
eventType         0
x                 0
y                 0
level             0
dtype: int64

In [63]:
len(check_task_gpu_df)

337