In [1]:
import ray
import pandas as pd
import time 
import numpy as np
import binascii
import redis
import pprint
pp = pprint.PrettyPrinter() # for printing dicts and lists in a manner easy for the eyes
from misc import *


In [69]:
ray.init(num_cpus=3)

Waiting for redis server at 127.0.0.1:29955 to respond...
Waiting for redis server at 127.0.0.1:27574 to respond...
Starting local scheduler with 3 CPUs and 0 GPUs.


AssertionError: Perhaps you called ray.init twice by accident?

### Below, we define some example functions for populating the redis database

In [3]:
# Function definitions...

@ray.remote
def example(x):
    time.sleep(np.random.random())
    return np.random.randn()

@ray.remote
class TestCls():
    def __init__(self):
        self.g = 1
        
    def to_go(self, x):
        return x

    
@ray.remote
class Outer():
    def __init__(self):
        self.f = 1
        self.test = TestCls.remote()
    
    def to_go2(self, x):
        return x * 2
    
    def error(self):
        return 1/0

In [4]:
# Here, we generate data in redis for remote tasks
results = ray.get([example.remote(x) for x in range(4)])

# Generating data for Actor tasks
actor = TestCls.remote()
actor_results = ray.get([actor.to_go.remote(1)])

err_actor = Outer.remote()
err_actor.error.remote()

ObjectID(d1c679494caaaf7ad86fe68d7592a599890aee7e)

In [40]:
global_state = ray.global_state # Used to access redis client, but also has nice interface for certain information 
addr, port = ray.worker.global_worker.redis_address.split(":")
rc = redis.StrictRedis(host=addr, port=port, decode_responses=False)
# rc_non = global_state.redis_client # Redis Client for interacting with redis without decoding

From checking the values of the redis store, we can identify some key things included via `rc.keys()`:
 - Event log
 - worker info
 - Remote functions
 - "Functions to run"
 - Drivers
 - Redis clients
 - Actor classes
 - Actors

In [41]:
# for k in rc.keys():
#     try:
#         print(k)
#         pp.pprint(rc.hgetall(k))
# #         rc.hgetall(k) 
#     except Exception:
#         print(k, "Failed ")
# # Certain return values, such as `event_log:*`, can only be accessed via list calls to redis

#         try:
#             print('#' * 10)
#             print(k)
#             pp.pprint(rc.lrange(k, 0, -1))
#         except Exception:
#             print(k, "Failed ")
            

## Actor Info


# TODO: Fix actor_id

In [42]:
# We can populate a DataFrame with Actor 
actor_list = []
for a_key in rc.keys("Actor:*"):
    v = rc.hgetall(a_key)
    v['actor_id'] = a_key
    v['class_id'] = hex_identifier(v['class_id'])
    actor_list.append(v)

actor_df = pd.DataFrame(actor_list)

KeyError: 'class_id'

In [43]:
actor_df

NameError: name 'actor_df' is not defined

## Actor Classes Info

In [44]:
actor_classes = []
for a_key in rc.keys("ActorClass:*"):
    
    v = rc.hgetall(a_key)
    del v['class'] # removed pickle hex for readability
    v['driver_id'] = hex_identifier(v['driver_id'])
    class_id = a_key.split(b':')[1]
    v['class_id'] = hex_identifier(class_id)
    actor_classes.append(v)
    
actor_class_df = pd.DataFrame(actor_classes)

KeyError: 'class'

In [45]:
actor_class_df

NameError: name 'actor_class_df' is not defined

## Remote Functions Info


In [46]:
fn_table = global_state.function_table()
fn_list = []
for fn_id in fn_table:
    val = fn_table[fn_id]
    val["function_id"] = fn_id
    fn_list.append(val)
pd.DataFrame(fn_list)

Unnamed: 0,DriverID,Module,Name,function_id
0,88ea2c0fb70be90b496dd544aa50185051242f8e,__main__,__main__.example,cf1fc498b8a2b4cb79bc2ed5abf074ba96ae69a1
1,88ea2c0fb70be90b496dd544aa50185051242f8e,__main__,__main__.f,3feec2358f93c5e184bbe954fb81a5a130c298ad


## Node Info

In [70]:
# Using the global state API, we can populate a DataFrame with a list of Redis Clients currently connected
ctable = global_state.client_table()

client_list = []
for node_ip in ctable:
    for client in ctable[node_ip]:
        client["node_ip_address"] = node_ip
        client_list.append(client)

client_df = pd.DataFrame(client_list)
client_df

Unnamed: 0,AuxAddress,ClientType,DBClientID,Deleted,LocalSchedulerSocketName,NumCPUs,NumGPUs,node_ip_address
0,,global_scheduler,baf95e0b640255904566afdccdc86284a7b9ef8d,False,,,,127.0.0.1
1,,plasma_manager,9b252fdbc0750026f8afa7564cab707ed4f2da96,False,,,,127.0.0.1
2,127.0.0.1:43164,local_scheduler,0f0d95ef6207755fb9fac1e0d1fd480636eace3f,False,/tmp/scheduler58147313,3.0,0.0,127.0.0.1


In [48]:
# We can populate a DataFrame with a list of objects in the object store
object_dict = {oid.hex(): v for oid, v in global_state.object_table().items()}
object_df = pd.DataFrame(object_dict).transpose()
object_df
# May need to change this so ID is not index

Unnamed: 0,IsPut,ManagerIDs,TaskID
21fcf4100bacb850d98bc0fbf6d9969a07b6f02a,False,[9b252fdbc0750026f8afa7564cab707ed4f2da96],20fcf4100bacb850d98bc0fbf6d9969a07b6f02a
29eaaf2046ea8d43ec105f3bc86c378d2be0db6e,True,[9b252fdbc0750026f8afa7564cab707ed4f2da96],d61550dfb91572bcec105f3bc86c378d2be0db6e
363138810f9b396ee35fbdb99bcae067cfdcc403,False,[9b252fdbc0750026f8afa7564cab707ed4f2da96],373138810f9b396ee35fbdb99bcae067cfdcc403
5f6a6dbc6e97d54cbc630cb84f77a18b19c3398c,False,[9b252fdbc0750026f8afa7564cab707ed4f2da96],5e6a6dbc6e97d54cbc630cb84f77a18b19c3398c
6da84a708ba837a5df0db1fb873a2d8bb28641df,False,[9b252fdbc0750026f8afa7564cab707ed4f2da96],6ca84a708ba837a5df0db1fb873a2d8bb28641df
76a04fcd02c077c937b0e89c48db6aae40e40f8e,False,[9b252fdbc0750026f8afa7564cab707ed4f2da96],77a04fcd02c077c937b0e89c48db6aae40e40f8e
7feff3ca891833ad1c8aeb53bbb834458c92c8ef,False,[9b252fdbc0750026f8afa7564cab707ed4f2da96],7eeff3ca891833ad1c8aeb53bbb834458c92c8ef
a4c67496deda1dc8a86ec2a92eda68343c50e90b,False,[9b252fdbc0750026f8afa7564cab707ed4f2da96],a5c67496deda1dc8a86ec2a92eda68343c50e90b
ce7d34e4facd99629f87cd1abe9ea6de2644b0dd,False,[9b252fdbc0750026f8afa7564cab707ed4f2da96],cf7d34e4facd99629f87cd1abe9ea6de2644b0dd
d1c679494caaaf7ad86fe68d7592a599890aee7e,False,[9b252fdbc0750026f8afa7564cab707ed4f2da96],d0c679494caaaf7ad86fe68d7592a599890aee7e


## Task Info

In [49]:
from pandas.io.json import json_normalize

tt = global_state.task_table()
tt_list = list(tt.values())

for d in tt_list:
    d['TaskSpec']['ReturnObjectIDs'] = [oid.hex() for oid in d['TaskSpec']['ReturnObjectIDs']]

task_df = json_normalize(tt_list)

task_df

Unnamed: 0,LocalSchedulerID,State,TaskSpec.ActorCounter,TaskSpec.ActorID,TaskSpec.Args,TaskSpec.DriverID,TaskSpec.FunctionID,TaskSpec.ParentCounter,TaskSpec.ParentTaskID,TaskSpec.RequiredResources.CPUs,TaskSpec.RequiredResources.GPUs,TaskSpec.ReturnObjectIDs,TaskSpec.TaskID
0,0f0d95ef6207755fb9fac1e0d1fd480636eace3f,16,4,ffffffffffffffffffffffffffffffffffffffff,[],88ea2c0fb70be90b496dd544aa50185051242f8e,3feec2358f93c5e184bbe954fb81a5a130c298ad,8,d61550dfb91572bcec105f3bc86c378d2be0db6e,1.0,0.0,[ce7d34e4facd99629f87cd1abe9ea6de2644b0dd],cf7d34e4facd99629f87cd1abe9ea6de2644b0dd
1,ffffffffffffffffffffffffffffffffffffffff,8,0,ffffffffffffffffffffffffffffffffffffffff,[],88ea2c0fb70be90b496dd544aa50185051242f8e,ffffffffffffffffffffffffffffffffffffffff,0,736cf1fff882c01d4f78577d3c75d6d6e50856b1,0.0,0.0,[],d61550dfb91572bcec105f3bc86c378d2be0db6e
2,0f0d95ef6207755fb9fac1e0d1fd480636eace3f,16,3,ffffffffffffffffffffffffffffffffffffffff,[3],88ea2c0fb70be90b496dd544aa50185051242f8e,cf1fc498b8a2b4cb79bc2ed5abf074ba96ae69a1,3,d61550dfb91572bcec105f3bc86c378d2be0db6e,1.0,0.0,[21fcf4100bacb850d98bc0fbf6d9969a07b6f02a],20fcf4100bacb850d98bc0fbf6d9969a07b6f02a
3,0f0d95ef6207755fb9fac1e0d1fd480636eace3f,16,0,627c5216eef315e715c7143e08895e62f3a99e58,[],88ea2c0fb70be90b496dd544aa50185051242f8e,1939f247b3ae06244a8d50fa7dabbcc975c6fc4c,4,d61550dfb91572bcec105f3bc86c378d2be0db6e,1.0,0.0,[5f6a6dbc6e97d54cbc630cb84f77a18b19c3398c],5e6a6dbc6e97d54cbc630cb84f77a18b19c3398c
4,0f0d95ef6207755fb9fac1e0d1fd480636eace3f,16,1,ffffffffffffffffffffffffffffffffffffffff,[1],88ea2c0fb70be90b496dd544aa50185051242f8e,cf1fc498b8a2b4cb79bc2ed5abf074ba96ae69a1,1,d61550dfb91572bcec105f3bc86c378d2be0db6e,1.0,0.0,[6da84a708ba837a5df0db1fb873a2d8bb28641df],6ca84a708ba837a5df0db1fb873a2d8bb28641df
5,0f0d95ef6207755fb9fac1e0d1fd480636eace3f,16,1,627c5216eef315e715c7143e08895e62f3a99e58,[1],88ea2c0fb70be90b496dd544aa50185051242f8e,7916fc26bb3c69a23ba4e6714b02a2c07e8542f2,5,d61550dfb91572bcec105f3bc86c378d2be0db6e,1.0,0.0,[a4c67496deda1dc8a86ec2a92eda68343c50e90b],a5c67496deda1dc8a86ec2a92eda68343c50e90b
6,0f0d95ef6207755fb9fac1e0d1fd480636eace3f,16,1,b967fd1ad547ef25e7778298c1639f87837b2cf8,[],88ea2c0fb70be90b496dd544aa50185051242f8e,11f9578d05e6f7bb58a3cdd00107e9f4e3882671,7,d61550dfb91572bcec105f3bc86c378d2be0db6e,1.0,0.0,[d1c679494caaaf7ad86fe68d7592a599890aee7e],d0c679494caaaf7ad86fe68d7592a599890aee7e
7,0f0d95ef6207755fb9fac1e0d1fd480636eace3f,16,0,ffffffffffffffffffffffffffffffffffffffff,[0],88ea2c0fb70be90b496dd544aa50185051242f8e,cf1fc498b8a2b4cb79bc2ed5abf074ba96ae69a1,0,d61550dfb91572bcec105f3bc86c378d2be0db6e,1.0,0.0,[363138810f9b396ee35fbdb99bcae067cfdcc403],373138810f9b396ee35fbdb99bcae067cfdcc403
8,0f0d95ef6207755fb9fac1e0d1fd480636eace3f,16,0,b967fd1ad547ef25e7778298c1639f87837b2cf8,[],88ea2c0fb70be90b496dd544aa50185051242f8e,1939f247b3ae06244a8d50fa7dabbcc975c6fc4c,6,d61550dfb91572bcec105f3bc86c378d2be0db6e,1.0,0.0,[e9b2fdcc3204e48d662a66478cd913819c402404],e8b2fdcc3204e48d662a66478cd913819c402404
9,0f0d95ef6207755fb9fac1e0d1fd480636eace3f,16,2,ffffffffffffffffffffffffffffffffffffffff,[2],88ea2c0fb70be90b496dd544aa50185051242f8e,cf1fc498b8a2b4cb79bc2ed5abf074ba96ae69a1,2,d61550dfb91572bcec105f3bc86c378d2be0db6e,1.0,0.0,[76a04fcd02c077c937b0e89c48db6aae40e40f8e],77a04fcd02c077c937b0e89c48db6aae40e40f8e


We can also parse the event logs in order to get fine grained timing for remote tasks. However, as a user, I'd probably only care about time taken in running the task -- this can be much refined.

In [56]:
import json 
event_list = []

# Get and decode all task timing/event logs
for key in rc.keys("event_log*"):
    content = rc.lrange(key, 0, -1)
    event_list.append(json.loads(content[0])) 


In [51]:
from collections import defaultdict

# This seems to be the best way to do the event_log -> dataframe pipeline. 
# First generate a (key, [value]) mapping for all tasks and then apply some pandas operations to convert.

# event_dict is used to store timing info
event_dict = defaultdict(lambda: np.full(len(event_list), np.nan))

# info_dict is used to store meta data - such as function names and task id
info_dict = defaultdict(lambda: [None] * len(event_list))

for i, task_event in enumerate(event_list):
    for event in (task_event):
        time, label, startstop, info = event
        event_dict[(label, startstop)][i] = time
        if info:
            for k in info:
                info_dict[k][i] = info[k]

## Event Timing

In [52]:
# The tuple keys for `event_dict` produce a hierarchical index, which could be useful. However, joining or merging it 
# with other non-hierarchical indices will throw away this structure.

edf = pd.DataFrame(event_dict) 
edf.rename(columns={1: 'start', 2:'end'}, inplace=True)
edf

Unnamed: 0_level_0,ray:acquire_lock,ray:acquire_lock,ray:get_task,ray:get_task,ray:import_remote_function,ray:import_remote_function,ray:submit_task,ray:submit_task,ray:task,ray:task,ray:task:execute,ray:task:execute,ray:task:get_arguments,ray:task:get_arguments,ray:task:store_outputs,ray:task:store_outputs,ray:wait_for_function,ray:wait_for_function
Unnamed: 0_level_1,start,end,start,end,start,end,start,end,start,end,start,end,start,end,start,end,start,end
0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,,,,,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,,,1497487000.0,1497487000.0
1,1497487000.0,1497487000.0,1497487000.0,1497487000.0,,,,,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0
2,1497488000.0,1497488000.0,1497487000.0,1497488000.0,1497488000.0,1497488000.0,,,1497488000.0,1497488000.0,1497488000.0,1497488000.0,1497488000.0,1497488000.0,1497488000.0,1497488000.0,1497488000.0,1497488000.0
3,1497487000.0,1497487000.0,1497487000.0,1497487000.0,,,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0
4,1497487000.0,1497487000.0,1497487000.0,1497487000.0,,,,,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0
5,1497487000.0,1497487000.0,1497487000.0,1497487000.0,,,,,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0
6,1497487000.0,1497487000.0,1497487000.0,1497487000.0,,,,,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0
7,1497487000.0,1497487000.0,1497487000.0,1497487000.0,,,,,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0
8,1497487000.0,1497487000.0,1497487000.0,1497487000.0,,,,,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0
9,1497487000.0,1497487000.0,1497487000.0,1497487000.0,,,,,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0,1497487000.0


## Event Info

In [53]:
idf = pd.DataFrame(info_dict)
# idf.columns = pd.MultiIndex.from_tuples([(c, '') for c in idf]) # this is non-idempotent!
idf

Unnamed: 0,function_name,task_id,traceback,type,value,worker_id
0,error,d0c679494caaaf7ad86fe68d7592a599890aee7e,"Traceback (most recent call last):\n File ""/U...",<class 'ZeroDivisionError'>,division by zero,2b3b586cc0fefa78dae19ccfa97cc049c2b971c9
1,__init__,5e6a6dbc6e97d54cbc630cb84f77a18b19c3398c,,,,928f3c725d5ec9aae986ad4e983106cd687b0f99
2,__main__.f,cf7d34e4facd99629f87cd1abe9ea6de2644b0dd,,,,000ef36bd1f6c04ea29bef9be966166217269aa4
3,__init__,e8b2fdcc3204e48d662a66478cd913819c402404,,,,2b3b586cc0fefa78dae19ccfa97cc049c2b971c9
4,to_go,a5c67496deda1dc8a86ec2a92eda68343c50e90b,,,,928f3c725d5ec9aae986ad4e983106cd687b0f99
5,__main__.example,20fcf4100bacb850d98bc0fbf6d9969a07b6f02a,,,,000ef36bd1f6c04ea29bef9be966166217269aa4
6,__main__.example,373138810f9b396ee35fbdb99bcae067cfdcc403,,,,000ef36bd1f6c04ea29bef9be966166217269aa4
7,__main__.example,77a04fcd02c077c937b0e89c48db6aae40e40f8e,,,,532f6ebeb3cde15e3a4ddae13abe9b93539df9e8
8,__main__.example,6ca84a708ba837a5df0db1fb873a2d8bb28641df,,,,3ce6ba538ae2cfabb88d094fa1455dff316cc449
9,__init__,7eeff3ca891833ad1c8aeb53bbb834458c92c8ef,,,,bd05f8b2efe98d5e3c18da44992d44dc734e99bb


In [54]:
remote_table = pd.concat([idf, edf], axis=1)

In [55]:
# Example to get Error, function_name, IP address, Actor ID, ParentID

errors = idf[idf.traceback.notnull()]
error_task = errors.merge(task_df, left_on="task_id", right_on="TaskSpec.TaskID")
err_task_loc = error_task.merge(client_df, left_on="LocalSchedulerID", right_on="DBClientID")
err_task_loc[['function_name', 'traceback', 'value', 'TaskSpec.ActorID',
              'TaskSpec.ParentTaskID',  'node_ip_address',  'task_id',]]

Unnamed: 0,function_name,traceback,value,TaskSpec.ActorID,TaskSpec.ParentTaskID,node_ip_address,task_id
0,error,"Traceback (most recent call last):\n File ""/U...",division by zero,b967fd1ad547ef25e7778298c1639f87837b2cf8,d61550dfb91572bcec105f3bc86c378d2be0db6e,127.0.0.1,d0c679494caaaf7ad86fe68d7592a599890aee7e


In [67]:
object_table = ray.global_state.object_table()
location_to_objects = dict()

for object_id, object_descriptor in object_table.items():
        for location in object_descriptor["ManagerIDs"]:
            if location not in location_to_objects:
                location_to_objects[location] = []
            location_to_objects[location].append(object_id)
print(location_to_objects)
table = pd.DataFrame.from_dict(location_to_objects)
table

{'9b252fdbc0750026f8afa7564cab707ed4f2da96': [ObjectID(2ceaaf2046ea8d43ec105f3bc86c378d2be0db6e), ObjectID(328a0a1892dd6e7e7971a46fc076546c889c7e27), ObjectID(883bda79483f5aaa6d45cb7fb97ba4aaa687717b), ObjectID(21fcf4100bacb850d98bc0fbf6d9969a07b6f02a), ObjectID(a4c67496deda1dc8a86ec2a92eda68343c50e90b), ObjectID(29eaaf2046ea8d43ec105f3bc86c378d2be0db6e), ObjectID(7feff3ca891833ad1c8aeb53bbb834458c92c8ef), ObjectID(2feaaf2046ea8d43ec105f3bc86c378d2be0db6e), ObjectID(d1c679494caaaf7ad86fe68d7592a599890aee7e), ObjectID(2aeaaf2046ea8d43ec105f3bc86c378d2be0db6e), ObjectID(ce7d34e4facd99629f87cd1abe9ea6de2644b0dd), ObjectID(61a19af79b41e3fca9748669d0ef8705e3aba673), ObjectID(e9b2fdcc3204e48d662a66478cd913819c402404), ObjectID(23eaaf2046ea8d43ec105f3bc86c378d2be0db6e), ObjectID(6b4464b294ac41c0c0740bbb8cbd78bd7b0c7292), ObjectID(28eaaf2046ea8d43ec105f3bc86c378d2be0db6e), ObjectID(2eeaaf2046ea8d43ec105f3bc86c378d2be0db6e), ObjectID(76a04fcd02c077c937b0e89c48db6aae40e40f8e), ObjectID(2deaaf204

Unnamed: 0,9b252fdbc0750026f8afa7564cab707ed4f2da96
0,ObjectID(2ceaaf2046ea8d43ec105f3bc86c378d2be0d...
1,ObjectID(328a0a1892dd6e7e7971a46fc076546c889c7...
2,ObjectID(883bda79483f5aaa6d45cb7fb97ba4aaa6877...
3,ObjectID(21fcf4100bacb850d98bc0fbf6d9969a07b6f...
4,ObjectID(a4c67496deda1dc8a86ec2a92eda68343c50e...
5,ObjectID(29eaaf2046ea8d43ec105f3bc86c378d2be0d...
6,ObjectID(7feff3ca891833ad1c8aeb53bbb834458c92c...
7,ObjectID(2feaaf2046ea8d43ec105f3bc86c378d2be0d...
8,ObjectID(d1c679494caaaf7ad86fe68d7592a599890ae...
9,ObjectID(2aeaaf2046ea8d43ec105f3bc86c378d2be0d...


# TODO:

- Error messages logging (specifics - which node, which function call, which actor, what time)
  - Getting Error messages from Redis is redundant because we already get info in the event_log. Error messages provide extra information such as `error_id` and `type`, which don't seem particularly useful.
  - ~Create table for workers (Id, socket info, node IP address) ... are workers even still a proper abstraction~
  - Get multinode setting - test out client table
  - !! Write out an example for error tracing