In [2]:
import ray
import pandas as pd
import time 
import numpy as np
import binascii
import redis
import pprint
pp = pprint.PrettyPrinter() # for printing dicts and lists in a manner easy for the eyes



In [3]:
# Number of raw bytes per identifier; key_to_hex_identifiers() slices this many
# bytes out of a key for each of worker_id and task_id.
IDENTIFIER_LENGTH = 20

# NOTE(review): the comment here used to read "This prefix must match the value
# defined in ray_redis_module.cc", but no prefix constant is defined in this cell —
# presumably it was removed; confirm whether one is still needed.

def hex_identifier(identifier):
    """Return the lowercase hex string for a bytes-like identifier."""
    hex_bytes = binascii.b2a_hex(identifier)
    return hex_bytes.decode()


def identifier(hex_identifier):
    """Return the raw bytes encoded by a hex string (inverse of hex_identifier)."""
    raw_bytes = binascii.a2b_hex(hex_identifier)
    return raw_bytes

def key_to_hex_identifiers(key):
    """Extract (worker_id, task_id) as hex strings from a bytes key.

    The key is expected to look like ``prefix:worker_id:task_id``, where both
    IDs are IDENTIFIER_LENGTH raw bytes and are separated by a single byte.
    """
    # worker_id starts right after the first b":" in the key.
    worker_start = key.index(b":") + 1
    worker_end = worker_start + IDENTIFIER_LENGTH
    # Skip the one-byte separator that follows worker_id.
    task_start = worker_end + 1
    task_end = task_start + IDENTIFIER_LENGTH
    return (hex_identifier(key[worker_start:worker_end]),
            hex_identifier(key[task_start:task_end]))

import re
def clean(sometext):
    """Decode UTF-8 bytes and strip escape sequences from the text.

    Every span that starts with ESC (0x1b) and runs up to and including the
    next ``m`` is removed.
    """
    decoded = sometext.decode('UTF-8')
    return re.sub(r'\x1b[^m]*m', '', decoded)

In [4]:
# Start Ray on this machine with 3 CPUs. The returned dict (shown below) includes
# the `redis_address` that the later cells connect to.
ray.init(num_cpus=3)

Waiting for redis server at 127.0.0.1:37640 to respond...
Waiting for redis server at 127.0.0.1:55376 to respond...
Starting local scheduler with 3 CPUs and 0 GPUs.


{'local_scheduler_socket_names': ['/tmp/scheduler28043041'],
 'node_ip_address': '127.0.0.1',
 'object_store_addresses': [ObjectStoreAddress(name='/tmp/plasma_store82189833', manager_name='/tmp/plasma_manager67976233', manager_port=41335)],
 'redis_address': '127.0.0.1:37640'}

In [5]:
# Function definitions...

@ray.remote
def example(x):
    """Remote task: sleep for a random duration, then return a random normal sample.

    The argument ``x`` is accepted but not used.
    """
    delay = np.random.random()
    time.sleep(delay)
    return np.random.randn()

@ray.remote
class TestCls:
    """Minimal actor used to generate actor-task entries."""

    def __init__(self):
        self.g = 1

    def to_go(self, x):
        """Return ``x`` unchanged."""
        return x

    
@ray.remote
class Outer:
    """Actor that creates a nested TestCls actor and has a method that always fails."""

    def __init__(self):
        self.f = 1
        # Creating an actor inside an actor: TestCls is instantiated from Outer.
        self.test = TestCls.remote()

    def to_go2(self, x):
        """Return ``x`` multiplied by 2."""
        return x * 2

    def error(self):
        """Always raise ZeroDivisionError."""
        return 1 / 0

In [6]:
# Here, we generate data in redis for remote tasks:
# four `example` tasks are submitted and their results fetched.
results = ray.get([example.remote(x) for x in range(4)])

# Generating data for Actor tasks: one TestCls actor, one method call on it.
actor = TestCls.remote()
actor_results = ray.get([actor.to_go.remote(1)])

# Deliberately produce a failed task: Outer.error divides by zero.
# The call is not passed to ray.get(), so the cell displays the returned
# ObjectID and the failure is reported in the output below.
err_actor = Outer.remote()
err_actor.error.remote()

ObjectID(b52caffedfbeda916ce70d1cb94a20f0ad0f2df3)

Remote function error failed with:

Traceback (most recent call last):
  File "/Users/rliaw/.local/lib/python3.5/site-packages/ray-0.1.1-py3.5-macosx-10.7-x86_64.egg/ray/worker.py", line 1735, in process_task
    worker.actors[task.actor_id().id()], *arguments)
  File "<ipython-input-5-7765713b18c9>", line 27, in error
ZeroDivisionError: division by zero


You can inspect errors by running

    ray.error_info()

If this driver is hanging, start a new one with

    ray.init(redis_address="127.0.0.1:37640")



In [7]:
# Ray's global state object: provides a Redis client plus the table accessors
# (function_table, client_table, object_table, task_table) used in later cells.
global_state = ray.worker.global_state
# Global worker; used here only for its `redis_address`.
gworker = ray.worker.global_worker
# Redis client for interacting with redis without decoding.
rc_non = global_state.redis_client
addr, port = gworker.redis_address.split(":")
# Second client, connected to gworker.redis_address, created with decode_responses=True.
# NOTE(review): some keys and hash values read later (class_id, driver_id) are raw
# binary IDs, not text, so decoding them is not guaranteed to succeed — prefer
# `rc_non` for anything that contains IDs.
rc = redis.StrictRedis(host=addr, port=port, decode_responses=True)

Key things included in redis from `rc.keys()`:
 - Event log
 - worker info
 - Remote functions
 - "Functions to run"
 - Drivers
 - Redis clients
 - Actor classes
 - Actors

In [8]:

# for k in rc.keys():
#     try:
#         print(k)
#         pp.pprint(rc.hgetall(k))
# #         rc.hgetall(k) 
#     except Exception:
#         print(k, "Failed ")
# # Certain return values, such as `event_log:*`, can only be accessed via list calls to redis

#         try:
#             print('#' * 10)
#             print(k)
#             pp.pprint(rc.lrange(k, 0, -1))
#         except Exception:
#             print(k, "Failed ")
            

## Actor Info

**TODO:** Fix `actor_id` — it is currently the full Redis key (`Actor:b'...'`) rather than a hex identifier like `class_id`.

In [41]:
# Build one row per `Actor:*` hash in Redis.
#
# The hash is read through the non-decoding client (`rc_non`) and every field is
# decoded explicitly. `class_id` is a raw binary ID, not text: reading it through
# the `decode_responses=True` client makes the result (str, bytes, or a
# UnicodeDecodeError) depend on the reply parser and on whether the bytes happen
# to be valid UTF-8, and hex_identifier() only accepts bytes.
actor_list = []
for a_key in rc_non.keys("Actor:*"):
    row = {}
    for field, value in rc_non.hgetall(a_key).items():
        name = field.decode()
        if name == 'class_id':
            # Raw ID bytes -> hex string.
            row[name] = hex_identifier(value)
        else:
            # Text field; never fail on unexpected bytes, escape them instead.
            row[name] = value.decode(errors='backslashreplace')
    # TODO: actor_id is still the whole Redis key, not a hex identifier.
    row['actor_id'] = a_key.decode(errors='backslashreplace')
    actor_list.append(row)

actor_df = pd.DataFrame(actor_list)

In [42]:
# Display the actor table: one row per `Actor:*` key.
actor_df

Unnamed: 0,actor_id,class_id,num_gpus
0,Actor:b'\x15}0\xef\xfd\xb7\x8de\xe6g\x96\xf1\x...,2b94228a597a90690a98967df6710f3ba49e75a1,0
1,Actor:b'\x0c\x97\xf1\xcd\xed\x88b>\x1a\xc6\xb4...,2b94228a597a90690a98967df6710f3ba49e75a1,0
2,Actor:b'r{w\x92\xeb`\xac\xf3\xf8\xa16\x1b\xe0\...,de5f3efcf1ff121fe58c795f8d6c3171ab0145c3,0


In [43]:
# Build one row per `ActorClass:*` hash in Redis.
#
# Keys and several values here are raw binary (the class ID in the key, the
# `driver_id` value, the pickled `class` value), so everything is read through
# the non-decoding client (`rc_non`) and decoded explicitly.
actor_classes = []
for a_key in rc_non.keys("ActorClass:*"):
    row = {}
    for field, value in rc_non.hgetall(a_key).items():
        name = field.decode()
        if name == 'class':
            # Pickled class definition: left out for readability.
            continue
        if name == 'driver_id':
            # Raw ID bytes -> hex string.
            row[name] = hex_identifier(value)
        else:
            # Text field; escape unexpected bytes instead of failing.
            row[name] = value.decode(errors='backslashreplace')
    # Split only on the first b":" — the binary class ID may itself contain
    # a 0x3a byte, which an unbounded split would cut the ID at.
    class_id = a_key.split(b':', 1)[1]
    row['class_id'] = hex_identifier(class_id)
    actor_classes.append(row)

actor_class_df = pd.DataFrame(actor_classes)

In [44]:
# Display the actor-class table: one row per `ActorClass:*` key.
actor_class_df

Unnamed: 0,actor_method_names,class_id,class_name,driver_id,module
0,"[""__ray_terminate__"", ""to_go2"", ""__init__"", ""e...",de5f3efcf1ff121fe58c795f8d6c3171ab0145c3,Class,4cced4a983a1db28093902781a350cbc87dbdd3c,ray.actor
1,"[""__ray_terminate__"", ""__init__"", ""to_go""]",2b94228a597a90690a98967df6710f3ba49e75a1,Class,4cced4a983a1db28093902781a350cbc87dbdd3c,ray.actor


## Remote Functions Info

In [13]:
# Flatten the function table into a list of records, adding each entry's
# function ID to its record so that it shows up as a column.
fn_table = global_state.function_table()
fn_list = []
for fn_id, val in fn_table.items():
    val["function_id"] = fn_id
    fn_list.append(val)
pd.DataFrame(fn_list)

Unnamed: 0,DriverID,Module,Name,function_id
0,4cced4a983a1db28093902781a350cbc87dbdd3c,__main__,__main__.example,cf1fc498b8a2b4cb79bc2ed5abf074ba96ae69a1


## Node Info

In [14]:
# The client table maps a node IP address to the clients on that node.
ctable = global_state.client_table()

# Tag every client record with its node IP, then collect all records in one list.
client_list = []
for node_ip, node_clients in ctable.items():
    for client in node_clients:
        client["node_ip_address"] = node_ip
        client_list.append(client)

client_df = pd.DataFrame(client_list)
client_df

Unnamed: 0,AuxAddress,ClientType,DBClientID,Deleted,LocalSchedulerSocketName,NumCPUs,NumGPUs,node_ip_address
0,,global_scheduler,b027767517536fffe594c960fb6be1c8b1fb7ac1,False,,,,127.0.0.1
1,,plasma_manager,cb08ea5abba62531352503caaddfbc6e684d5eea,False,,,,127.0.0.1
2,127.0.0.1:41335,local_scheduler,8be080d1669a8ebb4d2ba00be74a700af726fa8c,False,/tmp/scheduler28043041,3.0,0.0,127.0.0.1


In [15]:
# Key the object table by hex object ID, then flip the frame so that
# each object is a row (the hex ID becomes the index).
object_dict = {
    oid.hex(): entry
    for oid, entry in global_state.object_table().items()
}
object_df = pd.DataFrame(object_dict).T
object_df
# May need to change this so the ID is a regular column instead of the index

Unnamed: 0,IsPut,ManagerIDs,TaskID
1b985e2029f61fc87a27435b8a0b4af31cc72497,False,[cb08ea5abba62531352503caaddfbc6e684d5eea],1a985e2029f61fc87a27435b8a0b4af31cc72497
3396b98523df3487ed2f42ce6ccc6e649b13f175,False,[cb08ea5abba62531352503caaddfbc6e684d5eea],3296b98523df3487ed2f42ce6ccc6e649b13f175
5520a0c666de001163019018e32fe8eede869c3b,False,[cb08ea5abba62531352503caaddfbc6e684d5eea],5420a0c666de001163019018e32fe8eede869c3b
6fb682c27d3d5366a3e4fe259052f5a149a32e8a,False,[cb08ea5abba62531352503caaddfbc6e684d5eea],6eb682c27d3d5366a3e4fe259052f5a149a32e8a
8dd1a4f9cbda6a7d865c74141383232a67c86c8f,False,[cb08ea5abba62531352503caaddfbc6e684d5eea],8cd1a4f9cbda6a7d865c74141383232a67c86c8f
9b135aaaa66d21bf62c802fa120512f24478b227,False,[cb08ea5abba62531352503caaddfbc6e684d5eea],9a135aaaa66d21bf62c802fa120512f24478b227
b52caffedfbeda916ce70d1cb94a20f0ad0f2df3,False,[cb08ea5abba62531352503caaddfbc6e684d5eea],b42caffedfbeda916ce70d1cb94a20f0ad0f2df3
b8f30131d06ae2e546d2f777169960d24c63712a,False,[cb08ea5abba62531352503caaddfbc6e684d5eea],b9f30131d06ae2e546d2f777169960d24c63712a
e024027d4dca1014dd25bafaa0945eafed4d9464,False,[cb08ea5abba62531352503caaddfbc6e684d5eea],e124027d4dca1014dd25bafaa0945eafed4d9464


# Tasks

In [38]:
# Fetch the task table and keep only its values (one dict per task).
tt = global_state.task_table()
tt_list = list(tt.values())

# Replace each task's return object IDs with their hex strings, in place.
for d in tt_list:
    task_spec = d['TaskSpec']
    task_spec['ReturnObjectIDs'] = [oid.hex() for oid in task_spec['ReturnObjectIDs']]

In [39]:
# `json_normalize` is available as `pd.json_normalize` from pandas 1.0 on, where
# the `pandas.io.json` import location is deprecated; the old location is kept
# only as a fallback for older pandas versions.
try:
    json_normalize = pd.json_normalize
except AttributeError:
    from pandas.io.json import json_normalize

# Flatten the nested task dicts: nested keys become dotted column names
# such as `TaskSpec.TaskID`.
task_df = json_normalize(tt_list)

task_df

Unnamed: 0,LocalSchedulerID,State,TaskSpec.ActorCounter,TaskSpec.ActorID,TaskSpec.Args,TaskSpec.DriverID,TaskSpec.FunctionID,TaskSpec.ParentCounter,TaskSpec.ParentTaskID,TaskSpec.RequiredResources.CPUs,TaskSpec.RequiredResources.GPUs,TaskSpec.ReturnObjectIDs,TaskSpec.TaskID
0,8be080d1669a8ebb4d2ba00be74a700af726fa8c,16,3,ffffffffffffffffffffffffffffffffffffffff,[3],4cced4a983a1db28093902781a350cbc87dbdd3c,cf1fc498b8a2b4cb79bc2ed5abf074ba96ae69a1,3,b15397c2c5d37a8454d18a6dcbb3a96081ce6dcf,1.0,0.0,[9b135aaaa66d21bf62c802fa120512f24478b227],9a135aaaa66d21bf62c802fa120512f24478b227
1,8be080d1669a8ebb4d2ba00be74a700af726fa8c,16,0,727b7792eb60acf3f8a1361be0f54babb5419413,[],4cced4a983a1db28093902781a350cbc87dbdd3c,1939f247b3ae06244a8d50fa7dabbcc975c6fc4c,6,b15397c2c5d37a8454d18a6dcbb3a96081ce6dcf,1.0,0.0,[8dd1a4f9cbda6a7d865c74141383232a67c86c8f],8cd1a4f9cbda6a7d865c74141383232a67c86c8f
2,8be080d1669a8ebb4d2ba00be74a700af726fa8c,16,1,0c97f1cded88623e1ac6b4dfe6f159ea77cc7814,[1],4cced4a983a1db28093902781a350cbc87dbdd3c,7916fc26bb3c69a23ba4e6714b02a2c07e8542f2,5,b15397c2c5d37a8454d18a6dcbb3a96081ce6dcf,1.0,0.0,[1b985e2029f61fc87a27435b8a0b4af31cc72497],1a985e2029f61fc87a27435b8a0b4af31cc72497
3,8be080d1669a8ebb4d2ba00be74a700af726fa8c,16,2,ffffffffffffffffffffffffffffffffffffffff,[2],4cced4a983a1db28093902781a350cbc87dbdd3c,cf1fc498b8a2b4cb79bc2ed5abf074ba96ae69a1,2,b15397c2c5d37a8454d18a6dcbb3a96081ce6dcf,1.0,0.0,[3396b98523df3487ed2f42ce6ccc6e649b13f175],3296b98523df3487ed2f42ce6ccc6e649b13f175
4,8be080d1669a8ebb4d2ba00be74a700af726fa8c,16,1,727b7792eb60acf3f8a1361be0f54babb5419413,[],4cced4a983a1db28093902781a350cbc87dbdd3c,11f9578d05e6f7bb58a3cdd00107e9f4e3882671,7,b15397c2c5d37a8454d18a6dcbb3a96081ce6dcf,1.0,0.0,[b52caffedfbeda916ce70d1cb94a20f0ad0f2df3],b42caffedfbeda916ce70d1cb94a20f0ad0f2df3
5,8be080d1669a8ebb4d2ba00be74a700af726fa8c,16,1,ffffffffffffffffffffffffffffffffffffffff,[1],4cced4a983a1db28093902781a350cbc87dbdd3c,cf1fc498b8a2b4cb79bc2ed5abf074ba96ae69a1,1,b15397c2c5d37a8454d18a6dcbb3a96081ce6dcf,1.0,0.0,[6fb682c27d3d5366a3e4fe259052f5a149a32e8a],6eb682c27d3d5366a3e4fe259052f5a149a32e8a
6,8be080d1669a8ebb4d2ba00be74a700af726fa8c,16,0,0c97f1cded88623e1ac6b4dfe6f159ea77cc7814,[],4cced4a983a1db28093902781a350cbc87dbdd3c,1939f247b3ae06244a8d50fa7dabbcc975c6fc4c,4,b15397c2c5d37a8454d18a6dcbb3a96081ce6dcf,1.0,0.0,[e024027d4dca1014dd25bafaa0945eafed4d9464],e124027d4dca1014dd25bafaa0945eafed4d9464
7,8be080d1669a8ebb4d2ba00be74a700af726fa8c,16,0,157d30effdb78d65e66796f1b5ee70d3154a48e9,[],4cced4a983a1db28093902781a350cbc87dbdd3c,1939f247b3ae06244a8d50fa7dabbcc975c6fc4c,0,8cd1a4f9cbda6a7d865c74141383232a67c86c8f,1.0,0.0,[b8f30131d06ae2e546d2f777169960d24c63712a],b9f30131d06ae2e546d2f777169960d24c63712a
8,8be080d1669a8ebb4d2ba00be74a700af726fa8c,16,0,ffffffffffffffffffffffffffffffffffffffff,[0],4cced4a983a1db28093902781a350cbc87dbdd3c,cf1fc498b8a2b4cb79bc2ed5abf074ba96ae69a1,0,b15397c2c5d37a8454d18a6dcbb3a96081ce6dcf,1.0,0.0,[5520a0c666de001163019018e32fe8eede869c3b],5420a0c666de001163019018e32fe8eede869c3b
9,ffffffffffffffffffffffffffffffffffffffff,8,0,ffffffffffffffffffffffffffffffffffffffff,[],4cced4a983a1db28093902781a350cbc87dbdd3c,ffffffffffffffffffffffffffffffffffffffff,0,b90d431a51074fa45a576ea416f3031f335c676e,0.0,0.0,[],b15397c2c5d37a8454d18a6dcbb3a96081ce6dcf


We can also parse the event logs to get fine-grained timing for remote tasks. However, as a user, I'd probably only care about the time taken to run the task, so this can be refined considerably.

In [18]:
import json

event_list = []

# Get and decode all task timing/event logs.
#
# The keys are read through the non-decoding client (`rc_non`): they presumably
# embed raw binary worker/task IDs (see key_to_hex_identifiers), which are not
# text and cannot be relied on to decode as UTF-8. Only the JSON payload is
# decoded, explicitly, before parsing.
for key in rc_non.keys("event_log*"):
    content = rc_non.lrange(key, 0, -1)
    # NOTE(review): only the first list entry of each key is parsed; confirm
    # that a key never holds more than one entry.
    event_list.append(json.loads(content[0].decode()))

In [19]:
from collections import defaultdict

# This seems to be the best way to do the event_log -> dataframe pipeline.
# First generate a (key, [value]) mapping for all tasks and then apply some pandas operations to convert.

# event_dict stores timing info: one NaN-filled array per (label, start/stop)
# pair, with one slot per entry of event_list.
event_dict = defaultdict(lambda: np.full(len(event_list), np.nan))

# info_dict stores metadata - such as function names and task id: one
# None-filled list per metadata key, with one slot per entry of event_list.
info_dict = defaultdict(lambda: [None] * len(event_list))

for i, task_event in enumerate(event_list):
    # Each event is a 4-item record. The first item is named `timestamp`, not
    # `time`: in a notebook the loop variable is a global, and `time` would
    # overwrite the imported `time` module that `example` relies on.
    for timestamp, label, startstop, info in task_event:
        event_dict[(label, startstop)][i] = timestamp
        if info:
            for info_key in info:
                info_dict[info_key][i] = info[info_key]

In [20]:
# Because `event_dict` is keyed by (label, start/stop) tuples, the resulting frame
# has hierarchical columns, which could be useful. Note that joining or merging it
# with frames that have flat columns will throw this structure away.

# Relabel the start/stop codes 1 and 2 as 'start' and 'end'.
edf = pd.DataFrame(event_dict).rename(columns={1: 'start', 2: 'end'})
edf

Unnamed: 0_level_0,ray:acquire_lock,ray:acquire_lock,ray:get_task,ray:get_task,ray:import_remote_function,ray:import_remote_function,ray:submit_task,ray:submit_task,ray:task,ray:task,ray:task:execute,ray:task:execute,ray:task:get_arguments,ray:task:get_arguments,ray:task:store_outputs,ray:task:store_outputs,ray:wait_for_function,ray:wait_for_function
Unnamed: 0_level_1,start,end,start,end,start,end,start,end,start,end,start,end,start,end,start,end,start,end
0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,,,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0
1,1496885000.0,1496885000.0,1496885000.0,1496885000.0,,,,,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0
2,1496885000.0,1496885000.0,1496885000.0,1496885000.0,,,,,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0
3,1496885000.0,1496885000.0,1496885000.0,1496885000.0,,,,,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0
4,1496885000.0,1496885000.0,1496885000.0,1496885000.0,,,,,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,,,1496885000.0,1496885000.0
5,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,,,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0
6,1496885000.0,1496885000.0,1496885000.0,1496885000.0,,,,,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0
7,1496885000.0,1496885000.0,1496885000.0,1496885000.0,,,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0
8,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,,,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0,1496885000.0


In [21]:
# One row per event-log entry, one column per metadata key seen in the logs.
idf = pd.DataFrame(info_dict)
# Disabled: wrapping the column labels in a MultiIndex is not idempotent
# (re-running the line would wrap the labels again).
# idf.columns = pd.MultiIndex.from_tuples([(c, '') for c in idf]) # this is non-idempotent!
idf

Unnamed: 0,function_name,task_id,traceback,type,value
0,__main__.example,3296b98523df3487ed2f42ce6ccc6e649b13f175,,,
1,__main__.example,9a135aaaa66d21bf62c802fa120512f24478b227,,,
2,__init__,e124027d4dca1014dd25bafaa0945eafed4d9464,,,
3,to_go,1a985e2029f61fc87a27435b8a0b4af31cc72497,,,
4,error,b42caffedfbeda916ce70d1cb94a20f0ad0f2df3,"Traceback (most recent call last):\n File ""/U...",<class 'ZeroDivisionError'>,division by zero
5,__main__.example,5420a0c666de001163019018e32fe8eede869c3b,,,
6,__init__,b9f30131d06ae2e546d2f777169960d24c63712a,,,
7,__init__,8cd1a4f9cbda6a7d865c74141383232a67c86c8f,,,
8,__main__.example,6eb682c27d3d5366a3e4fe259052f5a149a32e8a,,,


In [61]:
# Put task metadata (`idf`) and task timings (`edf`) side by side. Both frames have
# one row per entry of `event_list`, so rows line up by position.
remote_table = pd.concat([idf, edf], axis=1)

In [49]:
# Example: for each failed task, report the error, function name, node IP
# address, actor ID and parent task ID.

# Failed tasks are the rows of `idf` that carry a traceback.
errors = idf.loc[idf.traceback.notnull()]
# Attach the task-table entry of each failed task, matched on task ID.
error_task = pd.merge(errors, task_df,
                      left_on="task_id", right_on="TaskSpec.TaskID")
# Attach the client-table entry of the local scheduler the task is recorded under.
err_task_loc = pd.merge(error_task, client_df,
                        left_on="LocalSchedulerID", right_on="DBClientID")
err_task_loc[[
    'function_name',
    'traceback',
    'value',
    'TaskSpec.ActorID',
    'TaskSpec.ParentTaskID',
    'node_ip_address',
    'task_id',
]]

Unnamed: 0,function_name,traceback,value,TaskSpec.ActorID,TaskSpec.ParentTaskID,node_ip_address,task_id
0,error,"Traceback (most recent call last):\n File ""/U...",division by zero,727b7792eb60acf3f8a1361be0f54babb5419413,b15397c2c5d37a8454d18a6dcbb3a96081ce6dcf,127.0.0.1,b42caffedfbeda916ce70d1cb94a20f0ad0f2df3


# TODO:

- Error messages logging (specifics - which node, which function call, which actor, what time)
  - Getting Error messages from Redis is redundant because we already get info in the event_log. Error messages provide extra information such as `error_id` and `type`, which don't seem particularly useful.
  - ~Create table for workers (Id, socket info, node IP address) ... are workers even still a proper abstraction~
  - Get multinode setting - test out client table
  - !! Write out an example for error tracing