In [1]:
import sys
from scipy.spatial.distance import hamming

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
%matplotlib inline

import os, pickle, re, glob, time
import pandas as pd
import numpy as np
np.set_printoptions(precision=2)

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
from scipy import stats
from collections import Counter

sns.set_style('ticks')
# Use the fully qualified option name: set_option treats its key as a pattern
# that must match exactly one option, so the bare 'precision' shorthand raises
# OptionError as soon as more than one pandas option name contains it.
pd.set_option('display.precision', 2)

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the pickled run arguments; the context manager closes the file handle
# that the bare open() call used to leave open.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('arguments.pickle', 'rb') as args_file:
    args = pickle.load(args_file)

# Load data

In [3]:
# Load the preprocessed feature matrix from the preprocessing directory; the
# context manager guarantees the file handle is closed after reading.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open(os.path.join(args.preprocess_dir, "data_X_df.pickle"), "rb") as data_file:
    data = pickle.load(data_file)

In [4]:
# Preview the loaded matrix (the recorded output is an abbreviated object-dtype array).
data

array([['C', 'T', 'C', ..., 'T', 'C', 'C'],
       ['C', 'T', 'C', ..., 'T', 'C', 'C'],
       ['C', 'T', 'C', ..., 'T', 'C', 'C'],
       ...,
       ['T', 'G', 'T', ..., 'C', 'C', 'T'],
       ['C', 'T', 'C', ..., 'T', 'C', 'C'],
       ['T', 'G', 'T', ..., 'C', 'C', 'T']], dtype=object)

In [5]:
# Dimensions of the loaded matrix.
data.shape

(3406, 54)

In [6]:
# Number of rows in the matrix, i.e. how many samples get a distance row.
n_sample = len(data)
n_sample

3406

# Start Ray and define the distance actor

In [7]:
import ray

# os.cpu_count() returns None when the CPU count cannot be determined; fall
# back to 1 because later cells use num_cpus in range() and as a modulus.
num_cpus = os.cpu_count() or 1

# ignore_reinit_error=True lets this cell be re-run in a live kernel without
# raising because Ray is already initialised.
ray.init(num_cpus=num_cpus, ignore_reinit_error=True)

2021-09-11 15:29:30,663	INFO services.py:1171 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '192.168.18.6',
 'raylet_ip_address': '192.168.18.6',
 'redis_address': '192.168.18.6:6379',
 'object_store_address': '/tmp/ray/session_2021-09-11_15-29-30_201134_6509/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-09-11_15-29-30_201134_6509/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-09-11_15-29-30_201134_6509',
 'metrics_export_port': 54923,
 'node_id': '6beef68fff25a216aaf854a687167b2cd4a533ad'}

In [8]:
@ray.remote
class FeatureMatrix:
    """Ray actor that stores a feature matrix and computes Hamming-distance rows.

    Each actor keeps the matrix it was constructed with, so callers only need
    to send a row index when requesting distances.
    """

    def __init__(self, X):
        """Store the feature matrix ``X`` (one sample per row)."""
        self.X = X

    def get_X(self):
        """Return the stored feature matrix."""
        return self.X

    @ray.method(num_returns=1)
    def cal_dist(self, i):
        """Return the Hamming distances from row ``i`` to rows ``i, i+1, ...``.

        Only the rows from ``i`` onward are compared, so the result is row
        ``i`` of the upper triangle (diagonal included) of the pairwise
        distance matrix.

        Parameters
        ----------
        i : int
            Index of the anchor row.

        Returns
        -------
        numpy.ndarray
            1-D float array with one entry per row in ``X[i:]``; each entry is
            the fraction of positions at which that row differs from row ``i``.
        """
        X = np.asarray(self.X)
        anchor = X[i]
        # One broadcast comparison replaces a Python-level loop of per-row
        # hamming() calls; the mean of the mismatch mask along each row is the
        # same "proportion of differing components" that hamming() computes.
        mismatches = X[i:] != anchor
        return np.asarray(mismatches.mean(axis=1), dtype=float)

In [9]:
# Start a named, detached actor holding `data`; it is looked up by this name
# in the next cell. The call is left as the last expression so the actor
# handle is displayed.
FeatureMatrix.options(name="FeatureMatrix", lifetime="detached").remote(data)

Actor(FeatureMatrix,df5a1a8201000000)

In [10]:
# Look the actor up by its registered name and show the handle.
X_actor = ray.get_actor("FeatureMatrix")
print(X_actor)

# Round-trip the stored matrix to confirm the actor holds the full data set.
X_dat = ray.get(X_actor.get_X.remote())
print(X_dat.shape)

Actor(FeatureMatrix,df5a1a8201000000)
(3406, 54)


# Create one Ray actor per CPU (num_cpus actors in total)

In [11]:
# One named, detached actor per CPU, each constructed with `data`.
# .remote() already returns the actor handle, so the handles are collected
# directly instead of being looked up again by name with ray.get_actor().
X_actor_list = [
    FeatureMatrix.options(name="FeatureMatrix_{}".format(i), lifetime="detached").remote(data)
    for i in range(num_cpus)
]

In [12]:
# Show the actor handles that were created.
X_actor_list

[Actor(FeatureMatrix,7bbd902801000000),
 Actor(FeatureMatrix,bd37d26201000000),
 Actor(FeatureMatrix,88866c7d01000000),
 Actor(FeatureMatrix,d251967801000000),
 Actor(FeatureMatrix,3bf0c85601000000),
 Actor(FeatureMatrix,72e11b4601000000),
 Actor(FeatureMatrix,62223d8501000000),
 Actor(FeatureMatrix,3106d80c01000000),
 Actor(FeatureMatrix,ae935fc001000000),
 Actor(FeatureMatrix,3db7cfef01000000),
 Actor(FeatureMatrix,a628090a01000000),
 Actor(FeatureMatrix,fafba2ba01000000),
 Actor(FeatureMatrix,b7603b6c01000000),
 Actor(FeatureMatrix,a491754501000000),
 Actor(FeatureMatrix,84b65a9401000000),
 Actor(FeatureMatrix,87b4f72601000000)]

In [13]:
# Sanity check: for the last row, cal_dist only compares the row with itself,
# so the result is a single distance.
ray.get(X_actor_list[0].cal_dist.remote(n_sample-1))

array([0.])

# List and execute tasks

In [14]:
# Submit one cal_dist call per row, assigning rows to actors round-robin.
todo_tasks = [
    X_actor_list[row % num_cpus].cal_dist.remote(row)
    for row in range(n_sample)
]
len(todo_tasks)

3406

In [15]:
# Wait for all submitted cal_dist calls and collect their results as a list
# (one array per entry of todo_tasks).
out = ray.get(todo_tasks)

# Create distance matrix

In [16]:
# Square buffer for the pairwise distances, initialised to zero.
dist_mat = np.zeros(shape=(n_sample, n_sample), dtype=float)

# Index arrays addressing the upper triangle, diagonal included.
iu = np.triu_indices(n_sample)

In [17]:
# Flatten the per-row results into one vector and write it into the upper
# triangle; entry k of `out` holds the distances from row k to rows k..end.
dist_mat[iu] = np.concatenate(out)

In [18]:
# Inspect the matrix after the upper triangle has been filled in.
dist_mat

array([[0.  , 0.  , 0.02, ..., 0.7 , 0.17, 0.72],
       [0.  , 0.  , 0.02, ..., 0.7 , 0.17, 0.72],
       [0.  , 0.  , 0.  , ..., 0.72, 0.19, 0.74],
       ...,
       [0.  , 0.  , 0.  , ..., 0.  , 0.69, 0.07],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.7 ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ]])

In [19]:
# Mirror the strictly-upper triangle into the lower triangle.
# Building the result from np.triu keeps this cell idempotent: re-running it
# on an already symmetric matrix returns the same matrix, whereas
# `dist_mat + dist_mat.T` would double every entry on each re-run (and would
# double the diagonal even on the first run).
dist_mat = np.triu(dist_mat) + np.triu(dist_mat, k=1).T
dist_mat.shape

(3406, 3406)

In [20]:
# Inspect the symmetrised distance matrix.
dist_mat

array([[0.  , 0.  , 0.02, ..., 0.7 , 0.17, 0.72],
       [0.  , 0.  , 0.02, ..., 0.7 , 0.17, 0.72],
       [0.02, 0.02, 0.  , ..., 0.72, 0.19, 0.74],
       ...,
       [0.7 , 0.7 , 0.72, ..., 0.  , 0.69, 0.07],
       [0.17, 0.17, 0.19, ..., 0.69, 0.  , 0.7 ],
       [0.72, 0.72, 0.74, ..., 0.07, 0.7 , 0.  ]])

In [22]:
# Persist the distance matrix next to the preprocessed data. The context
# manager flushes and closes the file, which the bare open() call did not
# guarantee.
dist_mat_path = os.path.join(args.preprocess_dir, 'dist_mat_X_df.pickle')
with open(dist_mat_path, 'wb') as dist_mat_file:
    pickle.dump(dist_mat, dist_mat_file, protocol=4)


# Shutdown Ray

In [23]:
# Disconnect this notebook from Ray once the distance matrix has been written.
ray.shutdown()

```bash
jupyter nbconvert --ExecutePreprocessor.timeout=-1 --to notebook --execute "sandbox_pairwise_distance.ipynb"
```