In [1]:
from Levenshtein import distance as levenshtein_distance
import pandas as pd
import numpy as np
from joblib import parallel_backend
from concurrent.futures import ProcessPoolExecutor


In [2]:
# Load the raw table; the 'region' and 'sgene_nucleotide' columns are used in the cells below.
df = pd.read_csv("ncbi.csv")

In [3]:
# Keep one row per distinct (region, sequence) pair. drop_duplicates leaves the
# original index labels in place; levenshtein_metric looks rows up by position
# (iloc), so the gaps in the labels do not matter.
X = df[['region', 'sgene_nucleotide']].drop_duplicates()
X.shape

(7326, 2)

In [4]:
def levenshtein_metric(x, y):
    """Levenshtein distance between the ``sgene_nucleotide`` values of two rows of ``X``.

    Parameters
    ----------
    x, y : int, float, NumPy scalar or size-1 array
        Positional (``iloc``) row numbers into the module-level frame ``X``.
        Size-1 arrays are accepted because ``pairwise_distances`` hands the
        metric one row of its input matrix per argument.

    Returns
    -------
    int
        Edit distance between the two sequences.
    """
    # Pull the column once instead of materialising a full row Series for each
    # operand (X.iloc[i] builds a new Series on every call).
    sequences = X['sgene_nucleotide']
    # .item() unwraps scalars and size-1 arrays of any ndim; calling int()
    # directly on a 1-D array is deprecated since NumPy 1.25.
    row_x = int(np.asarray(x).item())
    row_y = int(np.asarray(y).item())
    return levenshtein_distance(sequences.iloc[row_x], sequences.iloc[row_y])

In [5]:
# Number of worker processes handed to ProcessPoolExecutor.
nproc = 7
# Side length of the distance matrix: only row positions 0..N-1 of X are compared.
N = 100
# Row positions 0..N-1, mapped over the process pool one row per task.
M = np.arange(N)

In [6]:
# Element-wise wrapper so a single row position can be compared against a whole
# list of other row positions in one call.
vfunc = np.vectorize(levenshtein_metric)

def worker(indice):
    """Build row ``indice`` of the lower-triangular distance matrix.

    Entries ``0 .. indice-1`` hold the distance between row ``indice`` and each
    earlier row; the diagonal and everything to its right are zero-filled so
    the returned row always has length ``N``.
    """
    if indice != 0:
        earlier_rows = list(range(indice))
        row = vfunc(indice, earlier_rows)
        # np.pad's default mode appends constant zeros on the right.
        return np.pad(row, (0, N - len(row)))
    # Row 0 has nothing below the diagonal, and np.vectorize refuses size-0
    # inputs unless otypes is set, so it is handled separately.
    return np.zeros(N)

# Compute one matrix row per task across `nproc` worker processes; pool.map
# yields the rows in the same order as M.
# NOTE(review): `worker` is defined in the notebook itself; this presumably
# relies on the 'fork' start method - confirm before running on macOS/Windows.
with ProcessPoolExecutor(nproc) as pool:
    results = list(pool.map(worker, M))
    # N rows of length N each -> (N, N) lower-triangular distance matrix.
    T = np.stack(results)

print(T)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]
 ...
 [2. 2. 3. ... 0. 0. 0.]
 [2. 2. 3. ... 2. 0. 0.]
 [2. 2. 3. ... 2. 2. 0.]]


In [7]:
from itertools import repeat
def worker2(indice):
    """Plain-Python variant of ``worker``: row ``indice`` of the lower triangle.

    Computes the distance from row ``indice`` to every earlier row and
    right-pads the result with zeros up to length ``N``.
    """
    below_diagonal = [levenshtein_metric(indice, other) for other in range(indice)]
    # For indice == 0 the list is empty and the padded result is all zeros.
    return np.pad(below_diagonal, (0, N - len(below_diagonal)))

# Fan the rows out over `nproc` processes using worker2, then stack the
# length-N rows (returned in the order of M) into the (N, N) matrix.
with ProcessPoolExecutor(max_workers=nproc) as pool:
    results = [row for row in pool.map(worker2, M)]
    T = np.stack(results)

print(T)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]
 ...
 [2. 2. 3. ... 0. 0. 0.]
 [2. 2. 3. ... 2. 0. 0.]
 [2. 2. 3. ... 2. 2. 0.]]


In [8]:
from sklearn.metrics import pairwise_distances

In [9]:
# Column vector of row positions 0..N-1: each "sample" given to
# pairwise_distances is a single row position into X, not the sequence itself.
X100 = np.arange(N).reshape(-1, 1)

In [10]:
# Full pairwise matrix via scikit-learn with the custom metric. The callable
# receives two rows of X100 (length-1 arrays holding a row position), which
# levenshtein_metric converts back to integers with int().
pwd = pairwise_distances(X100, metric=levenshtein_metric)

In [11]:
# Show the distances as integers; the cast result is only displayed, not stored.
pwd.astype(int)

array([[0, 0, 1, ..., 2, 2, 2],
       [0, 0, 1, ..., 2, 2, 2],
       [1, 1, 0, ..., 3, 3, 3],
       ...,
       [2, 2, 3, ..., 0, 2, 2],
       [2, 2, 3, ..., 2, 0, 2],
       [2, 2, 3, ..., 2, 2, 0]])