In [1]:
import multiprocessing
import pandas as pd
import numpy as np
from concurrent.futures import ProcessPoolExecutor
from itertools import repeat
from diff_match_patch import diff_match_patch

In [2]:
df = pd.read_csv("ncbi_sgene_good_unique.csv")

# Upper bound on how many sequences (rows of df) take part in the pairwise
# comparison. Raise it to process more of the file.
MAX_SEQUENCES = 100

# N is the side length of the distance matrix. It is clamped to the number of
# rows actually present so that positional lookups into df never run past the
# end of the frame and the label list always matches the matrix size.
N = min(MAX_SEQUENCES, df.shape[0])
M = np.arange(N)    # row indices handed out to the worker processes

# Labels for the rows/columns of the distance matrix, in row order.
accessions = df["accession"].iloc[:N].tolist()

In [3]:
def levenshtein_metric(x, y):
    """Return the edit distance between the sequences stored in rows x and y.

    Parameters
    ----------
    x, y : int-like
        Positional row indices into the module-level ``df``; each is passed
        through ``int()`` before the lookup.

    Returns
    -------
    int
        Value of ``diff_match_patch.diff_levenshtein`` for the diff between
        the two ``sgene_nucleotide`` strings.
    """
    # Select the column once instead of materialising a full row per lookup.
    sequences = df['sgene_nucleotide']

    dmp = diff_match_patch()
    # By default diff_match_patch stops refining a diff after Diff_Timeout
    # seconds (1.0) and returns a valid but possibly non-minimal result, which
    # would make the distance depend on machine load. 0 removes the deadline so
    # the result is deterministic, at the cost of unbounded run time for very
    # dissimilar inputs.
    dmp.Diff_Timeout = 0
    diffs = dmp.diff_main(sequences.iloc[int(x)], sequences.iloc[int(y)])
    return dmp.diff_levenshtein(diffs)

In [4]:
def worker(index):
    """Compute one row of the lower-triangular distance matrix.

    Parameters
    ----------
    index : int-like
        Row to compute. Distances are evaluated against every row with a
        smaller index (``0 .. index - 1``).

    Returns
    -------
    numpy.ndarray
        Integer array of length ``N``: the first ``index`` entries hold the
        distances, the remaining entries are zero.
    """
    if index % 10 == 0:
        # Progress information
        print(f"Current index {index}")

    # Allocate the zero-filled row explicitly as integers. Padding an empty
    # list (the index == 0 case) would yield a float64 row, and stacking that
    # with the integer rows would promote the whole matrix to float.
    row = np.zeros(N, dtype=np.int64)
    for other in range(index):
        row[other] = levenshtein_metric(index, other)
    return row

In [5]:
# One worker process per CPU reported by the OS; assign a smaller number
# (e.g. 4) here to leave cores free for other work.
nproc = multiprocessing.cpu_count()
print(f"Calculating distances. Using {nproc} CPUs")

# Each task computes one row of the strictly lower-triangular matrix;
# pool.map returns the rows in the same order as M.
with ProcessPoolExecutor(max_workers=nproc) as pool:
    results = list(pool.map(worker, M))

# Stack the rows into an N x N array, then mirror the lower triangle onto the
# upper one. The diagonal stays zero because no row is compared with itself.
lower_triangle = np.stack(results)
distances = lower_triangle + lower_triangle.T

# Label both axes with the accession ids. Only the column labels end up in the
# file because the row index is not written.
df_dist = pd.DataFrame(distances, index=accessions, columns=accessions)
df_dist.to_csv("distances_lev.csv", index=False)

Calculating distances. Using 8 CPUs
Current index 0
Current index 10
Current index 20
Current index 30
Current index 40
Current index 50
Current index 60
Current index 70
Current index 80
Current index 90
