In [1]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
import scipy as sp
import pandas as pd
from scipy.cluster.hierarchy import dendrogram
from Bio import Phylo
from Bio.Cluster import treecluster
from Bio.Phylo import Consensus
import os
import ujson
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from Bio import Phylo

### Convert a NumPy distance matrix to a Bio.Phylo.TreeConstruction.DistanceMatrix object

In [4]:
from Bio.Phylo.TreeConstruction import DistanceMatrix

def phylo_dmatrix_from_numpy(labels, distance_mat):
    """Wrap a square distance array in a Bio.Phylo ``DistanceMatrix``.

    Only the lower triangle of ``distance_mat`` (diagonal included) is read:
    row ``i`` contributes the entries ``distance_mat[i, 0] .. distance_mat[i, i]``.
    Anything above the diagonal is ignored.

    Parameters
    ----------
    labels
        Names handed to ``DistanceMatrix`` unchanged.
    distance_mat
        Object exposing ``.shape`` and ``[i, j]`` indexing; ``shape[0]`` rows
        are converted.

    Returns
    -------
    DistanceMatrix
        Built from ``labels`` and the extracted lower-triangular nested list.
    """
    n_rows = distance_mat.shape[0]
    lower_triangle = [
        [distance_mat[row, col] for col in range(row + 1)]
        for row in range(n_rows)
    ]
    return DistanceMatrix(labels, matrix=lower_triangle)

### Distance Function

In [27]:
def d1(similarity: np.ndarray, r: float = 2.0) -> np.ndarray:
    """Convert a similarity matrix into a distance matrix, ``d = 1 / r**s``.

    Every off-diagonal similarity ``s`` is mapped to ``1 / r**s``; the diagonal
    of the result is forced to 0.

    The input is never modified: the computation runs on a private copy, so
    the caller's array keeps its diagonal and read-only arrays are accepted.

    Parameters
    ----------
    similarity
        Array with at least two dimensions (required by ``np.fill_diagonal``).
    r
        Base of the exponential decay; defaults to 2.0.

    Returns
    -------
    np.ndarray
        New array of distances with the same shape as ``similarity``.
    """
    # Private, writable copy: the diagonal is overwritten below and that must
    # not leak into (or fail on) the caller's data.
    sim = np.array(similarity, copy=True)
    # Zero the self-similarities before exponentiating, as the diagonal is
    # discarded anyway.
    np.fill_diagonal(sim, 0)
    ret = 1 / np.power(r, sim)
    np.fill_diagonal(ret, 0)
    return ret

### Obtain all the distance matrices from different datasets

In [9]:
# Locate the data directory and read the collaboration-distance table; only its
# header is used here, to establish the list (and order) of faculty names.
data_dir = "../data/"
f_collab_distance = os.path.join(data_dir, "collaboration_distances.csv")
df_collab_distance = pd.read_csv(f_collab_distance)

# Every column except the first is taken as a faculty name
# (NOTE(review): the first column is presumably a row-label column — confirm).
faculties = list(df_collab_distance.columns[1:])
num_ppl = len(faculties)

In [38]:
# Accumulator for the NumPy distance matrices, one per data source, in the
# order the loader cells below append to it.  Those cells call
# dmatrices.append(...), so re-running one of them without re-running this cell
# first adds a duplicate matrix.
dmatrices = []    # List of distance matrices

In [39]:
# Joint Publication

f_joint_pub = os.path.join(data_dir, "joint_publication.json")
with open(f_joint_pub) as fh:
    d_joint_pub = ujson.load(fh)

# The JSON values are consumed in file order and written into the strict upper
# triangle row by row: (0,1), (0,2), ..., (0,n-1), (1,2), ...  The keys are not
# inspected, so the file must list the pairs in exactly that order
# (NOTE(review): confirm against whatever produced the file).
sim_joint_pub = np.zeros((num_ppl, num_ppl))
row, col = 0, 1
for pair_value in d_joint_pub.values():
    sim_joint_pub[row, col] = pair_value
    col += 1
    if col == num_ppl:
        # Row exhausted: resume just right of the next row's diagonal.
        row += 1
        col = row + 1

# Mirror the upper triangle to obtain a symmetric matrix, then map to distances.
sim_joint_pub = sim_joint_pub + sim_joint_pub.T
dist_joint_pub = d1(sim_joint_pub)
dmatrices.append(dist_joint_pub)

In [40]:
# Joint citation

f_joint_cite = os.path.join(data_dir, "citation_joint.json")
with open(f_joint_cite) as fh:
    d_joint_cite = ujson.load(fh)

# Values are taken in file order and placed in the strict upper triangle, row
# by row: (0,1), (0,2), ..., (0,n-1), (1,2), ...  Keys are ignored, so the file
# is expected to enumerate the pairs in that same order
# (NOTE(review): confirm against the file's producer).
sim_joint_cite = np.zeros((num_ppl, num_ppl))
row, col = 0, 1
for pair_value in d_joint_cite.values():
    sim_joint_cite[row, col] = pair_value
    col += 1
    if col == num_ppl:
        # Row exhausted: resume just right of the next row's diagonal.
        row += 1
        col = row + 1

# Mirror the upper triangle to obtain a symmetric matrix, then map to distances.
sim_joint_cite = sim_joint_cite + sim_joint_cite.T
dist_joint_cite = d1(sim_joint_cite)
dmatrices.append(dist_joint_cite)

In [41]:
# Directed citation

f_dir_cite = os.path.join(data_dir, "citation_directed.json")
with open(f_dir_cite) as fh:
    d_dir_cite = ujson.load(fh)

# Unlike the joint measures, this file fills the whole square matrix: values
# are taken in file order and laid out row-major, (0,0), (0,1), ..., (0,n-1),
# (1,0), ...  Keys are ignored, so the file must follow that order
# (NOTE(review): confirm against the file's producer).
sim_dir_cite = np.zeros((num_ppl, num_ppl))
row = col = 0
for pair_value in d_dir_cite.values():
    sim_dir_cite[row, col] = pair_value
    col += 1
    if col == num_ppl:
        # End of row: wrap to the first column of the next row.
        row, col = row + 1, 0

# Add both directions together so entry (a, b) holds [a, b] + [b, a]; the
# matrix becomes symmetric before it is converted to distances.
sim_dir_cite = sim_dir_cite + sim_dir_cite.T
dist_dir_cite = d1(sim_dir_cite)
dmatrices.append(dist_dir_cite)

In [42]:
# Common journals

f_com_journal = os.path.join(data_dir, "common_journals_.csv")
df_com_journals = pd.read_csv(f_com_journal)
# Select the faculty columns in the canonical `faculties` order.  copy=True
# guarantees a private, writable array, so downstream in-place edits (such as
# zeroing the diagonal) can never hit a read-only view of the frame's data
# (pandas copy-on-write returns such views from to_numpy()).
# NOTE(review): rows are assumed to follow the same order as `faculties` — confirm.
sim_com_journals = df_com_journals[faculties].to_numpy(copy=True)
dist_com_journals = d1(sim_com_journals)
dmatrices.append(dist_com_journals)

In [43]:
# Common references

f_com_ref = os.path.join(data_dir, "common_references_.csv")
df_com_ref = pd.read_csv(f_com_ref)
# Select the faculty columns in the canonical `faculties` order.  copy=True
# guarantees a private, writable array, so downstream in-place edits (such as
# zeroing the diagonal) can never hit a read-only view of the frame's data
# (pandas copy-on-write returns such views from to_numpy()).
# NOTE(review): rows are assumed to follow the same order as `faculties` — confirm.
sim_com_ref = df_com_ref[faculties].to_numpy(copy=True)
dist_com_ref = d1(sim_com_ref)
dmatrices.append(dist_com_ref)

### Convert all the numpy distance matrices to phylo DistanceMatrix objects

In [45]:
# One Bio.Phylo DistanceMatrix per NumPy distance matrix, all labelled with the
# same faculty names and kept in the order of `dmatrices`.
phylo_dmatrices = [
    phylo_dmatrix_from_numpy(faculties, np_dmat) for np_dmat in dmatrices
]


### Construct phylo trees

In [47]:
# Build one UPGMA tree per distance matrix.  No distance calculator is given
# because upgma() is called directly on matrices that are already computed.
constructor = DistanceTreeConstructor(method='upgma', distance_calculator=None)
phylo_trees = [constructor.upgma(phylo_dmat) for phylo_dmat in phylo_dmatrices]


In [50]:
# Combine the per-dataset UPGMA trees into a single majority-rule consensus tree.
# NOTE(review): cutoff=0.5 presumably keeps only clades present in more than
# half of the input trees — confirm against the Bio.Phylo.Consensus docs.
consensus_tree = Consensus.majority_consensus(phylo_trees, cutoff=0.5)