In [11]:
import numpy as np
from sklearn.cluster import SpectralClustering
import sklearn
import networkx
import pandas as pd
import matplotlib.pyplot as plt
import os
import ujson

In [26]:
def d1(similarity: np.ndarray, r: float = 1.05) -> np.ndarray:
    """Convert a square similarity matrix into a distance matrix.

    Every off-diagonal entry ``s`` becomes ``1 / r**s``; the diagonal of the
    result is forced to 0.

    NOTE: ``similarity`` is modified in place -- its diagonal is overwritten
    with 0 before the conversion, and the caller's array keeps that change.

    Args:
        similarity: Square array of pairwise similarities (mutated, see above).
        r: Base of the exponential decay.

    Returns:
        A new array of the same shape holding the distances.
    """
    # In-place write on the caller's array.
    np.fill_diagonal(similarity, 0)

    decay = np.power(r, similarity)
    distance = 1 / decay

    # r**0 == 1 on the diagonal, so reset self-distance to 0.
    np.fill_diagonal(distance, 0)
    return distance

In [27]:
# Load the collaboration-distance table; its header supplies the faculty names.
data_dir = "../data/"
f_collab_distance = os.path.join(data_dir, "collaboration_distances.csv")
df_collab_distance = pd.read_csv(f_collab_distance)

# Faculty names are every column label except the first one
# (presumably a row-label column -- confirm against the CSV).
faculties = list(df_collab_distance.columns[1:])
num_ppl = len(faculties)

In [62]:
simMat = []       # List of similarity matrices
# List of distance matrices (outputs of d1). The cells below append to it, so it
# must exist before they run; without this the notebook fails on a fresh kernel.
dmatrices = []

In [63]:
# Joint Publication

f_joint_pub = os.path.join(data_dir, "joint_publication.json")
with open(f_joint_pub) as fh:
    d_joint_pub = ujson.load(fh)

# Values are consumed in file order and written to the strict upper triangle
# row by row: (0,1), (0,2), ..., (0,n-1), (1,2), ...  The JSON keys are never
# inspected, so the file's entry order has to match this pair order already.
sim_joint_pub = np.zeros((num_ppl, num_ppl))
upper_rows, upper_cols = np.triu_indices(num_ppl, k=1)
for pos, count in enumerate(d_joint_pub.values()):
    sim_joint_pub[upper_rows[pos], upper_cols[pos]] = count

# Mirror the upper triangle into the lower one to get a symmetric matrix.
sim_joint_pub = sim_joint_pub + sim_joint_pub.T

simMat.append(sim_joint_pub)

In [64]:
# Directed Citation

f_dir_cite = os.path.join(data_dir, "citation_directed.json")
with open(f_dir_cite) as fh:
    d_dir_cite = ujson.load(fh)

# Values are consumed in file order and laid out row-major over the full
# num_ppl x num_ppl grid (diagonal included). The JSON keys are never
# inspected, so the file's entry order has to match that layout already.
sim_dir_cite = np.zeros((num_ppl, num_ppl))
for pos, count in enumerate(d_dir_cite.values()):
    sim_dir_cite.flat[pos] = count

# Add both citation directions together so the matrix becomes symmetric
# (this also doubles whatever was stored on the diagonal).
sim_dir_cite = sim_dir_cite + sim_dir_cite.T

simMat.append(sim_dir_cite)

In [65]:
# Relative importance of joint publications vs. directed citations.
weight_pub = 2
weight_cite = 1

# Normalise both weights against the SAME total so they sum to 1.
# (Dividing weight_cite by a sum that already contains the normalised
# weight_pub would give 1 / (2/3 + 1) = 0.6 instead of 1/3.)
weight_total = weight_pub + weight_cite
weight_pub /= weight_total
weight_cite /= weight_total

# Weighted blend of the two similarity matrices.
sim_pub_cite = weight_pub * sim_joint_pub + weight_cite * sim_dir_cite


In [66]:
# Joint Citation

f_joint_cite = os.path.join(data_dir, "citation_joint.json")
with open(f_joint_cite) as fh:
    d_joint_cite = ujson.load(fh)

# Values are consumed in file order and written to the strict upper triangle
# row by row: (0,1), (0,2), ..., (0,n-1), (1,2), ...  The JSON keys are never
# inspected, so the file's entry order has to match this pair order already.
sim_joint_cite = np.zeros((num_ppl, num_ppl))
upper_rows, upper_cols = np.triu_indices(num_ppl, k=1)
for pos, count in enumerate(d_joint_cite.values()):
    sim_joint_cite[upper_rows[pos], upper_cols[pos]] = count

# Mirror the upper triangle into the lower one to get a symmetric matrix.
sim_joint_cite = sim_joint_cite + sim_joint_cite.T

# d1 also zeroes the diagonal of sim_joint_cite in place.
dist_joint_cite = d1(sim_joint_cite)
dmatrices.append(dist_joint_cite)

simMat.append(sim_joint_cite)

In [67]:
# Common journals: pairwise similarity read from a CSV table.
f_com_journal = os.path.join(data_dir, "common_journals_.csv")
df_com_journals = pd.read_csv(f_com_journal)

# Keep only the faculty columns, in the order of `faculties`.
# copy=True guarantees a writable array: d1 overwrites the diagonal in place,
# and pandas may otherwise hand back a read-only view (Copy-on-Write).
sim_com_journals = df_com_journals[faculties].to_numpy(copy=True)

# d1 zeroes the diagonal of sim_com_journals as a side effect.
dist_com_journals = d1(sim_com_journals)
dmatrices.append(dist_com_journals)

simMat.append(sim_com_journals)

In [68]:
# Common references: pairwise similarity read from a CSV table.
f_com_ref = os.path.join(data_dir, "common_references_.csv")
df_com_ref = pd.read_csv(f_com_ref)

# Keep only the faculty columns, in the order of `faculties`.
# copy=True guarantees a writable array: d1 overwrites the diagonal in place,
# and pandas may otherwise hand back a read-only view (Copy-on-Write).
sim_com_ref = df_com_ref[faculties].to_numpy(copy=True)

# d1 zeroes the diagonal of sim_com_ref as a side effect.
dist_com_ref = d1(sim_com_ref)
dmatrices.append(dist_com_ref)

simMat.append(sim_com_ref)

## Spectral Clustering

In [69]:
def print_clusters(names, labels):
    """Print the members of each cluster, separated by dashed lines.

    Args:
        names: Sequence of item names, indexable by position.
        labels: NumPy integer array; ``labels[k]`` is the cluster id of
            ``names[k]``. Cluster ids run from 0 to ``labels.max()``.

    Output is one dashed separator before every cluster (including empty
    ones), one name per line, and a closing separator at the end.
    """
    separator = "- " * 24

    # One bucket per cluster id, 0 .. labels.max().
    cluster_count = labels.max() + 1
    buckets = [[] for _ in range(cluster_count)]
    for pos in range(labels.shape[0]):
        buckets[labels[pos]].append(names[pos])

    for members in buckets:
        print(separator)
        for member in members:
            print(member)
    print(separator)
            

In [70]:
# Cluster on the joint-publication similarity used directly as the affinity.
# random_state is pinned so the labels are reproducible across kernel
# restarts; spectral clustering is otherwise randomly initialised.
res = SpectralClustering(
    n_clusters=20,
    affinity="precomputed",
    random_state=0,
).fit(sim_joint_pub)
print_clusters(faculties, res.labels_)


- - - - - - - - - - - - - - - - - - - - - - - - 
Scott  Ahlgren
Gabriele La Nave
Partha Sarathi Dey
- - - - - - - - - - - - - - - - - - - - - - - - 
Jeremy Tyson
Steven  Bradlow
- - - - - - - - - - - - - - - - - - - - - - - - 
Pierre Albin
- - - - - - - - - - - - - - - - - - - - - - - - 
Igor G. Nikolaev
- - - - - - - - - - - - - - - - - - - - - - - - 
William  Haboush
- - - - - - - - - - - - - - - - - - - - - - - - 
Runhuan Feng
Xiaochen Jing
- - - - - - - - - - - - - - - - - - - - - - - - 
Alexander  Yong
- - - - - - - - - - - - - - - - - - - - - - - - 
Charles  Rezk
Matthew Ando
- - - - - - - - - - - - - - - - - - - - - - - - 
Daniel Berwick-Evans
- - - - - - - - - - - - - - - - - - - - - - - - 
Sheldon Katz H.
Renming Song
Randy McCarthy
Sankar P. Dutta
Richard B. Sowers
Eduard-Wilhelm Kirr
Ely Kerman
- - - - - - - - - - - - - - - - - - - - - - - - 
Anil  Hirani
Nathan M.  Dunfield
- - - - - - - - - - - - - - - - - - - - - - - - 
Aimo  Hinkkanen
- - - - - - - - - - - - - - - - - - 



In [92]:
# Combine all similarity matrices into one affinity: each matrix is scaled by
# its own maximum before being summed.
# dtype=float keeps the in-place accumulation valid even if the first matrix
# happens to be integer-typed.
simsum = np.zeros_like(simMat[0], dtype=float)

for sim in simMat:
    peak = sim.max()
    if peak == 0:
        # An all-zero matrix carries no information; dividing by its maximum
        # would be 0/0 and fill the combined affinity with NaN.
        continue
    simsum += sim / peak


In [93]:
# Cluster on the combined (max-normalised, summed) similarity matrix.
# random_state is pinned so the labels are reproducible across kernel
# restarts; spectral clustering is otherwise randomly initialised.
res = SpectralClustering(
    n_clusters=20,
    affinity="precomputed",
    random_state=0,
).fit(simsum)
print_clusters(faculties, res.labels_)

- - - - - - - - - - - - - - - - - - - - - - - - 
Denka  Kutzarova
Timur Oikhberg
- - - - - - - - - - - - - - - - - - - - - - - - 
Rosemary Guzman
- - - - - - - - - - - - - - - - - - - - - - - - 
Nathan M.  Dunfield
Renming Song
Alexander  Yong
Vesna Stojanoska
William  Haboush
Jeremiah  Heller
- - - - - - - - - - - - - - - - - - - - - - - - 
Igor G. Nikolaev
- - - - - - - - - - - - - - - - - - - - - - - - 
Anil  Hirani
- - - - - - - - - - - - - - - - - - - - - - - - 
Igor Mineyev
- - - - - - - - - - - - - - - - - - - - - - - - 
Runhuan Feng
Xiaochen Jing
- - - - - - - - - - - - - - - - - - - - - - - - 
Vera Mikyoung Hur
Zoi Rapti
Lee DeVille
Jared  Bronski
Felix Leditzky
- - - - - - - - - - - - - - - - - - - - - - - - 
Richard B. Sowers
Bruce  Reznick
- - - - - - - - - - - - - - - - - - - - - - - - 
Charles  Rezk
Matthew Ando
- - - - - - - - - - - - - - - - - - - - - - - - 
Sankar P. Dutta
- - - - - - - - - - - - - - - - - - - - - - - - 
Partha Sarathi Dey
- - - - - - - - - - - - - - -