### Imports

In [1]:
import pandas as pd
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource
from bokeh.models.tools import HoverTool
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import ast

### Upload data with RoBERTa vectors

In [2]:
df = pd.read_csv('../data/metadata_with_title_roberta_vectors.csv')
articles_biomed_roberta_emb = df['roberta_title_vectors'].values

articles_biomed_roberta_emb = [ast.literal_eval(a) for a in articles_biomed_roberta_emb]

### Clustering with K-means

In [3]:
kmeans = KMeans(n_clusters=20, random_state=42).fit(articles_biomed_roberta_emb)

In [4]:
df['kmean_cluster'] = kmeans.labels_

In [5]:
list(set(kmeans.labels_[-10:]))

[1, 4, 12, 15, 17, 19]

### Clustering with 25-NN

In [6]:
nbrs_2 = NearestNeighbors(n_neighbors=25, algorithm='ball_tree').fit(articles_biomed_roberta_emb)

In [7]:
distances_2, indices_2 = nbrs_2.kneighbors(articles_biomed_roberta_emb[-10:])

In [8]:
pca = PCA(n_components=200)
principal_components_roberta = pca.fit_transform(articles_biomed_roberta_emb)

In [9]:
tsne_embedded_roberta = TSNE(n_components=2).fit_transform(principal_components_roberta)

In [10]:
clusters_temp = [[]]*len(df)
for i in range(len(indices_2)):
    for j in range(len(indices_2[i])):
        temp = clusters_temp[indices_2[i][j]]
        clusters_temp[indices_2[i][j]] = temp + [i]

In [11]:
clusters = []
for c in clusters_temp:
    if len(c) == 0:
        clusters.append([-1])
    else:
        clusters.append(c)

In [12]:
df['clusters'] = clusters

### Clustering Intersection

In [13]:
df['clusters'][-10:]

13518                [0]
13519                [1]
13520             [2, 3]
13521          [3, 8, 9]
13522                [4]
13523       [3, 5, 7, 8]
13524                [6]
13525          [0, 5, 7]
13526    [0, 3, 5, 8, 9]
13527                [9]
Name: clusters, dtype: object

In [14]:
df['kmean_cluster'][-10:]

13518    12
13519     1
13520     4
13521    15
13522    17
13523    15
13524    19
13525    12
13526    15
13527    12
Name: kmean_cluster, dtype: int32

In [15]:
cluster_vote_temp = [[]]*len(df)

for i in range(len(df)):
    for j in range(10):
        if df['kmean_cluster'][i] == kmeans.labels_[-10:][j] and j in df['clusters'][i]:
            cluster_vote_temp[i] = cluster_vote_temp[i] + [j]
            

In [16]:
cluster_vote = []
for c in cluster_vote_temp:
    if len(c) == 0:
        cluster_vote.append([-1])
    else:
        cluster_vote.append(c)

In [17]:
df['cluster_vote'] = cluster_vote

df['tsne_0'] = tsne_embedded_roberta[:,0]
df['tsne_1'] = tsne_embedded_roberta[:,1]

In [18]:
df['cluster_vote'][-10:]

13518          [0]
13519          [1]
13520          [2]
13521       [3, 8]
13522          [4]
13523    [3, 5, 8]
13524          [6]
13525       [0, 7]
13526    [3, 5, 8]
13527          [9]
Name: cluster_vote, dtype: object

In [19]:
# Saving results for insights

df.to_csv('./results/cluster_results.csv')

### Bokeh Vizualization

In [20]:
source = ColumnDataSource(df)

In [21]:
COLORS_NN_GROUPS = ['lightgrey', 'blue', 'red', 'black', 'green', 'violet', 
                    'pink', 'turquoise', 'gold', 'sienna', 'orange']

In [22]:
colors = [COLORS_NN_GROUPS[df['cluster_vote'][i][0]+1] for i in range(len(df))]

In [23]:
len(colors)

13528

In [24]:
p = figure(width=800, height=600)
p.circle(x='tsne_1', y='tsne_0',
         source=source,
         size=5, color=colors)

Supplying a user-defined data source AND iterable values to glyph methods is deprecated.

See https://github.com/bokeh/bokeh/issues/2056 for more information.

  warn(message)
Supplying a user-defined data source AND iterable values to glyph methods is deprecated.

See https://github.com/bokeh/bokeh/issues/2056 for more information.

  warn(message)


In [25]:
p.title.text = 'COVID Scholars clustering representation'

In [26]:
hover = HoverTool()
hover.tooltips=[
    ('title', '@title'),
]

p.add_tools(hover)

show(p)