In [1]:
import os
import re
import json
import utils
import random
import gensim
import warnings

import numpy as np
import pandas as pd

from tasks import *
from pprint import pprint
from tqdm.notebook import tqdm
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from yellowbrick.cluster import KElbowVisualizer

warnings.filterwarnings('ignore')



In [3]:
# Load the cleaned corpus table from disk and show its (rows, columns) shape.
complete_df = pd.read_csv(filepath_or_buffer="data/clean_df.csv")
complete_df.shape

(59561, 10)

In [4]:
# Preview the first two rows to check the column layout.
complete_df.head(2)

Unnamed: 0.1,Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,0,6f005a0677f24f697c1ec3ce065ec4144519e357,Antiviral Mechanisms of Human Defensins Introd...,"Sarah S Wilson, Mayim E Wiens, Jason G Smith","Sarah S Wilson (University of Washington, 1705...",Defensins are an effector component of the inn...,Defensins are one of the most abundant classes...,Direct inactivation of viruses by human granul...,"[{'first': 'Sarah', 'middle': ['S'], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Direct ..."
1,1,4994fa72322bbf19120592304d92629226948d8e,Rapid Identification of Malaria Vaccine Candid...,"V Villard, G W Agak, G Frank, A Jafarshad, C S...","V Villard, G W Agak, G Frank, A Jafarshad, C S...",To identify malaria antigens for vaccine devel...,Human Plasmodium falciparum (Pf) infection is ...,Identification of vaccine candidates against s...,"[{'first': 'V', 'middle': [], 'last': 'Villard...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Identif..."


In [5]:
# Summary statistics of the numeric columns before any rows are dropped.
complete_df.describe()

Unnamed: 0.1,Unnamed: 0
count,59561.0
mean,29780.0
std,17193.924029
min,0.0
25%,14890.0
50%,29780.0
75%,44670.0
max,59560.0


In [7]:
# Discard every row with a missing value in any column, then re-check the summary.
complete_df = complete_df.dropna()
complete_df.describe()

Unnamed: 0.1,Unnamed: 0
count,40152.0
mean,29705.053248
std,17183.494309
min,0.0
25%,14787.75
50%,29717.0
75%,44551.25
max,59560.0


In [None]:
# Persist the NaN-free table back to disk.
# index=False keeps the row index out of the file: without it every
# save/load round trip adds another "Unnamed: 0" column (the loaded frame
# already carries "Unnamed: 0.1" and "Unnamed: 0" from earlier round trips).
# NOTE(review): this overwrites the same file the notebook loads from.
complete_df.to_csv("data/clean_df.csv", index=False)

In [None]:
# Retain only documents whose body text has more than 1000 alphabetic words.

text_word_counts = complete_df['text'].map(
    lambda body: len(re.findall(r"(?i)\b[a-z]+\b", body))
)
complete_df = complete_df[text_word_counts > 1000]
complete_df.shape

In [None]:
# Renumber rows from 0 and spot-check one abstract (first 500 characters).

complete_df = complete_df.reset_index(drop=True)
complete_df['abstract'].iloc[42][:500]

In [None]:
# Fraction of the corpus to train on; 1 keeps every row (in shuffled order).
frac_of_articles = 1
train_df = complete_df.sample(frac=frac_of_articles, random_state=42)
# Materialise the documents produced by utils.read_corpus for the 'abstract' column.
train_corpus = [document for document in utils.read_corpus(train_df, 'abstract')]

In [None]:
# Doc2Vec in distributed-memory mode (dm=1) with averaged context vectors (dm_mean=1).
# NOTE(review): gensim documents that a fixed seed is only fully reproducible
# with a single worker thread; with workers=6 results may vary between runs.

model = gensim.models.doc2vec.Doc2Vec(
    dm=1,
    dm_mean=1,
    vector_size=50,
    min_count=10,
    epochs=20,
    seed=42,
    workers=6,
)
model.build_vocab(train_corpus)
model.train(
    train_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

In [None]:
# Collect the nine task descriptions (task_1 … task_9 are not defined in this
# notebook; presumably they come from `from tasks import *` — confirm).
list_of_tasks = [
    task_1, task_2, task_3,
    task_4, task_5, task_6,
    task_7, task_8, task_9,
]

In [None]:
# Document vectors learned by the Doc2Vec model.
# gensim 4 renamed `docvecs` to `dv` and `vectors_docs` to `vectors`; use the
# new names when they exist and fall back to the gensim 3.x names otherwise,
# so the cell runs on either version.
if hasattr(model, "dv"):
    abstract_vectors = model.dv.vectors
else:
    abstract_vectors = model.docvecs.vectors_docs
# One vector per task description, as returned by utils.get_doc_vector.
array_of_tasks = [utils.get_doc_vector(task, model) for task in list_of_tasks]

In [None]:
# Store each document vector alongside its row.
# NOTE(review): assumes abstract_vectors is ordered like train_df's rows —
# confirm against how utils.read_corpus tags documents.
train_df['abstract_vector'] = list(abstract_vectors)

### Nearest Neighbors search

In [None]:
# Keep only rows whose abstract has more than 40 alphabetic words, then drop
# any row that still contains a missing value.
# (Fix: the DataFrame method is `dropna`; `drop_na` raises AttributeError.)
train_df = train_df[train_df['abstract'].apply(lambda x: len(re.findall(r"(?i)\b[a-z]+\b", x))) > 40].dropna()
train_df.shape

In [None]:
# Plain Python list of the per-row abstract vectors, used as the search/clustering input.
train_array = train_df['abstract_vector'].tolist()

In [None]:
# Index the abstract vectors in a ball tree for nearest-neighbour queries.
ball_tree = NearestNeighbors(leaf_size=20, algorithm='ball_tree').fit(X=train_array)

In [None]:
# Find the 3 closest abstracts for every task vector in a single batched query.

distances, indices = ball_tree.kneighbors(
    X=array_of_tasks,
    n_neighbors=3,
    return_distance=True,
)

In [None]:
# For each task, print the title and the first 200 characters of the abstract
# of its nearest neighbours, together with their positions and distances.
for task_text, task_indices, task_distances in zip(list_of_tasks, indices, distances):
    print("=" * 80, f"\n\nTask = {task_text[:100]}\n")
    neighbours = train_df.iloc[task_indices]
    rows = zip(task_indices, task_distances, neighbours['title'], neighbours['abstract'])
    for position, distance, title, abstract in rows:
        print(f" Text index = {position} \n Distance = {distance} \n Title: {title} \n Abstract extract: {abstract[:200]}\n\n")

### Clustering and visualisation

In [None]:
# Document vectors learned by the Doc2Vec model.
# gensim 4 renamed `docvecs` to `dv` and `vectors_docs` to `vectors`; use the
# new names when they exist and fall back to the gensim 3.x names otherwise.
if hasattr(model, "dv"):
    abstract_vectors = model.dv.vectors
else:
    abstract_vectors = model.docvecs.vectors_docs
# Elbow analysis over the k range given by (2, 16) to choose a cluster count.
# NOTE(review): this fits on all document vectors, whereas the final KMeans
# below is fitted on train_array (the length-filtered subset) — confirm intended.
kmeans = KMeans(init = 'k-means++', max_iter = 300, random_state = 42)
visualizer = KElbowVisualizer(kmeans, k = (2, 16))
visualizer.fit(abstract_vectors)
visualizer.show()

In [None]:
# The elbow plot clearly suggests 7 clusters

In [None]:
# Final clustering with 7 clusters; record each row's cluster id in 'labels'.
kmeans = KMeans(
    n_clusters=7,
    init='k-means++',
    max_iter=100,
    random_state=42,
)
train_df['labels'] = kmeans.fit(train_array).labels_

In [None]:
# Show the text and assigned cluster for the first 20 rows.
train_df[['text', 'labels']].head(20)

### Save Binaries

In [None]:
# Persist the trained Doc2Vec model. Create the target directory first so the
# save does not fail when 'models/' does not exist yet.
os.makedirs('models', exist_ok=True)
model.save('models/CORD-doc2vec')