Import libraries

In [1]:
import os
import re
import ast
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.metrics.pairwise import cosine_similarity

from numba import jit # parallel processing

Define i/o paths

In [2]:
input_data_path = '/kaggle/input/'
input_data_file = 'filt_merged_text_vector_df_200430.csv'

Read and process data

In [3]:
input_data = pd.read_csv(input_data_path+input_data_file)

In [4]:
#Keep only sentences containing at least 3 words other than those defined below
#This also removes any sentences that do not contain any words at all
rep = {"text": "", "cite_spans": "", "ref_spans": "", "section": "", "Abstract": "",\
       "bioRxiv preprint": "", "medRxiv preprint": "", "doi:": ""}
rep = dict((re.escape(k), v) for k, v in rep.items())
pattern = re.compile("|".join(rep.keys()))
sentences_temp = [pattern.sub(lambda m: rep[re.escape(m.group(0))], s) for s in input_data.sentence]
pattern = re.compile(".*[A-Za-z].*")
sentences_to_keep = [(bool(re.search(pattern,s))) & (len(s.split(' '))>2) for s in sentences_temp]
input_processed = input_data.loc[sentences_to_keep,:]
sentences_to_drop = [not i for i in sentences_to_keep]
input_excluded = input_data.loc[sentences_to_drop,:]

In [5]:
#Convert w2vVector column from string type to  list
input_processed.w2vVector = [re.sub(',+', ',', ','.join(w.replace('\n','').split(' '))) for w in input_processed.w2vVector]
input_processed.w2vVector = [re.sub('\[,', '', w) for w in input_processed.w2vVector]
input_processed.w2vVector = [re.sub(',\]', '', w) for w in input_processed.w2vVector]
input_processed.w2vVector = [re.sub('\[', '', w) for w in input_processed.w2vVector]
input_processed.w2vVector = [re.sub('\]', '', w) for w in input_processed.w2vVector]
input_processed.w2vVector = input_processed.w2vVector.apply(lambda s: list(ast.literal_eval(s)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [6]:
input_processed.to_csv('cord_titles_abstracts_conclusions.csv')
input_excluded.to_csv('cord_titles_abstracts_conclusions_excluded.csv')

In [7]:
input_processed

Unnamed: 0,cord_uid,sentence_id,section,sentence,w2vVector
0,cc29v5qw,cc29v5qw029827,title,Patients with mental health disorders in the C...,"[-0.14722456, 0.01686829, 0.16839595, -0.04342..."
2,oribgtyl,oribgtyl031852,title,Prisons and custodial settings are part of a c...,"[-0.0138100795, 0.00680788979, 0.102320798, 0...."
4,6r27qzap,6r27qzap031865,title,The resilience of the Spanish health system ag...,"[0.00911541, -0.02315879, 0.11453069, -0.05539..."
6,65b267ic,65b267ic032159,title,Six weeks into the 2019 coronavirus disease (C...,"[-0.02097336, 0.02406202, 0.06325101, -0.14176..."
8,cja8i0hw,cja8i0hw032165,title,The Novel Coronavirus: A Bird's Eye View,"[0.107559, -0.14134592, -0.0632042, -0.0697434..."
...,...,...,...,...,...
73032,647zcjgu,647zcjgu1219583,Discussion,"Among them are captopril, perindopril, ramipri...","[0.11334185, 0.11791535, 0.11523059, 0.0925779..."
73033,647zcjgu,647zcjgu1219584,Discussion,"Although these drugs primarily target ACE, a h...","[0.05551118, -0.01351563, 0.18281737, -0.04538..."
73034,647zcjgu,647zcjgu1219585,Discussion,It should be noted that ACE inhibitors bind to...,"[0.04974999, -0.06304489, 0.10167421, 0.020741..."
73035,647zcjgu,647zcjgu1219586,Discussion,"Nonetheless, these enzymatic inhibitors may in...","[0.00876493, -0.06595846, 0.09507835, -0.01413..."


In [8]:
title_data = input_processed.loc[input_processed.section=='title',:]
abstract_data = input_processed.loc[input_processed.section=='abstract',:]
conclusion_data = input_processed.loc[(input_processed.section!='title') & (input_processed.section!='abstract'),:]

In [9]:
print('Number of papers:', input_data.cord_uid.nunique())
print('Number of papers with title:', title_data.cord_uid.nunique())
print('Number of papers with abstract:', abstract_data.cord_uid.nunique())
print('Number of papers with conclusion:', conclusion_data.cord_uid.nunique())

Number of papers: 4677
Number of papers with title: 4663
Number of papers with abstract: 2992
Number of papers with conclusion: 1662


In [10]:
print('Number of unique sentences under titles:', title_data.sentence.nunique())
print('Number of unique sentence ids under titles:', title_data.sentence_id.nunique())

Number of unique sentences under titles: 4819
Number of unique sentence ids under titles: 4903


Calculate cosine similarity between titles

In [11]:
#Average w2v vectors of all sentences falling under a single cord_uid
title_data_final = pd.DataFrame(columns = ['cord_uid','sentence','w2vVector'])
for cord_uid in title_data.cord_uid.unique():
    title = " ".join(title_data.loc[title_data.cord_uid==cord_uid,'sentence'])
    w2vVector = np.mean(list(title_data.loc[title_data.cord_uid==cord_uid,'w2vVector']), axis=0)
    title_data_final = title_data_final.append({'cord_uid':cord_uid,\
                                                'sentence': title,\
                                                'w2vVector': w2vVector},\
                                               ignore_index=True)

In [12]:
len(title_data_final)

4663

In [36]:
pattern = re.compile(".*hydroxychloroquine.*")
sentences_to_keep = [(bool(re.search(pattern,s.lower()))) for s in title_data_final.sentence]
drug_title_data = title_data_final.loc[sentences_to_keep,:].reset_index(drop=True)
len(drug_title_data)

31

In [37]:
title_similarity = pd.DataFrame(columns=['paper1_cord_uid','paper2_cord_uid','title1','title2','similarity_score'])
jit(nopython=True, parallel=True)
for i,paper1 in enumerate(drug_title_data.sentence):
    for j,paper2 in enumerate(drug_title_data.sentence):
        if i!=j:
            cos_sim = cosine_similarity(drug_title_data.w2vVector[i].reshape(1,-1),drug_title_data.w2vVector[j].reshape(1,-1))[0][0]
            title_similarity = title_similarity.append({'paper1_cord_uid':drug_title_data.cord_uid[i],\
                                                        'paper2_cord_uid':drug_title_data.cord_uid[j],\
                                                        'title1':paper1,\
                                                        'title2':paper2,\
                                                        'similarity_score':cos_sim},\
                                               ignore_index=True)

In [26]:
# title_similarity = pd.DataFrame(columns=['paper1_cord_uid','paper2_cord_uid','title1','title2','similarity_score'])
# jit(nopython=True, parallel=True)
# for i,paper1 in enumerate(title_data_final.sentence):
#     for j,paper2 in enumerate(title_data_final.sentence):
#         if i!=j:
#             cos_sim = cosine_similarity(title_data_final.w2vVector[i].reshape(1,-1),title_data_final.w2vVector[j].reshape(1,-1))[0][0]
#             title_similarity = title_similarity.append({'paper1_cord_uid':title_data_final.cord_uid[i],\
#                                                         'paper2_cord_uid':title_data_final.cord_uid[j],\
#                                                         'title1':paper1,\
#                                                         'title2':paper2,\
#                                                         'similarity_score':cos_sim},\
#                                                ignore_index=True)

KeyboardInterrupt: 

In [15]:
title_similarity.to_csv('title_similarity_sample.csv')