# CORD-19 subset selection for CoVeffect mining

## Setup

In [None]:
# Requirements (via pip)
# Whoosh==2.7.4
!pip install whoosh==2.7.4

In [None]:
# pandas
!pip install pandas

In [None]:
# Download metadata_clustered.csv from:
!wget https://polimi365-my.sharepoint.com/:x:/g/personal/10489381_polimi_it/ESCUy9vZnf1Fvvt-jxbS4wABEWUYzzlsEzpfKl0Vjj2KDA?download=1

## Load metadata and select cluster

cluster labels: 0, 1, 2, 3, 4

cluster with the biochemistry papers: 1 (called the "blue cluster" in the code below)

In [1]:
import pandas as pd
from whoosh import index, writing
from whoosh.qparser import MultifieldParser, FuzzyTermPlugin, OperatorsPlugin, GroupPlugin, SingleQuotePlugin
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
# Let long text columns (titles, abstracts) show up to 2000 characters per cell
pd.set_option('display.max_colwidth', 2000)

In [2]:
# Load the clustered CORD-19 metadata.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
# NOTE(review): the saved output of this cell looks like the tail of a pandas
# warning pointing at this line (presumably a DtypeWarning) - confirm it is gone.
df = pd.read_csv('./metadata_clustered.csv', low_memory=False)

  df = pd.read_csv('./metadata_clustered.csv')


In [3]:
# Sanity check on the loaded metadata: (rows, columns).
# The run saved with this notebook produced (709906, 25).
df.shape

(709906, 25)

In [4]:
# Keep only the papers whose cluster label is 1
in_cluster_1 = df['labels'] == 1
df_blue_cluster = df[in_cluster_1]

In [5]:
# Size of the selected cluster subset as (rows, columns)
df_blue_cluster.shape

(135712, 25)

## Whoosh setup

In [None]:
# The rest of the notebook assumes that the Whoosh index lives in ./whoosh_idx.
# -p makes this a no-op when the directory already exists.
!mkdir -p ./whoosh_idx

In [6]:
# Analysis chain applied to the title and abstract fields:
# regex tokenizer -> lowercase filter -> stop-word filter.
ANALYZER = RegexTokenizer() | LowercaseFilter() | StopFilter()

# Index layout. cord_uid is the only field declared with stored=True, and it is
# the value read back from search hits later in the notebook.
SCHEMA = Schema(
    cord_uid=ID(stored=True),
    doi=ID,
    title=TEXT(analyzer=ANALYZER),
    abstract=TEXT(analyzer=ANALYZER),
    authors=TEXT,
)

### Create the whoosh index

This only has to be done the first time; otherwise, skip to the next section.

In [17]:
# Start from an empty index in the working directory
ix_working_dir = './whoosh_idx'

# An index left over from a previous run is emptied first by committing
# a writer with the CLEAR merge type.
if index.exists_in(ix_working_dir):
    ix = index.open_dir(ix_working_dir)
    writer = ix.writer()
    writer.commit(mergetype=writing.CLEAR)

# (Re)create the index with the schema defined above in this notebook
ix = index.create_in(ix_working_dir, SCHEMA)

In [18]:
# Index all the papers
# Only the four columns that are written to the index are selected, so missing
# values are replaced with '' just for those columns.
papers_to_index = df_blue_cluster[['cord_uid', 'doi', 'title', 'abstract']].fillna('')

# The writer is committed exactly once, when the `with` block exits without an
# exception; no explicit commit is issued per document, because committing
# closes the writer and any later add_document() call would then fail.
with ix.writer() as writer:
    for paper in papers_to_index.itertuples(index=False):
        writer.add_document(
            cord_uid=paper.cord_uid,
            doi=paper.doi,
            title=paper.title,
            abstract=paper.abstract,
        )

### Load the index, if you have already computed it in the previous section

(You may skip this section if you have just created the index in the cells above.)

In [7]:
# Open the existing index stored in ./whoosh_idx
ix_working_dir = './whoosh_idx'
ix = index.open_dir(ix_working_dir)

## Subset selection

In [8]:
# Query parser that looks for every term in both the title and the abstract
mparser = MultifieldParser(["title", "abstract"], schema=SCHEMA)

# Register the plugins for boolean operators, parenthesised groups and single-quoted text
for plugin_cls in (OperatorsPlugin, GroupPlugin, SingleQuotePlugin):
    mparser.add_plugin(plugin_cls())

In [167]:
# Query strings to run against the index, one search per entry.
# The commented-out entries are earlier drafts kept for reference; only the
# uncommented string is executed.
# NOTE(review): the active query is not quoted, so it is presumably parsed as
# separate terms rather than as the exact phrase - confirm this is intended.
queries = [
#"(SARS-COV-2 OR COVID-19) AND ((variant impact) OR (mutation impact) OR (mutation effect) OR (variant effect)) AND (variant of concern)"
#"(SARS-COV-2 OR COVID-19) AND (mutation impact OR variant impact OR mutation effect OR variant effect) AND variant of concern'"
#"(SARS-COV-2 OR COVID-19) AND (mutation (impact OR variant) (impact OR mutation) (effect OR variant) effect) AND (variant of concern)"
"variant of concern"
]


In [168]:
# Accumulator for the cord_uid of every search hit (filled by the search loop)
cord_uids = []

In [169]:
# Run every query and collect the cord_uid of each matching paper.
# Starting from the ids already in cord_uids keeps the accumulate-then-deduplicate
# behaviour of this cell.
matched_uids = set(cord_uids)

# One searcher is shared by all queries instead of being reopened per query.
with ix.searcher() as searcher:
    for _query in queries:
        query = mparser.parse(str(_query))
        # limit=None returns every matching document; a fixed cap smaller than
        # the number of indexed papers would silently truncate broad queries.
        hits = searcher.search(query, limit=None)
        matched_uids.update(hit['cord_uid'] for hit in hits)

# Sorted so that the order of the ids is the same on every run
# (plain set iteration order is not stable across kernel sessions).
cord_uids = sorted(matched_uids)

In [170]:
# Wrap the collected ids in a one-column frame so they can be merged with the metadata
cord_uids = pd.DataFrame(data=cord_uids, columns=['cord_uid'])

In [171]:
# Attach the metadata to every matched cord_uid (right join: one or more rows per matched id)
matched_papers = pd.merge(df_blue_cluster, cord_uids, how='right', on=['cord_uid'])

# Keep the papers that have a DOI, newest first, and a single row per title
has_doi = matched_papers['doi'].notna()
df_results = (
    matched_papers[has_doi]
    .sort_values(by='publish_time', ascending=False)
    .drop_duplicates(subset=['title'], keep='first')
)
df_results.shape

(830, 25)

In [146]:
# Preview the identifying columns of the first 100 selected papers
preview_columns = ['cord_uid', 'doi', 'title', 'abstract']
df_results[preview_columns].head(100)

Unnamed: 0,cord_uid,doi,title,abstract
138,d0chgdhd,10.1016/j.csbj.2022.04.030,SARS-CoV-2 Pan-variant Inhibitory Peptides Det...,Approved neutralizing antibodies that target t...
183,yppu5hp4,10.1093/infdis/jiac153,Protection of Hamsters Challenged with SARS-Co...,SARS-CoV-2 Variants of Concern (VoCs) negative...
212,o81ae6c2,10.1371/journal.pone.0266844,An in-silico study of the mutation-associated ...,"The emergence of Omicron (B.1.1.529), a new Va..."
93,5tab5282,10.3390/biom12040572,Mutational Effect of Some Major COVID-19 Varia...,COVID-19 is caused by severe acute respiratory...
170,6mg5g3jy,10.3390/cells11081262,SMYD2 Inhibition Downregulates TMPRSS2 and Dec...,The COVID-19 pandemic caused by SARS-CoV-2 has...
...,...,...,...,...
202,ztyin1xk,10.1101/2021.11.17.468942,IFITM dependency of SARS-CoV-2 variants of con...,We have recently shown that a SARS-CoV-2 strai...
112,vmffvpo6,10.1128/aac.00772-21,The Natural Stilbenoid (–)-Hopeaphenol Inhibit...,Antivirals are urgently needed to combat the g...
151,ycsua6po,10.1101/2021.11.14.21266294,Inactivated virus vaccine BBV152/Covaxin elici...,The characteristics of immune memory establish...
140,332ljbrr,10.3390/ijms222212114,"SARS-CoV-2 Variants, RBD Mutations, Binding Af...","Since 2020, the receptor-binding domain (RBD) ..."
