# CORD-19 subset selection for CoVeffect mining

## Setup

In [None]:
# Requirements (via pip)
# Whoosh==2.7.4
!pip install whoosh==2.7.4

In [None]:
# pandas
!pip install pandas

In [None]:
# Download metadata_clustered.csv from:
!wget https://polimi365-my.sharepoint.com/:x:/g/personal/10489381_polimi_it/ESCUy9vZnf1Fvvt-jxbS4wABEWUYzzlsEzpfKl0Vjj2KDA?download=1

## Load metadata and select cluster

The `labels` column assigns every paper to one of five clusters: 0, 1, 2, 3, 4.

The cluster containing the biochem papers is cluster 1.

In [2]:
import pandas as pd
from whoosh import index, writing
from whoosh.qparser import MultifieldParser, FuzzyTermPlugin, OperatorsPlugin
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter

In [3]:
# Load the clustered metadata table.
# low_memory=False makes pandas read the file in one pass, so each column gets a single
# dtype instead of per-chunk guesses (which trigger a mixed-types DtypeWarning).
df = pd.read_csv('./metadata_clustered.csv', low_memory=False)

  df = pd.read_csv('./metadata_clustered.csv')


In [6]:
# Sanity check on the loaded table; the recorded run shows (709906, 25).
df.shape

(709906, 25)

In [5]:
# Keep only the papers whose cluster label is 1 (the biochem cluster).
df_blue_cluster = df.loc[df['labels'] == 1]

## Whoosh setup

In [9]:
# The index directory is assumed to be ./whoosh_idx.
# -p creates it if it is missing and does nothing if it already exists.
!mkdir -p ./whoosh_idx

In [10]:
# Text pipeline shared by the title and abstract fields:
# regex tokenizer -> lowercase filter -> stop filter.
ANALYZER = RegexTokenizer() | LowercaseFilter() | StopFilter()

# Index layout. cord_uid is the only field declared with stored=True, so it is the
# value read back from each search hit; title and abstract use ANALYZER.
SCHEMA = Schema(
    cord_uid=ID(stored=True),
    doi=ID,
    title=TEXT(analyzer=ANALYZER),
    abstract=TEXT(analyzer=ANALYZER),
    authors=TEXT,
)

### Create the whoosh index

This only has to be done the first time; otherwise, skip to the next section.

In [17]:
# Initialize an empty index in the working directory.
# index.create_in() already discards any index found in that directory, so there is
# no need to open the old index and clear it first (which would also fail if the old
# index could not be opened).
ix_working_dir = './whoosh_idx'
ix = index.create_in(ix_working_dir, SCHEMA)

In [18]:
# Index all the papers of the selected cluster.
# Only the columns written to the index are kept, and missing values are replaced by
# empty strings before they are handed to Whoosh.
papers = df_blue_cluster[['cord_uid', 'doi', 'title', 'abstract']].fillna('')

# The writer is committed exactly once, by the `with` block, after every document has
# been added. The former `add_document(...) and writer.commit(...)` expression never
# reached the commit because add_document() returns None; had it run, it would have
# closed the writer after the first document.
with ix.writer() as writer:
    for paper in papers.itertuples(index=False):
        writer.add_document(
            cord_uid=paper.cord_uid,
            doi=paper.doi,
            title=paper.title,
            abstract=paper.abstract,
        )

### Load the index, if you have already computed it in the previous section

(You may skip the next cell if you have just created the index in the cells above.)

In [4]:
# Open the index that already exists in ./whoosh_idx.
ix_working_dir = './whoosh_idx'
ix = index.open_dir(ix_working_dir)

## Subset selection

In [19]:
# Parse each query string against both the title and the abstract fields.
mparser = MultifieldParser(["title","abstract"], schema=SCHEMA)
# NOTE(review): the parser may already load OperatorsPlugin by default — confirm that this explicit add is needed.
mparser.add_plugin(OperatorsPlugin())

In [20]:
# Query strings to run against the index; each one is parsed with mparser.
queries = [
    'effect'
]


In [21]:
# Will hold the cord_uid of every paper matched by the queries.
cord_uids = []

In [22]:
# Run every query and pool the cord_uid of each hit.
# - One searcher is shared by all queries instead of being reopened per query.
# - limit=None asks Whoosh for every hit, so a large result set is never silently
#   truncated at a fixed cap.
# - Hits are gathered in a set (deduplication) and then sorted, so the resulting list
#   has the same order on every run. The cell builds its result from scratch, which
#   makes it safe to re-run.
matched_uids = set()
with ix.searcher() as searcher:
    for _query in queries:
        query = mparser.parse(str(_query))
        hits = searcher.search(query, limit=None)
        matched_uids.update(hit['cord_uid'] for hit in hits)
cord_uids = sorted(matched_uids)

In [23]:
# Wrap the ids in a single-column DataFrame so they can be merged with the metadata.
# Note: this rebinds cord_uids from a list to a DataFrame.
cord_uids = pd.DataFrame(cord_uids, columns=['cord_uid'])

In [34]:
# Right join on cord_uid: every id returned by the search is kept and picks up the
# metadata columns of its paper from the selected cluster.
df_results = df_blue_cluster.merge(cord_uids, how='right', on='cord_uid')

In [37]:
# Size of the selected subset; the recorded run shows (16153, 25).
df_results.shape

(16153, 25)