<a href="https://colab.research.google.com/github/DayalStrub/ecir2021tutorial/blob/main/other/2-simplest-thing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !pip install -q python-terrier
!pip install -q git+https://github.com/terrier-org/pyterrier.git

  Building wheel for python-terrier (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize

In [3]:
import pyterrier as pt
pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

  from pandas import Panel


PyTerrier 0.5.0 has loaded Terrier 5.4 (built by craigm on 2021-01-16 14:17)


In [4]:
dataset = pt.datasets.get_dataset('irds:cord19/trec-covid')

In [5]:
next(iter(dataset.get_corpus_iter()))

HBox(children=(FloatProgress(value=0.0, description='cord19/trec-covid documents', max=192509.0, style=Progres…

OrderedDict([('title',
              'Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia'),
             ('doi', '10.1186/1471-2334-1-6'),
             ('date', '2001-07-04'),
             ('abstract',
              'OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common

In [6]:
N = 10_000

pt_index_path = './index_cord19'

# create the index, using the IterDictIndexer indexer 
indexer = pt.index.IterDictIndexer(pt_index_path, overwrite=True) # , blocks=True

# we give the dataset get_corpus_iter() directly to the indexer
# while specifying the fields to index and the metadata to record
index_ref = indexer.index(
    (datum for i, datum in enumerate(dataset.get_corpus_iter()) if i < N),
    fields=('abstract',), # TODO should this be a list?
    # meta={'docno' : 26, 'text' : 2048, 'abstract' : 2048}
    meta=['docno','abstract'],
    )

index = pt.IndexFactory.of(index_ref)

HBox(children=(FloatProgress(value=0.0, description='cord19/trec-covid documents', max=192509.0, style=Progres…



07:37:22.572 [ForkJoinPool-1-worker-3] WARN  o.t.structures.indexing.Indexer - Indexed 2143 empty documents


In [7]:
print(index.getCollectionStatistics().toString())

Number of documents: 10000
Number of terms: 36191
Number of postings: 673758
Number of fields: 1
Number of tokens: 1040094
Field names: [abstract]
Positions:   false



In [8]:
queries = dataset.get_topics(variant='title').head(3)
queries.head()

Unnamed: 0,qid,query
0,1,coronavirus origin
1,2,coronavirus response to weather changes
2,3,coronavirus immunity


In [9]:
M = 1002 # TODO is 1000 the max ever returned? Probably bug

bm25 = pt.BatchRetrieve(index, wmodel="BM25", num_results=M)
rm3 = pt.rewrite.RM3(index)
bm25_text = pt.BatchRetrieve(index, wmodel="BM25", metadata=["docno", "abstract"], num_results=M)

rm3_pipe = bm25 >> rm3 >> bm25_text >> pt.apply.rename({'abstract':'text'})

In [10]:
# TODO Question: in rm3_pipe, is final bm25 just reranking initial bm25 results, or does it re-consider all docs?

In [11]:
df_out = rm3_pipe.transform(queries)
df_out # .loc[df_out["qid"] == "1", :]

Unnamed: 0,qid,docid,docno,text,rank,score,query_0,query
0,1,2257,8y24j34j,Middle East respiratory syndrome coronavirus i...,0,19.049835,coronavirus origin,applypipeline:off protect^0.035425290 anim^0.0...
1,1,4582,2jq626ye,A novel coronavirus (2019-nCoV) originating in...,1,13.717366,coronavirus origin,applypipeline:off protect^0.035425290 anim^0.0...
2,1,1394,vnafx1ng,The phosphoprotein (P) gene of most Paramyxovi...,2,11.114468,coronavirus origin,applypipeline:off protect^0.035425290 anim^0.0...
3,1,1616,hp5x637c,"BACKGROUND: In 2007, a novel bunyavirus was fo...",3,8.417806,coronavirus origin,applypipeline:off protect^0.035425290 anim^0.0...
4,1,4936,41ui4lqc,Emerging viruses represent a continuous threat...,4,8.340423,coronavirus origin,applypipeline:off protect^0.035425290 anim^0.0...
...,...,...,...,...,...,...,...,...
2995,3,4382,r2s0gks8,"BACKGROUND: In endemic areas, pregnant women a...",995,2.851963,coronavirus immunity,applypipeline:off network^0.029360194 step^0.0...
2996,3,673,x5ardo5j,"In all vertebrate animals, CD8(+) cytotoxic T ...",996,2.851936,coronavirus immunity,applypipeline:off network^0.029360194 step^0.0...
2997,3,1567,y1nr8y25,OBJECTIVES: The emergence of the pandemic infl...,997,2.849012,coronavirus immunity,applypipeline:off network^0.029360194 step^0.0...
2998,3,4147,mrfvdpmk,As a response to a diverse array of external s...,998,2.848665,coronavirus immunity,applypipeline:off network^0.029360194 step^0.0...


In [12]:
df_out.loc[:, ["qid", "docid"]].groupby("qid").count()

Unnamed: 0_level_0,docid
qid,Unnamed: 1_level_1
1,1000
2,1000
3,1000


In [13]:
def pr_linear_reranker(
    df,
    *,
    n_top = 10,
    n_bottom = 100, # TODO should actually get 0 score documents? how to do this in pyterrier?
    col_text = 'text'
):
  df_tmp = df # .copy()
  df_relevant = df_tmp.sort_values("rank").reset_index(drop=True).loc[0:n_top, ["docid", col_text]]
  df_relevant["label"] = 1

  df_not_relevant = df_tmp.sort_values("rank", ascending=False).reset_index(drop=True).loc[0:n_bottom, ["docid", col_text]]
  df_not_relevant["label"] = 0

  df_train = pd.concat([df_relevant, df_not_relevant])

  text_transformer = Pipeline([
      ('bow', CountVectorizer()),
      ('tfidf', TfidfTransformer()),
  ])

  preprocessor = ColumnTransformer(
      transformers=[
          # ('num', num_transformer, col_num),
          # ('cat', cat_transformer, col_cat),
          ('text', text_transformer, col_text)
      ]
  )

  crf = Pipeline(steps=[
      ('preprocessor', preprocessor),
      ('classifier', LogisticRegression())
  ])

  X_train = df_train.loc[:, ["docid", col_text]]
  y_train = df_train["label"]

  crf.fit(X_train, y_train);

  preds = crf.predict_proba(df_tmp.loc[:, ["docid", col_text]])

  df_tmp["score"] = preds[:, 1] # True class second?
  df_tmp.sort_values("score", ascending=False, inplace=True)
  df_tmp.reset_index(drop=True, inplace=True)
  df_tmp["rank"] = df_tmp.index
  return df_tmp

In [14]:
def normalise_score(df, norm="max"):
  normalized_data = normalize(df["score"].to_numpy().reshape(1, -1), norm=norm)[0]
  df["score"] = normalized_data
  return df

In [15]:
df_eg = df_out.copy().loc[df_out["qid"] == "2", :].reset_index(drop=True)

pr_linear_reranker(df_eg)

Unnamed: 0,qid,docid,docno,text,rank,score,query_0,query
0,2,2773,r9scxa76,Infectious diseases attributable to unsafe wat...,0,0.254186,coronavirus response to weather changes,applypipeline:off respons^0.150000006 forecast...
1,2,1099,exqza1kg,Global climate change is expected to affect th...,1,0.237984,coronavirus response to weather changes,applypipeline:off respons^0.150000006 forecast...
2,2,5340,js33q9lp,"Humans, animals and plants suffer from similar...",2,0.221248,coronavirus response to weather changes,applypipeline:off respons^0.150000006 forecast...
3,2,3258,qm8kalyt,Bangladesh is one of the worlds most vulnerabl...,3,0.213264,coronavirus response to weather changes,applypipeline:off respons^0.150000006 forecast...
4,2,326,gbdaad4l,BACKGROUND: Streptococcus pneumoniae is a comm...,4,0.202487,coronavirus response to weather changes,applypipeline:off respons^0.150000006 forecast...
...,...,...,...,...,...,...,...,...
995,2,780,vtp4cwvt,Alveolar macrophages (AM) are one of the key c...,995,0.073564,coronavirus response to weather changes,applypipeline:off respons^0.150000006 forecast...
996,2,343,094d0rn6,The innate immune response is essential for co...,996,0.072940,coronavirus response to weather changes,applypipeline:off respons^0.150000006 forecast...
997,2,2529,o4o4bzna,Avian leucosis virus subgroup J (ALV-J) can ca...,997,0.072854,coronavirus response to weather changes,applypipeline:off respons^0.150000006 forecast...
998,2,1544,c1uom5f7,Airway epithelial cells are the first line of ...,998,0.072816,coronavirus response to weather changes,applypipeline:off respons^0.150000006 forecast...


In [17]:
pr_pipe = rm3_pipe >> pt.apply.by_query(pr_linear_reranker) >> pt.apply.by_query(normalise_score)

df_out_2 = pr_pipe.transform(queries)
df_out_2.loc[df_out_2["qid"] == "2", :]

Unnamed: 0,qid,docid,docno,text,score,query_0,query,rank
0,2,2773,r9scxa76,Infectious diseases attributable to unsafe wat...,1.000000,coronavirus response to weather changes,applypipeline:off respons^0.150000006 forecast...,0
1,2,1099,exqza1kg,Global climate change is expected to affect th...,0.936262,coronavirus response to weather changes,applypipeline:off respons^0.150000006 forecast...,1
2,2,5340,js33q9lp,"Humans, animals and plants suffer from similar...",0.870420,coronavirus response to weather changes,applypipeline:off respons^0.150000006 forecast...,2
3,2,3258,qm8kalyt,Bangladesh is one of the worlds most vulnerabl...,0.839010,coronavirus response to weather changes,applypipeline:off respons^0.150000006 forecast...,3
4,2,326,gbdaad4l,BACKGROUND: Streptococcus pneumoniae is a comm...,0.796611,coronavirus response to weather changes,applypipeline:off respons^0.150000006 forecast...,4
...,...,...,...,...,...,...,...,...
995,2,780,vtp4cwvt,Alveolar macrophages (AM) are one of the key c...,0.289412,coronavirus response to weather changes,applypipeline:off respons^0.150000006 forecast...,995
996,2,343,094d0rn6,The innate immune response is essential for co...,0.286954,coronavirus response to weather changes,applypipeline:off respons^0.150000006 forecast...,996
997,2,2529,o4o4bzna,Avian leucosis virus subgroup J (ALV-J) can ca...,0.286617,coronavirus response to weather changes,applypipeline:off respons^0.150000006 forecast...,997
998,2,1544,c1uom5f7,Airway epithelial cells are the first line of ...,0.286468,coronavirus response to weather changes,applypipeline:off respons^0.150000006 forecast...,998


In [18]:
# TODO how to run rm3_pipe only once? CHECK caching

alpha = 0.5

pipe = alpha * rm3_pipe + (1 - alpha) * pr_pipe

In [19]:
# NOTE Experiment is indeed/actually a function
pt.Experiment(
    retr_systems=[bm25, rm3_pipe, pr_pipe, pipe],
    names=["BM25", "BM25-RM3-BM25", "BM25-RM3-BM25-PRlog", "simplest"],
    topics=queries,
    qrels=dataset.get_qrels(), # TODO figure out cord19 qrels
    eval_metrics=["map"]
              )

[INFO] If you have a local copy of https://ir.nist.gov/covidSubmit/data/qrels-covid_d5_j0.5-5.txt, you can symlink it here to avoid downloading it again: /root/.ir_datasets/downloads/8138424a59daea0aba751c8a891e5f54
[INFO] [starting] https://ir.nist.gov/covidSubmit/data/qrels-covid_d5_j0.5-5.txt
[INFO] [finished] https://ir.nist.gov/covidSubmit/data/qrels-covid_d5_j0.5-5.txt: [00:00] [1.14MB] [6.06MB/s]


Unnamed: 0,name,map
0,BM25,0.000389
1,BM25-RM3-BM25,0.000386
2,BM25-RM3-BM25-PRlog,0.000928
3,simplest,0.00039
