In [None]:
!pip install spacy
!pip install python-terrier
!python -m spacy download en_core_web_sm

In [37]:
import pandas as pd
#to display the full text on the notebook without truncation
pd.set_option('display.max_colwidth', 150)
from adarank import AdaRank
from metrics import NDCGScorer
import spacy
import re
from collections import defaultdict



In [38]:
import pyterrier as pt
if not pt.started():
    pt.init()

In [39]:
queries ={
    1: "GLUCOSE IN BLOOD",
    2: "BILIRUBIN IN PLASMA",
    3: "WHITE BLOOD CELLS COUNT"
    
}

In [40]:

# For the query we allocated to every query and id

queries_df = pd.DataFrame([queries]).T
queries_df.columns = ['query']
queries_df["qid"] = (pd.factorize(queries_df['query'])[0]+1).astype(str)

In [41]:
queries_df

Unnamed: 0,query,qid
1,GLUCOSE IN BLOOD,1
2,BILIRUBIN IN PLASMA,2
3,WHITE BLOOD CELLS COUNT,3


In [42]:
a =pd.read_excel("loinc_dataset-v2.xlsx",sheet_name = 0, header=2)
b =pd.read_excel("loinc_dataset-v2.xlsx",sheet_name = 1, header=2)
c =pd.read_excel("loinc_dataset-v2.xlsx",sheet_name = 2, header=2)


In [43]:
documents = pd.merge(pd.merge(a,b,on="loinc_num"),c, on="loinc_num")

documents = documents[["loinc_num",	"long_common_name",	"component","system", "property"]]
documents.rename(columns={"loinc_num": 'docno' }, inplace=True)
documents.docno = documents.docno.astype(str)

In [44]:
documents

Unnamed: 0,docno,long_common_name,component,system,property
0,1988-5,C reactive protein [Mass/volume] in Serum or Plasma,C reactive protein,Ser/Plas,MCnc
1,1959-6,Bicarbonate [Moles/volume] in Blood,Bicarbonate,Bld,SCnc
2,10331-7,Rh [Type] in Blood,Rh,Bld,Type
3,18998-5,Trimethoprim+Sulfamethoxazole [Susceptibility],Trimethoprim+Sulfamethoxazole,Isolate,Susc
4,1975-2,Bilirubin.total [Mass/volume] in Serum or Plasma,Bilirubin,Ser/Plas,MCnc
...,...,...,...,...,...
62,54439-5,Calcium bilirubinate/Total in Stone,Calcium bilirubinate/Total,Calculus,MFr
63,18878-9,Cefazolin [Susceptibility],Cefazolin,Isolate,Susc
64,18928-2,Gentamicin [Susceptibility],Gentamicin,Isolate,Susc
65,29265-6,Calcium [Moles/volume] corrected for albumin in Serum or Plasma,Calcium^^corrected for albumin,Ser/Plas,SCnc


In [45]:
#qrel is for relevance judgement

qrel_q1 = a[["loinc_num", "final_label"]].copy()
qrel_q1['qid'] = "1"
qrel_q2 = b[["loinc_num", "final_label"]].copy()
qrel_q2['qid'] = "2"
qrel_q3 = c[["loinc_num", "final_label"]].copy()
qrel_q3['qid'] = "3"

qrels = pd.concat([qrel_q1, qrel_q2, qrel_q3], axis=0, ignore_index=True)[["qid", "loinc_num", "final_label"]]
qrels.rename(columns={"loinc_num": 'docno' }, inplace=True)

In [46]:
qrels

Unnamed: 0,qid,docno,final_label
0,1,1988-5,0
1,1,1959-6,0
2,1,10331-7,0
3,1,18998-5,0
4,1,1975-2,0
...,...,...,...
196,3,26484-6,1
197,3,1250-0,0
198,3,18864-9,0
199,3,1742-6,0


In [47]:
def data_cleaning(df, column, output_col):

    nlp = spacy.load('en_core_web_sm',disable=['ner','parser'])
    nlp.max_length=5000000
    #lowercase
    df[column] = df[column].apply(str.lower)
    # Stopwords removal & Lemmatizing tokens using SpaCy
    df[output_col]= df[column].apply(lambda x: ' '.join([token.lemma_ for token in list(nlp(x)) if (not token.is_stop and not token.is_punct)]))
    df[output_col]= df[output_col].apply(lambda x: re.sub(r"[\.\,\#_\|\:\?\?\/\=]", ' ',x))
    df[output_col]= df[output_col].apply(lambda x: re.sub(' +',' ',x))
  
    return df

In [48]:
cleaned_doc_df = data_cleaning(documents, column='long_common_name', output_col='lemmatized')

In [49]:
index_dir = './INDEX'
indexer = pt.DFIndexer(index_dir, overwrite=True, blocks=True)
indexer.setProperty("stemmer", "")
index = indexer.index(cleaned_doc_df['lemmatized'], cleaned_doc_df["docno"])


In [50]:
cleaned_doc_df.shape

(67, 6)

In [51]:
queries_df = data_cleaning(queries_df, column='query', output_col='query')

In [52]:
def feature_generator(algorithm:str, queries_df):
    ranker = pt.BatchRetrieve(index, wmodel=algorithm, properties={"termpipelines" : ""},num_results=400)
    features = ranker(queries_df)[['qid', 'query', 'docno', 'score']]
    features.rename(columns={'score':f'{algorithm}_score'}, inplace=True)
    return features


In [53]:
BM25_feature = feature_generator("BM25", queries_df)
PL2_feature = feature_generator("PL2", queries_df)
DPH_feature = feature_generator("DPH", queries_df)
DirichletLM_feature = feature_generator("DirichletLM", queries_df)
TF_IDF_feature = feature_generator("TF_IDF", queries_df)

In [54]:
features = BM25_feature.merge(PL2_feature)
features= features.merge(DPH_feature)
features= features.merge(DirichletLM_feature)
features= features.merge(TF_IDF_feature)

In [55]:
features['qid'] = features['qid'].apply(int)
features['qid'] = features['qid'].astype(str)

In [56]:
_dict = defaultdict(list)
for doc in documents['docno']:
    for _id, query in enumerate(features["query"].unique()):
        if doc not in list(features[features["query"] == query]['docno']):
            _dict['qid'].append(str(_id+1))
            _dict['docno'].append(doc)
            for score in [col for col in features if col.endswith('score')]:
                _dict[score].append(0)
            _dict['query'].append(query)

temp_df = pd.DataFrame(_dict)
features = pd.concat([features, temp_df])

In [57]:
qrels

Unnamed: 0,qid,docno,final_label
0,1,1988-5,0
1,1,1959-6,0
2,1,10331-7,0
3,1,18998-5,0
4,1,1975-2,0
...,...,...,...
196,3,26484-6,1
197,3,1250-0,0
198,3,18864-9,0
199,3,1742-6,0


In [58]:
SEED=0
from sklearn.model_selection import train_test_split
datset = features.copy()
datset = datset.merge( qrels)
train, test = train_test_split(datset, test_size=0.4, random_state=SEED)
train.sort_values(by='qid', inplace=True)
qid_train = train['qid'].apply(int).to_numpy()
y_train = train['final_label'].to_numpy()
X_train = train[[col for col in train if col.endswith('score')]].to_numpy()
test.sort_values(by='qid', inplace=True)
qid_test = test['qid'].apply(int).to_numpy()
y_test = test['final_label'].to_numpy()
X_test = test[[col for col in test if col.endswith('score')]].to_numpy()

#dump_svmlight_file(X_train,y_train,'dataset.dat',query_id=qid_train)

In [59]:
from adarank import AdaRank
from metrics import NDCGScorer

scorer = NDCGScorer()
model = AdaRank(max_iter=200, estop=50, scorer=scorer)
model.fit(X_train, y_train, qid_train)
pred = model.predict(X_test, qid_test)
print (scorer(y_test, pred, qid_test).mean())

0.8333333333333334


In [60]:
results = pd.DataFrame({'qid':qid_test,'docno': test.docno, 'label':y_test, 'predicted':pred})

In [61]:
#  final retrieved documents by AdaRank
qid = datset['qid'].apply(int).to_numpy()
y = datset['final_label'].to_numpy()
X = datset[[col for col in train if col.endswith('score')]].to_numpy()
pred = model.predict(X, qid)
results = pd.DataFrame({'qid':qid,'docno': datset.docno, 'final_label':y, 'predicted':pred})


### Results

#### BaseLine results (BM25)

In [62]:
# BM25 reults for query 1
BM25 = pt.BatchRetrieve(index, wmodel="BM25", properties={"termpipelines" : ""},num_results=400)
quer_1 =queries_df.query("qid=='1'")['query'][1]
print(f"For the query  {quer_1}: the top 5 retrived documents are")
res1_bm25 = (BM25 %5)(quer_1)
documents.query(f'docno in {res1_bm25.docno.to_list()}')

For the query  glucose blood: the top 5 retrived documents are


  res = self.transformer.transform(topics_and_res)


Unnamed: 0,docno,long_common_name,component,system,property,lemmatized
12,15076-3,glucose [moles/volume] in urine,Glucose,Urine,SCnc,glucose mole volume urine
18,14749-6,glucose [moles/volume] in serum or plasma,Glucose,Ser/Plas,SCnc,glucose mole volume serum plasma
19,14747-0,glucose [moles/volume] in pleural fluid,Glucose,Plr fld,SCnc,glucose mole volume pleural fluid
23,74774-1,"glucose [mass/volume] in serum, plasma or blood",Glucose,Ser/Plas/Bld,MCnc,glucose mass volume serum plasma blood
28,14764-5,glucose [moles/volume] in serum or plasma --3 hours post 100 g glucose po,Glucose^3H post 100 g glucose PO,Ser/Plas,SCnc,glucose mole volume serum plasma --3 hour post 100 g glucose po


In [63]:
# BM25 reults for query 3
quer_2 =queries_df.query("qid=='2'")['query'][2]
print(f"For the query  {quer_2}: the top 5 retrived documents are")
res2_bm25 = (BM25 %5)(quer_2)

documents.query(f'docno in {res2_bm25.docno.to_list()}')

For the query  bilirubin plasma: the top 5 retrived documents are


  res = self.transformer.transform(topics_and_res)


Unnamed: 0,docno,long_common_name,component,system,property,lemmatized
4,1975-2,bilirubin.total [mass/volume] in serum or plasma,Bilirubin,Ser/Plas,MCnc,bilirubin total mass volume serum plasma
21,1968-7,bilirubin.direct [mass/volume] in serum or plasma,Bilirubin.glucuronidated+Bilirubin.albumin bound,Ser/Plas,MCnc,bilirubin direct mass volume serum plasma
26,14423-8,bilirubin.total [mass/volume] in synovial fluid,Bilirubin,Synv fld,MCnc,bilirubin total mass volume synovial fluid
32,1971-1,bilirubin.indirect [mass/volume] in serum or plasma,Bilirubin.non-glucuronidated,Ser/Plas,MCnc,bilirubin indirect mass volume serum plasma
44,33870-7,bilirubin.total [presence] in unspecified specimen,Bilirubin,XXX,PrThr,bilirubin total presence unspecified speciman


In [64]:
# BM25 reults for query 3
quer_3 =queries_df.query("qid=='3'")['query'][3]
print(f"For the query  {quer_3}: the top 5 retrived documents are")
res3_bm25 = (BM25 %5)(quer_3)
documents.query(f'docno in {res3_bm25.docno.to_list()}')

For the query  white blood cell count: the top 5 retrived documents are


  res = self.transformer.transform(topics_and_res)


Unnamed: 0,docno,long_common_name,component,system,property,lemmatized
2,10331-7,rh [type] in blood,Rh,Bld,Type,rh type blood
14,26474-7,lymphocytes [#/volume] in blood,Lymphocytes,Bld,NCnc,lymphocyte volume blood
22,26464-8,leukocytes [#/volume] in blood,Leukocytes,Bld,NCnc,leukocyte volume blood
33,26484-6,monocytes [#/volume] in blood,Monocytes,Bld,NCnc,monocyte volume blood
47,14578-9,abo group [type] in blood from blood product unit,ABO group,Bld^BPU,Type,abo group type blood blood product unit


#### AdaRank results

In [65]:
# AdaRanak reults for query 1
quer_1 =queries_df.query("qid=='1'")['query'][1]
print(f"For the query  {quer_1}: the top 5 retrived documents are")
res1 = results.query('qid==1').sort_values(by='predicted', ascending=False)[:5]
print(res1)
documents.query(f'docno in {res1.docno.to_list()}')

For the query  glucose blood: the top 5 retrived documents are
   qid    docno  final_label  predicted
0    1  74774-1            1   8.032789
1    1  15076-3            0   6.563036
2    1  14749-6            1   6.035368
3    1  14747-0            0   6.035368
4    1  14764-5            1   5.967340


Unnamed: 0,docno,long_common_name,component,system,property,lemmatized
12,15076-3,glucose [moles/volume] in urine,Glucose,Urine,SCnc,glucose mole volume urine
18,14749-6,glucose [moles/volume] in serum or plasma,Glucose,Ser/Plas,SCnc,glucose mole volume serum plasma
19,14747-0,glucose [moles/volume] in pleural fluid,Glucose,Plr fld,SCnc,glucose mole volume pleural fluid
23,74774-1,"glucose [mass/volume] in serum, plasma or blood",Glucose,Ser/Plas/Bld,MCnc,glucose mass volume serum plasma blood
28,14764-5,glucose [moles/volume] in serum or plasma --3 hours post 100 g glucose po,Glucose^3H post 100 g glucose PO,Ser/Plas,SCnc,glucose mole volume serum plasma --3 hour post 100 g glucose po


In [66]:

# AdaRanak reults for query 2
quer_2 =queries_df.query("qid=='2'")['query'][2]
print(f"For the query  {quer_2}: the top 5 retrived documents are")
res2 = results.query('qid==2').sort_values(by='predicted', ascending=False)[:5]
print(res2)
documents.query(f'docno in {res2.docno.to_list()}')

For the query  bilirubin plasma: the top 5 retrived documents are
    qid    docno  final_label  predicted
23    2  33870-7            1   6.035368
25    2   1968-7            1   5.952458
26    2   1971-1            1   5.952458
24    2   1975-2            1   5.952458
27    2  14423-8            0   5.586235


Unnamed: 0,docno,long_common_name,component,system,property,lemmatized
4,1975-2,bilirubin.total [mass/volume] in serum or plasma,Bilirubin,Ser/Plas,MCnc,bilirubin total mass volume serum plasma
21,1968-7,bilirubin.direct [mass/volume] in serum or plasma,Bilirubin.glucuronidated+Bilirubin.albumin bound,Ser/Plas,MCnc,bilirubin direct mass volume serum plasma
26,14423-8,bilirubin.total [mass/volume] in synovial fluid,Bilirubin,Synv fld,MCnc,bilirubin total mass volume synovial fluid
32,1971-1,bilirubin.indirect [mass/volume] in serum or plasma,Bilirubin.non-glucuronidated,Ser/Plas,MCnc,bilirubin indirect mass volume serum plasma
44,33870-7,bilirubin.total [presence] in unspecified specimen,Bilirubin,XXX,PrThr,bilirubin total presence unspecified speciman


In [69]:
# AdaRanak reults for query 3
quer_3 =queries_df.query("qid=='3'")['query'][3]
print(f"For the query  {quer_3}: the top 5 retrived documents are")
res3 = results.query('qid==3').sort_values(by='predicted', ascending=False)[:5]
print(res3)
documents.query(f'docno in {res3.docno.to_list()}')

For the query  white blood cell count: the top 5 retrived documents are
    qid    docno  final_label  predicted
56    3  14578-9            0   3.261670
58    3  26474-7            1   3.149735
59    3  26464-8            1   3.149735
60    3  26484-6            1   3.149735
61    3    933-2            0   3.149735


Unnamed: 0,docno,long_common_name,component,system,property,lemmatized
14,26474-7,lymphocytes [#/volume] in blood,Lymphocytes,Bld,NCnc,lymphocyte volume blood
22,26464-8,leukocytes [#/volume] in blood,Leukocytes,Bld,NCnc,leukocyte volume blood
33,26484-6,monocytes [#/volume] in blood,Monocytes,Bld,NCnc,monocyte volume blood
47,14578-9,abo group [type] in blood from blood product unit,ABO group,Bld^BPU,Type,abo group type blood blood product unit
50,933-2,blood product type,Blood product type,^BPU,Type,blood product type
