# Setup

## Install & Import

In [None]:
# Install gdown, the command-line tool used further down to download the data files.
!conda install -y gdown

In [2]:
# Third-party packages used by this notebook: PyTerrier (retrieval), pyterrier_t5,
# sentence-transformers (cross-encoder), gensim (word embeddings), rake-nltk (keywords).
# NOTE(review): none of these installs is version-pinned, so results may change
# as new releases appear — consider pinning.
!pip install python-terrier -q
!pip install --upgrade git+https://github.com/terrierteam/pyterrier_t5.git -q
!pip install -U sentence-transformers -q
!pip install --upgrade gensim
!pip install rake-nltk

In [3]:
# Install POT (Python Optimal Transport) at a pinned version and check that it imports.
# NOTE(review): `emd2` is not referenced directly anywhere else in the visible notebook;
# presumably POT is needed by gensim's `wmdistance` (used in f_wmd) — confirm.
!pip install POT==0.4.0
from ot import emd2

In [None]:
# Install a JDK and point JAVA_HOME at it before PyTerrier is initialised.
# NOTE(review): the path below assumes a Debian/Ubuntu amd64 layout.
!sudo apt-get install -y openjdk-11-jdk
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

In [None]:
from IPython.display import display
from xml.dom.minidom import parse, parseString
from pyterrier.measures import *

from sentence_transformers import CrossEncoder

from gensim.models import Word2Vec, FastText

from rake_nltk import Rake

from scipy import stats
from scipy.spatial import distance
from scipy.spatial.distance import cosine

from sklearn import preprocessing
from sklearn.model_selection import KFold

import xml.etree.ElementTree as et
import pickle
import random
import pyterrier as pt
import pandas as pd
import numpy as np
import xgboost as xgb
import torch
import json
import os
import re
import math
import nltk
# Download the NLTK "punkt" resource.
# NOTE(review): presumably required by rake-nltk's tokenisation — confirm.
nltk.download('punkt')

# Start PyTerrier only once per kernel, so the cell can be re-run safely.
if not pt.started():
    pt.init(version='snapshot')

In [6]:
# Fix the seeds of Python's `random`, NumPy and PyTorch for reproducibility.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7d6d7a1dafd0>

In [None]:
# Download the dataset files by their Google Drive file id (the trailing comment names each file).
# NOTE(review): the comments say "23-03-2024" while the loading cell reads "*_23-03-24.pickle" —
# confirm the actual downloaded file names.
!gdown 1qKm6yxQ2KzSiGkNaJSYgxV4nC_MGPUka # nostops_data_23-03-2024
!gdown 1cfgOF6kP8brxI_dtMTwhMEBGwgWBHjjV # nostops_queries_23-03-2024
!gdown 1PrLtsWEKk5HVd0gqPet93T8XLoIx0S_o # nostops_qrels_23-03-2024
!gdown 1eJxAmV1o4bT6VdE1sqmaVRLTuK1zn-Qb # stopwords.csv

In [None]:
# Download the whole Drive folder; the pickles under EMBEDDING/ are loaded further down.
!gdown --folder https://drive.google.com/drive/folders/1QSRWJuEdODVFPsm_GxdLxQPGkaVfc-N1 # EMBEDDING

In [9]:
# Stopword list: first column of the header-less CSV, as a plain Python list (used by extract_keywords).
stops = pd.read_csv('stopwords.csv', header=None)[0].values.tolist()

In [10]:
# Load the document collection, the queries and the relevance judgments.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
data = pd.read_pickle("nostops_data_23-03-24.pickle")
queries = pd.read_pickle("nostops_queries_23-03-24.pickle")
qrels = pd.read_pickle("nostops_qrels_23-03-24.pickle")

In [11]:
# Paths of the precomputed embeddings, one query-side and one document-side file per encoder.
t5_query = 'EMBEDDING/t5_query_26-03-24.pickle'
t5_doc = 'EMBEDDING/t5_data_26-03-24.pickle'
pminilm_query = 'EMBEDDING/pminilm_query_26-03-24.pickle'
pminilm_doc = 'EMBEDDING/pminilm_data_26-03-24.pickle'
bert_query = 'EMBEDDING/bert_query_26-03-24.pickle'
bert_doc = 'EMBEDDING/bert_data_26-03-24.pickle'


def _load_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    # NOTE(review): pickle.load can execute arbitrary code; only use trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# The loaded objects are later indexed by qid (query side) and docno (document side) in combine_f.
t5_query_embeddings = _load_pickle(t5_query)
t5_doc_embeddings = _load_pickle(t5_doc)
pminilm_query_embeddings = _load_pickle(pminilm_query)
pminilm_doc_embeddings = _load_pickle(pminilm_doc)
bert_query_embeddings = _load_pickle(bert_query)
bert_doc_embeddings = _load_pickle(bert_doc)

In [12]:
# Preview the first rows of the document collection.
data.head()

Unnamed: 0,docno,keluhan,keluhan_preprocessed
0,DS-1,Mengapa Keringat Badan Sangat Berlebihan?. sel...,keringat badan selamat malam dokter fathurrosi...
1,DS-2,Mengapa Lengan Atas Sakit Walaupun Tidak Melak...,lengan sakit aktivitas siang dokter 4 lengan s...
2,DS-3,Berapa Batas Usia Kandungan Untuk Melakukan US...,batas usia kandungan usg batas usia kandungan ...
3,DS-4,Apakah Katarak Pada Bayi Harus Disembuhkan Den...,katarak bayi disembuhkan operasi selamat pagi ...
4,DS-5,Mengapa Badan Panas dan Ngilu Selama Berhari-h...,badan panas ngilu berhari nama ferini intan lu...


In [13]:
# After this cell "keluhan" holds the preprocessed text and "keluhan_raw" the original text.
# The guard keeps the cell idempotent: re-running an unguarded rename would map the
# already-renamed "keluhan" column onto "keluhan_raw" and leave two columns with that name.
if "keluhan_preprocessed" in data.columns:
    data = data.rename(columns = {"keluhan": "keluhan_raw", "keluhan_preprocessed": "keluhan"})
data.head()

Unnamed: 0,docno,keluhan_raw,keluhan
0,DS-1,Mengapa Keringat Badan Sangat Berlebihan?. sel...,keringat badan selamat malam dokter fathurrosi...
1,DS-2,Mengapa Lengan Atas Sakit Walaupun Tidak Melak...,lengan sakit aktivitas siang dokter 4 lengan s...
2,DS-3,Berapa Batas Usia Kandungan Untuk Melakukan US...,batas usia kandungan usg batas usia kandungan ...
3,DS-4,Apakah Katarak Pada Bayi Harus Disembuhkan Den...,katarak bayi disembuhkan operasi selamat pagi ...
4,DS-5,Mengapa Badan Panas dan Ngilu Selama Berhari-h...,badan panas ngilu berhari nama ferini intan lu...


In [14]:
# Preview the first rows of the query set.
queries.head()

Unnamed: 0,qid,query,query_preprocessed
0,Q1,"Cara mengatasi pusing, lemas dan hidung berair...",mengatasi pusing lemas hidung berair malam dok...
174,Q2,Telinga berdengung sebelah kiri. Selamat pagi ...,telinga berdengung sebelah kiri selamat pagi d...
308,Q3,Benjolan di sekitar kelamin. Sore dok..Maaf do...,benjolan kelamin sore dok maaf dok 4 mengalami...
472,Q4,Makanan untuk penderita penyakit lambung dan l...,makanan penderita penyakit lambung liver dok m...
624,Q5,Kebiasaan melamun atau berkhayal secara berleb...,kebiasaan melamun berkhayal dok pengidap malad...


In [15]:
# After this cell "query" holds the preprocessed text and "query_raw" the original text.
# The guard keeps the cell idempotent: re-running an unguarded rename would map the
# already-renamed "query" column onto "query_raw" and leave two columns with that name.
if "query_preprocessed" in queries.columns:
    queries = queries.rename(columns = {"query": "query_raw", "query_preprocessed": "query"})
queries.head()

Unnamed: 0,qid,query_raw,query
0,Q1,"Cara mengatasi pusing, lemas dan hidung berair...",mengatasi pusing lemas hidung berair malam dok...
174,Q2,Telinga berdengung sebelah kiri. Selamat pagi ...,telinga berdengung sebelah kiri selamat pagi d...
308,Q3,Benjolan di sekitar kelamin. Sore dok..Maaf do...,benjolan kelamin sore dok maaf dok 4 mengalami...
472,Q4,Makanan untuk penderita penyakit lambung dan l...,makanan penderita penyakit lambung liver dok m...
624,Q5,Kebiasaan melamun atau berkhayal secara berleb...,kebiasaan melamun berkhayal dok pengidap malad...


## Index

In [None]:
%%time
# input for PyTerrier indexer must be a dataframe containing "docno" and "text"

collection = data.rename(columns = {"keluhan_raw": "text"})

!rm -rf ./medical_ir_index
pd_indexer = pt.DFIndexer("./medical_ir_index", \
                          type = pt.index.IndexingType(1), \
                          tokeniser = "UTFTokeniser", \
                          stemmer = None, \
                          stopwords = None, \
                          blocks = True)
index_ref = pd_indexer.index(collection["keluhan"], collection)

collection.info()

In [17]:
# Open the index that was just built and print its collection statistics.
index_fact = pt.IndexFactory.of(index_ref)
print(index_fact.getCollectionStatistics().toString())

Number of documents: 86723
Number of terms: 74200
Number of postings: 1924462
Number of fields: 0
Number of tokens: 2575525
Field names: []
Positions:   true



# Re-rank

In [18]:
# Number of top BM25 results kept per query before re-ranking (applied as `bm25 % CUT_OFF`).
CUT_OFF = 15

In [19]:
# First-stage retrieval model: BM25 over the index built above.
bm25 = pt.BatchRetrieve(index_ref, wmodel="BM25")

In [None]:
# Cross-encoder used as an extra re-ranking feature.
# NOTE(review): the device is hard-coded to the first CUDA GPU.
crossmodel = CrossEncoder('cross-encoder/quora-roberta-base', max_length=512, device='cuda:0')


def crossencoder_apply(df : pd.DataFrame):
    """
    Score every row of ``df`` with the cross-encoder.

    Each row contributes one (query_raw, text) pair, i.e. the raw query and the
    raw document text; the model's predictions are returned in row order.
    """
    pairs = list(zip(df['query_raw'].values, df['text'].values))
    return crossmodel.predict(pairs)


# Wrap the scorer as a PyTerrier transformer that is applied in batches of 128 rows.
cross_encT = pt.apply.doc_score(crossencoder_apply, batch_size=128)

In [21]:
# A feature is a function of a (document, query) pair,
# i.e. feature(document, query)
def avg_vector_embedding(model_embedding, tokenized_data):
    """
    Represent a tokenized text as the average of its word vectors.

    Args:
        model_embedding: trained embedding model exposing ``.wv`` (membership
            test, lookup by word, and ``vector_size``).
        tokenized_data: iterable of tokens.

    Returns:
        A float array of shape (1, vector_size) holding the mean vector of all
        tokens for which ``word in model_embedding.wv`` is true, or all zeros
        when no token qualifies.
    """
    # Take the dimensionality from the model itself instead of the notebook-level
    # VECTOR_SIZE constant, which is only defined in a later cell.
    vector_acc = np.zeros(model_embedding.wv.vector_size)
    count = 0
    for word in tokenized_data:
        if word in model_embedding.wv:
            vector_acc += model_embedding.wv[word]
            count += 1
    if count > 0:
        vector_acc /= count
    # Keep the original (1, vector_size) shape; callers flatten it.
    return np.array([vector_acc])

def f_len_diff(text, query):
    """Absolute difference between the whitespace-token counts of document and query."""
    n_text_tokens = len(text.split())
    n_query_tokens = len(query.split())
    return abs(n_text_tokens - n_query_tokens)

def f_jaccard(text, query):
    """
    Jaccard similarity between the element sets of ``text`` and ``query``.

    Both arguments are turned into sets with ``set(...)``, so pass token lists;
    a raw string would be compared character by character.

    Returns:
        |intersection| / |union|, or 0.0 when both inputs are empty (the ratio
        is undefined there and would otherwise raise ZeroDivisionError).
    """
    word_set_text = set(text)
    word_set_query = set(query)
    union = word_set_text | word_set_query
    if not union:
        return 0.0
    return len(word_set_text & word_set_query) / len(union)

def f_canberra(text_embedding, query_embedding):
    """Canberra distance between the document and query vectors (via scipy.spatial.distance)."""
    canberra_dist = distance.canberra(text_embedding, query_embedding)
    return canberra_dist

def f_cosine(text_embedding, query_embedding):
    """
    Cosine distance between the document and query vectors.

    This is SciPy's ``cosine``, i.e. 1 - cosine similarity: 0 means the vectors
    point in the same direction, larger values mean less similar.
    """
    cosine_dist = cosine(text_embedding, query_embedding)
    return cosine_dist

def f_wmd(model, text, query):
    """
    Word Mover's Distance between document and query.

    Only whitespace tokens present in the model's vocabulary are kept.

    Args:
        model: embedding model exposing ``.wv.key_to_index`` and ``.wv.wmdistance``.
        text: document string.
        query: query string.

    Returns:
        ``model.wv.wmdistance`` of the two filtered token lists, or the fallback
        value 1.0 when either side has no in-vocabulary token.
    """
    DISTANT_VALUE = 1.0
    # key_to_index is a dict, so each membership test is O(1); the previous
    # lookup in the index_to_key list scanned the whole vocabulary per token.
    vocab = model.wv.key_to_index
    word_text = [token for token in text.split() if token in vocab]
    word_query = [token for token in query.split() if token in vocab]
    if not word_text or not word_query:
        return DISTANT_VALUE
    return model.wv.wmdistance(word_text, word_query)

def f_common_distinct_words(text, query):
    """Number of distinct whitespace tokens that occur in both document and query."""
    shared_words = set(text.split()).intersection(query.split())
    return len(shared_words)

def extract_keywords(sequence):
    """
    Extract key phrases with RAKE.

    ``sequence`` is an iterable of tokens; they are joined with single spaces and
    passed to a fresh ``Rake`` instance built from the notebook-level ``stops``
    list. Returns whatever ``Rake.get_ranked_phrases()`` yields.
    """
    rake = Rake(stops)
    joined_text = " ".join(sequence)
    rake.extract_keywords_from_text(joined_text)
    return rake.get_ranked_phrases()

def f_common_context(text, query, window=10):
    """
    Overlap between the document and the query's main ideas.

    Key phrases are extracted from the first ``window`` and the last ``window``
    tokens of the query; their words form the context set. The result is the
    Jaccard ratio between the document's token set and that context set.
    """
    query_tokens = query.split()
    head_phrases = extract_keywords(query_tokens[:window])
    tail_phrases = extract_keywords(query_tokens[-window:])

    # Flatten the distinct phrases into a set of individual words.
    context_words = {
        word
        for phrase in set(head_phrases + tail_phrases)
        for word in phrase.split()
    }

    text_words = set(text.split())
    return len(text_words & context_words) / len(text_words | context_words)

def _window_context_words(sequence, window):
    """Words of the key phrases extracted from the first and last ``window`` tokens of ``sequence``."""
    tokens = sequence.split()
    phrases = extract_keywords(tokens[:window]) + extract_keywords(tokens[-window:])
    return {word for phrase in phrases for word in phrase.split()}


def f_common_context_qd(text, query, window=10):
    """
    Overlap between the main ideas of the query and of the document.

    For each side, key phrases are extracted from its first ``window`` and last
    ``window`` tokens and flattened into a word set.

    Returns:
        The Jaccard ratio of the two word sets, or 0.0 when both are empty.
    """
    qcontext_words = _window_context_words(query, window)
    # The document context is built from the document's own opening and closing
    # windows; the previous code reused the query's opening keywords here because
    # the document's opening window was never extracted.
    tcontext_words = _window_context_words(text, window)

    union = tcontext_words | qcontext_words
    if not union:
        # No keywords on either side: the ratio is undefined, report no overlap.
        return 0.0
    return len(tcontext_words & qcontext_words) / len(union)

def f_soft_common_context(text, query, window=10):
    """
    Soft version of the common-context feature.

    Key phrases are extracted from the first ``window`` and last ``window`` tokens
    of the query, split into words (duplicates kept, order preserved) and joined
    into one string. Returns ``f_wmd`` between the document and that string,
    computed with the FastText model ``model_ft``.
    """
    query_tokens = query.split()
    head_phrases = extract_keywords(query_tokens[:window])
    tail_phrases = extract_keywords(query_tokens[-window:])

    context_words = [
        word
        for phrase in head_phrases + tail_phrases
        for word in phrase.split()
    ]
    context_text = " ".join(context_words)
    return f_wmd(model_ft, text, context_text)

def f_common_context_nowindow(text, query):
    """
    Common-context feature without a window.

    Key phrases are extracted from all tokens of the query; the result is the
    Jaccard ratio between the document's token set and the set of words in
    those phrases.
    """
    phrases = extract_keywords(query.split())
    context_words = {word for phrase in phrases for word in phrase.split()}

    text_words = set(text.split())
    return len(text_words & context_words) / len(text_words | context_words)

def f_soft_common_context_nowindow(text, query):
    """
    Soft common-context feature without a window.

    Key phrases are extracted from all tokens of the query, split into words
    (duplicates kept, order preserved) and joined into one string. Returns
    ``f_wmd`` between the document and that string, computed with the Word2Vec
    model ``model_w2v``.
    """
    phrases = extract_keywords(query.split())
    context_words = [word for phrase in phrases for word in phrase.split()]
    context_text = " ".join(context_words)
    return f_wmd(model_w2v, text, context_text)

def combine_f(row):
    """
    Build the feature vector for one (query, document) row of the pipeline.

    The list of returned features can be adjusted (comment/uncomment entries)
    to match the scenario under test. The order of the active entries is the
    order of the values in the returned array.

    Columns read from ``row``: "docno", "qid", "keluhan" (preprocessed document),
    "query" (preprocessed query), "text" (raw document) and "query_raw".

    Returns:
        1-D numpy array with one value per active feature.
    """

    docno = row["docno"]
    qid = row["qid"]
    keluhan = row["keluhan"]
    query = row["query"]
    keluhan_raw = row["text"]
    query_raw = row["query_raw"]
    
    # Averaged word vectors of the preprocessed document and query, per embedding model.
    text_w2v_embedding = avg_vector_embedding(model_w2v, keluhan.split())
    query_w2v_embedding = avg_vector_embedding(model_w2v, query.split())
    text_ft_embedding = avg_vector_embedding(model_ft, keluhan.split())
    query_ft_embedding = avg_vector_embedding(model_ft, query.split())
    
    # Precomputed sentence embeddings are looked up by docno (document) and qid (query).
    return np.array([
                    # f_len_diff(keluhan, query), \
                    # f_jaccard(keluhan.split(), query.split()), \
        
                    # f_cosine(text_w2v_embedding.flatten(), query_w2v_embedding.flatten()), \
                    f_cosine(text_ft_embedding.flatten(), query_ft_embedding.flatten()), \
        
                    # f_cosine(pminilm_doc_embeddings[docno], pminilm_query_embeddings[qid]), \
                    f_cosine(t5_doc_embeddings[docno], t5_query_embeddings[qid]), \
                    f_cosine(bert_doc_embeddings[docno], bert_query_embeddings[qid]), \
                    
                    # f_wmd(model_w2v, keluhan, query), \
                    # f_wmd(model_ft, keluhan, query), \

                    f_canberra(text_w2v_embedding.flatten(), query_w2v_embedding.flatten()), \
                    # f_canberra(text_ft_embedding.flatten(), query_ft_embedding.flatten()), \

                    # f_common_distinct_words(keluhan, query), \
                    # f_common_distinct_words(keluhan_raw, query_raw), \
                    
                    # f_common_context(keluhan_raw, query_raw), \
                    # f_soft_common_context(keluhan_raw, query_raw), \
        
                    # f_common_context_qd(keluhan_raw, query_raw), \
        
                    f_common_context_nowindow(keluhan_raw, query_raw), \
                    f_soft_common_context_nowindow(keluhan_raw, query_raw), \
                   ])

In [22]:
%%time

""" Model word embedding """
tokenized_corpus = []
for doc in data['keluhan']:
    tokenized_corpus.append(doc.split())
VECTOR_SIZE = 64
model_w2v = Word2Vec(tokenized_corpus, vector_size=VECTOR_SIZE, min_count=2, workers=1)
model_ft = FastText(tokenized_corpus, vector_size=VECTOR_SIZE, min_count=2, workers=1)

""" KFold """
FOLD = 5
kf = KFold(n_splits = FOLD, shuffle=True, random_state=42)

evaluation_results = pd.DataFrame()
metrics = dict()
metric_names = [P@10, P@5, "map", "recip_rank", nDCG@5]
model_names = ["BM25 Only", "BM25 >> LambdaMART XGBoost"]

""" Cross validation """
for train_idx, val_idx in kf.split(queries):
    train_queries, val_queries = queries.iloc[train_idx], queries.iloc[val_idx]

    more_features = pt.apply.doc_features(lambda row: combine_f(row))
    pipeline = (bm25 % CUT_OFF) >> pt.text.get_text(index_ref, ["text", "keluhan"]) >> (more_features ** bm25 ** cross_encT)

    lmart_xgb = xgb.sklearn.XGBRanker(objective = 'rank:ndcg',
                                  learning_rate = 0.1,
                                  gamma = 1.0,
                                  min_child_weight = 0.1,
                                  max_depth = 6,
                                  random_state = 42,
                                  tree_method='hist',
                                  device="cuda"
                            )

    lmart_xgb_pipeline = pipeline >> pt.ltr.apply_learned_model(lmart_xgb, form = "ltr")
    lmart_xgb_pipeline.fit(train_queries, qrels, val_queries, qrels)

    evaluation_results = pd.concat([evaluation_results, \
                                  pt.Experiment([(bm25 % CUT_OFF), lmart_xgb_pipeline], \
                                                val_queries, \
                                                qrels, \
                                                eval_metrics=metric_names, \
                                                names=model_names, \
                                                perquery=True)], \
                                  ignore_index=True)
    
""" Konversi """
for index, line in evaluation_results.iterrows():
    model_name = line['name']
    qid = line['qid']
    measure = line['measure']
    score = line['value']

    if measure in metrics:
        if model_name in metrics[measure]:
            metrics[measure][model_name][qid] = score
        else:
            metrics[measure][model_name] = {qid: score}
    else:
        metrics[measure] = {model_name: {qid: score}}

""" Metrik """
for metric_name in metrics:
    df_metric = pd.DataFrame(metrics[metric_name])
    baseline = df_metric['BM25 Only']
    rerank = df_metric['BM25 >> LambdaMART XGBoost']

    print(f"Rata-rata {metric_name} untuk baseline: {np.mean(baseline)}")
    print(f"Rata-rata {metric_name} untuk model rerank: {np.mean(rerank)}")
    print(stats.ttest_rel(baseline, rerank))

    print()

pt.apply:   0%|          | 0/5 [00:00<?, ?row/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

pt.apply:  20%|██        | 1/5 [00:04<00:16,  4.06s/row]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

pt.apply:  40%|████      | 2/5 [00:07<00:10,  3.62s/row]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

pt.apply:  60%|██████    | 3/5 [00:10<00:06,  3.46s/row]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

pt.apply:  80%|████████  | 4/5 [00:13<00:03,  3.39s/row]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

pt.apply: 100%|██████████| 5/5 [00:15<00:00,  3.13s/row]
pt.apply:   0%|          | 0/2 [00:00<?, ?row/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

pt.apply:  50%|█████     | 1/2 [00:01<00:01,  1.90s/row]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

pt.apply: 100%|██████████| 2/2 [00:03<00:00,  1.81s/row]
pt.apply:   0%|          | 0/2 [00:00<?, ?row/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

pt.apply:  50%|█████     | 1/2 [00:01<00:01,  1.88s/row]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

pt.apply: 100%|██████████| 2/2 [00:03<00:00,  1.81s/row]
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


pt.apply:   0%|          | 0/5 [00:00<?, ?row/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

pt.apply:  20%|██        | 1/5 [00:03<00:14,  3.53s/row]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

pt.apply:  40%|████      | 2/5 [00:06<00:09,  3.30s/row]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

pt.apply:  60%|██████    | 3/5 [00:09<00:06,  3.29s/row]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

pt.apply:  80%|████████  | 4/5 [00:13<00:03,  3.38s/row]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

pt.apply: 100%|██████████| 5/5 [00:15<00:00,  3.05s/row]
pt.apply:   0%|          | 0/2 [00:00<?, ?row/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

pt.apply:  50%|█████     | 1/2 [00:02<00:02,  2.20s/row]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

pt.apply: 100%|██████████| 2/2 [00:03<00:00,  1.91s/row]
pt.apply:   0%|          | 0/2 [00:00<?, ?row/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

pt.apply:  50%|█████     | 1/2 [00:02<00:02,  2.22s/row]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

pt.apply: 100%|██████████| 2/2 [00:03<00:00,  1.93s/row]
pt.apply:   0%|          | 0/5 [00:00<?, ?row/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

pt.apply:  20%|██        | 1/5 [00:03<00:14,  3.68s/row]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

pt.apply:  40%|████      | 2/5 [00:07<00:10,  3.60s/row]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

pt.apply:  60%|██████    | 3/5 [00:10<00:06,  3.50s/row]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

pt.apply:  80%|████████  | 4/5 [00:14<00:03,  3.56s/row]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

pt.apply: 100%|██████████| 5/5 [00:16<00:00,  3.22s/row]
pt.apply:   0%|          | 0/2 [00:00<?, ?row/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

pt.apply:  50%|█████     | 1/2 [00:02<00:02,  2.08s/row]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

pt.apply: 100%|██████████| 2/2 [00:03<00:00,  1.99s/row]
pt.apply:   0%|          | 0/2 [00:00<?, ?row/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

pt.apply:  50%|█████     | 1/2 [00:02<00:02,  2.08s/row]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

pt.apply: 100%|██████████| 2/2 [00:03<00:00,  2.00s/row]
pt.apply:   0%|          | 0/5 [00:00<?, ?row/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

pt.apply:  20%|██        | 1/5 [00:03<00:13,  3.38s/row]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

pt.apply:  40%|████      | 2/5 [00:06<00:10,  3.36s/row]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

pt.apply:  60%|██████    | 3/5 [00:10<00:06,  3.44s/row]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

pt.apply:  80%|████████  | 4/5 [00:13<00:03,  3.55s/row]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

pt.apply: 100%|██████████| 5/5 [00:15<00:00,  3.19s/row]
pt.apply:   0%|          | 0/2 [00:00<?, ?row/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

pt.apply:  50%|█████     | 1/2 [00:02<00:02,  2.49s/row]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

pt.apply: 100%|██████████| 2/2 [00:04<00:00,  2.24s/row]
pt.apply:   0%|          | 0/2 [00:00<?, ?row/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

pt.apply:  50%|█████     | 1/2 [00:02<00:02,  2.51s/row]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

pt.apply: 100%|██████████| 2/2 [00:04<00:00,  2.26s/row]
pt.apply:   0%|          | 0/5 [00:00<?, ?row/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

pt.apply:  20%|██        | 1/5 [00:03<00:14,  3.70s/row]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

pt.apply:  40%|████      | 2/5 [00:07<00:10,  3.66s/row]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

pt.apply:  60%|██████    | 3/5 [00:10<00:07,  3.57s/row]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

pt.apply:  80%|████████  | 4/5 [00:14<00:03,  3.74s/row]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

pt.apply: 100%|██████████| 5/5 [00:16<00:00,  3.38s/row]
pt.apply:   0%|          | 0/2 [00:00<?, ?row/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

pt.apply:  50%|█████     | 1/2 [00:02<00:02,  2.45s/row]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

pt.apply: 100%|██████████| 2/2 [00:04<00:00,  2.13s/row]
pt.apply:   0%|          | 0/2 [00:00<?, ?row/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

pt.apply:  50%|█████     | 1/2 [00:02<00:02,  2.39s/row]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

pt.apply: 100%|██████████| 2/2 [00:04<00:00,  2.08s/row]

Rata-rata map untuk baseline: 0.19187186327527717
Rata-rata map untuk model rerank: 0.19727867296660762
TtestResult(statistic=-1.2966314113599546, pvalue=0.2015171152520387, df=44)

Rata-rata recip_rank untuk baseline: 0.8577777777777776
Rata-rata recip_rank untuk model rerank: 0.9507407407407407
TtestResult(statistic=-2.5052388765883453, pvalue=0.016012264688064305, df=44)

Rata-rata P@5 untuk baseline: 0.7377777777777776
Rata-rata P@5 untuk model rerank: 0.7111111111111111
TtestResult(statistic=0.9727469142278402, pvalue=0.3359966581354571, df=44)

Rata-rata P@10 untuk baseline: 0.6555555555555556
Rata-rata P@10 untuk model rerank: 0.6777777777777778
TtestResult(statistic=-1.3240045497689066, pvalue=0.19233890066310178, df=44)

Rata-rata nDCG@5 untuk baseline: 0.7448905273418782
Rata-rata nDCG@5 untuk model rerank: 0.755736646612477
TtestResult(statistic=-0.45263817410044166, pvalue=0.6530333136510893, df=44)

CPU times: user 8min 38s, sys: 2.06 s, total: 8min 40s
Wall time: 8min 28s




In [23]:
# Gain-based feature importance of `lmart_xgb`, which at this point is the ranker
# fitted in the last cross-validation fold.
# NOTE(review): f0..fN presumably follow the order of the array returned by combine_f,
# followed by the BM25 score and the cross-encoder score — confirm.
lmart_xgb.get_booster().get_score(importance_type='gain')

{'f0': 1.7124065160751343,
 'f1': 1.7774230241775513,
 'f2': 2.052743911743164,
 'f3': 3.1715705394744873,
 'f4': 2.1986701488494873,
 'f5': 1.9204702377319336,
 'f6': 2.7337446212768555,
 'f7': 1.4394539594650269}

In [24]:
# Same booster as above, with importance_type='weight' instead of 'gain'.
lmart_xgb.get_booster().get_score(importance_type='weight')

{'f0': 19.0,
 'f1': 18.0,
 'f2': 16.0,
 'f3': 30.0,
 'f4': 9.0,
 'f5': 10.0,
 'f6': 35.0,
 'f7': 23.0}

# Error Analysis

Pada penelitian ini, diharapkan bahwa dokumen yang relevan ditempatkan pada urutan terawal di daftar ranking, sehingga metrik utama yang digunakan adalah `recip_rank` (reciprocal rank).

In [25]:
def get_serp_metadata(serp):
    """
    Attach relevance labels to a result list.

    Left-joins ``serp`` with the qid/docno/label columns of the notebook-level
    ``qrels`` on (qid, docno), so every result row is kept.
    """
    judgments = qrels[['qid', 'docno', 'label']]
    return pd.merge(serp, judgments, how='left', on=['qid', 'docno'])

def get_relevant_serp(serp):
    """
    Keep only the rows of ``serp`` whose docno has a label of 1 in ``qrels``.

    NOTE(review): the join is on docno only, so a document judged relevant for
    any query counts as relevant here — confirm whether qid should be part of
    the join key.
    """
    relevant_qrels = qrels[(qrels['label'] == 1)]
    matched = pd.merge(serp, relevant_qrels, how='inner', on='docno')
    relevant_docno = matched['docno'].unique()
    return serp[serp['docno'].isin(relevant_docno)]

def compare_rank_score(metric_scores):
    """
    Show the queries on which the re-ranker scores lower or higher than the baseline.

    ``metric_scores`` maps model name -> {qid: score}. The scores are put into a
    DataFrame indexed by qid, the raw query text is added as a column, and two
    tables are displayed: rows where the re-ranker is below the baseline, then
    rows where it is above.
    """
    baseline_col = 'BM25 Only'
    rerank_col = 'BM25 >> LambdaMART XGBoost'

    df_scores = pd.DataFrame(metric_scores)
    # Look up the raw text of every qid in the index.
    df_scores['query_raw'] = [
        queries[queries['qid'] == qid]['query_raw'].values[0]
        for qid in df_scores.index
    ]

    print("Rerank < Baseline (Baseline > Rerank)")
    display(df_scores[df_scores[rerank_col] < df_scores[baseline_col]])
    print()
    print("Rerank > Baseline (Baseline < Rerank)")
    display(df_scores[df_scores[rerank_col] > df_scores[baseline_col]])

def get_observed_doc(observed_docno):
    """Return the raw text (``keluhan_raw``) of the first document in ``data`` with the given docno."""
    matches = data.loc[data['docno'] == observed_docno, 'keluhan_raw']
    return matches.values[0]
    
def get_observed_query(observed_qid):
    """Return the raw text (``query_raw``) of the first query in ``queries`` with the given qid."""
    matches = queries.loc[queries['qid'] == observed_qid, 'query_raw']
    return matches.values[0]

def display_observed_query(observed_qid):
    """Print the qid followed by the raw text of that query."""
    query_text = get_observed_query(observed_qid)
    print(f"{observed_qid}: {query_text}")

def get_serp(observed_qid, model, get_qrels_label=True, k=15):
    """
    Run ``model`` on a single query and return its top results.

    Args:
        observed_qid: qid of the query to look up in ``queries``.
        model: object with a ``transform`` method that returns a result frame
            containing a "rank" column.
        get_qrels_label: when true, relevance labels are attached via
            ``get_serp_metadata``.
        k: only rows with ``rank < k`` are kept.

    Returns:
        The kept rows sorted by rank, ascending.
    """
    topic = queries[queries['qid'] == observed_qid]
    ranking = model.transform(topic)
    if get_qrels_label:
        ranking = get_serp_metadata(ranking)
    top_rows = ranking[ranking['rank'] < k]
    return top_rows.sort_values(by=['rank'])
    
def display_serp(observed_qid, model, get_qrels_label=True, k=15):
    """Display the frame returned by ``get_serp`` for the same arguments."""
    serp = get_serp(observed_qid, model, get_qrels_label, k)
    display(serp)

In [26]:
# Single 80/20 split of the queries for the error analysis (fixed random_state for repeatability).
# NOTE(review): this import sits outside the notebook's main import cell.
from sklearn.model_selection import train_test_split
train_topics, val_topics = train_test_split(queries, test_size=0.2, random_state=2)
# Show which queries ended up in the validation split.
val_topics

Unnamed: 0,qid,query_raw,query
5580,Q47,"Mental Disorder atau tidak ?. Salam, Saya ada ...",mental disorder salam pendekatan gadis cantik ...
2442,Q20,Serangan panik dan sulit bernafas. Malam dokte...,serangan panik sulit bernafas malam dokter men...
5765,Q49,Apakah pusing merupakan efek dari sakit flu. S...,pusing efek sakit flu selamat sore dok irsan 2...
1951,Q16,Mengatasi Gangguan Penglihatan Usai Kena Belek...,mengatasi gangguan penglihatan kena belekan do...
5941,Q50,"Sariawan Dan Kaitannya Dengan Demam, Mual, Pus...",sariawan kaitannya demam mual pusing dok 1 min...
4058,Q34,"Mata Silinder. Dok saya mau nanya, awalnya mat...",mata silinder dok nanya mata minus trs kemarin...
1673,Q14,"Pakai Aloe Vera Malah Bikin Berjerawat, Mengap...",pakai aloe vera bikin berjerawat sore dok prod...
1823,Q15,Berbahayakah Makan Mi Instan Setiap Hari?. Dok...,berbahayakah makan mi instan dok berbahaya ngg...
0,Q1,"Cara mengatasi pusing, lemas dan hidung berair...",mengatasi pusing lemas hidung berair malam dok...


In [None]:
# Word embedding models: same corpus and hyperparameters as in the cross-validation cell.
tokenized_corpus = [doc.split() for doc in data['keluhan']]
VECTOR_SIZE = 64
model_w2v = Word2Vec(tokenized_corpus, vector_size=VECTOR_SIZE, min_count=2, workers=1)
model_ft = FastText(tokenized_corpus, vector_size=VECTOR_SIZE, min_count=2, workers=1)

metric_names = [P@10, P@5, "map", "recip_rank", nDCG@5]
model_names = ["BM25 Only", "BM25 >> LambdaMART XGBoost"]

# BM25 top-CUT_OFF results, with stored text attached, then hand-crafted features
# combined with the BM25 score and the cross-encoder score.
more_features = pt.apply.doc_features(lambda row: combine_f(row))
pipeline = (bm25 % CUT_OFF) >> pt.text.get_text(index_ref, ["text", "keluhan"]) >> (more_features ** bm25 ** cross_encT)

lmart_xgb2 = xgb.sklearn.XGBRanker(
    objective='rank:ndcg',
    learning_rate=0.1,
    gamma=1.0,
    min_child_weight=0.1,
    max_depth=6,
    random_state=42,
    tree_method='hist',
    device="cuda",
)

# Fit on the training topics, validate on the held-out topics of the single split.
lmart_xgb_pipeline2 = pipeline >> pt.ltr.apply_learned_model(lmart_xgb2, form="ltr")
lmart_xgb_pipeline2.fit(train_topics, qrels, val_topics, qrels)

evaluation_results2 = pt.Experiment(
    [bm25 % CUT_OFF, lmart_xgb_pipeline2],
    val_topics,
    qrels,
    eval_metrics=metric_names,
    names=model_names,
    perquery=True,
)

# Reshape the long per-query results into metrics2[measure][model_name][qid] = score.
metrics2 = dict()
for index, line in evaluation_results2.iterrows():
    model_name = line['name']
    qid = line['qid']
    measure = line['measure']
    score = line['value']

    metrics2.setdefault(measure, {}).setdefault(model_name, {})[qid] = score

In [None]:
# Contoh cek skor dan hasil re-ranking
metrics2['recip_rank']
compare_rank_score(metrics2['recip_rank'])
get_serp('Q16', lmart_xgb_pipeline2)