# Setup

## Install & Import

In [1]:
# Mount Google Drive so the lookup CSVs stored there can be read.
from google.colab import drive
drive.mount('/content/drive/')
# Both names below hold file paths (strings), not loaded data.
kamus_baku = '/content/drive/MyDrive/TUGAS AKHIR/kamus_baku.csv'
stopwords = '/content/drive/MyDrive/TUGAS AKHIR/stopwords.csv'
# Alternative without Drive: download the two CSVs directly
# (presumably the same files -- TODO confirm before switching).
# !wget -O kamus_baku.csv "https://drive.google.com/uc?export=download&id=1iIyJqVBsZjlKVorvxWbLdkFkq1zu8Po-"
# !wget -O stopwords.csv "https://drive.google.com/uc?export=download&id=1eJxAmV1o4bT6VdE1sqmaVRLTuK1zn-Qb"

Mounted at /content/drive/


In [2]:
%%capture
# Retrieval and neural re-ranking stack (imported below as pyterrier,
# pyterrier_t5 and sentence_transformers).
!pip install python-terrier -q
!pip install --upgrade git+https://github.com/terrierteam/pyterrier_t5.git -q
!pip install -U sentence-transformers -q
# Text-processing helpers: gensim (Word2Vec/FastText), Sastrawi (stemmer), RAKE.
!pip install --upgrade gensim
!pip install PySastrawi
!pip install rake-nltk
# POT provides the `ot` module (emd2 is imported from it); this is the only pinned version.
!pip install POT==0.4.0

In [3]:
# Fetch the document collection (parsed later as XML) and the relevance
# judgements (read later as CSV) into the working directory.
!wget -O full_dataset.xml https://lumbung.cs.ui.ac.id/f/ed0627b7fa8e4984a672/?dl=1
!wget -O qrels_all.csv https://lumbung.cs.ui.ac.id/f/d2379cc795e84ed39605/?dl=1

--2024-04-12 05:21:27--  https://lumbung.cs.ui.ac.id/f/ed0627b7fa8e4984a672/?dl=1
Resolving lumbung.cs.ui.ac.id (lumbung.cs.ui.ac.id)... 152.118.148.86
Connecting to lumbung.cs.ui.ac.id (lumbung.cs.ui.ac.id)|152.118.148.86|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://lumbung.cs.ui.ac.id/seafhttp/files/0f5b40da-d973-47cb-89a9-aefbf63b1bca/Salinan%20corpus_full_repaired.xml [following]
--2024-04-12 05:21:28--  https://lumbung.cs.ui.ac.id/seafhttp/files/0f5b40da-d973-47cb-89a9-aefbf63b1bca/Salinan%20corpus_full_repaired.xml
Reusing existing connection to lumbung.cs.ui.ac.id:443.
HTTP request sent, awaiting response... 200 OK
Length: 171930255 (164M) [application/octet-stream]
Saving to: ‘full_dataset.xml’


2024-04-12 05:21:56 (5.96 MB/s) - ‘full_dataset.xml’ saved [171930255/171930255]

--2024-04-12 05:21:56--  https://lumbung.cs.ui.ac.id/f/d2379cc795e84ed39605/?dl=1
Resolving lumbung.cs.ui.ac.id (lumbung.cs.ui.ac.id)... 152.118.148.86
Connecting

In [4]:
%%capture
from IPython.display import display
from xml.dom.minidom import parse, parseString
from pyterrier.measures import *

from pyterrier_t5 import MonoT5ReRanker, DuoT5ReRanker
from sentence_transformers import CrossEncoder, SentenceTransformer
from sentence_transformers.util import cos_sim

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from gensim.models import Word2Vec, FastText

from rake_nltk import Rake

from ot import emd2

from scipy import stats
from scipy.spatial import distance
from scipy.spatial.distance import cosine

from sklearn import preprocessing
from sklearn.model_selection import KFold

import xml.etree.ElementTree as et
import pickle
import random
import pyterrier as pt
import pandas as pd
import numpy as np
import xgboost as xgb
import torch
import json
import os
import re
import math
import nltk
# Tokenizer data; presumably required by nltk.word_tokenize, which the
# abbreviation normalizer calls.
nltk.download('punkt')

# Initialise PyTerrier only if it has not been started in this kernel yet.
if not pt.started():
  pt.init(version='snapshot')

## Set Random Seed

In [5]:
# Seed Python's `random`, NumPy and PyTorch with the same fixed value.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7e3a96406d10>

## Script of Stopwords

In [6]:
# Stopword list: first column of the header-less CSV, as a plain Python list.
stops = pd.read_csv(stopwords, header=None)[0].values.tolist()

In [7]:
def remove_stopwords(text, stopword_list=None):
    """Lowercase `text` and drop every whitespace-separated token that is a stopword.

    Args:
        text: Input string.
        stopword_list: Optional iterable of stopwords. When omitted, the
            notebook-level `stops` list is used.

    Returns:
        The remaining tokens joined by single spaces ("" when none remain).
    """
    # Membership tests against a list scan the whole list for every token;
    # a set built once per call makes each test constant-time.
    stopword_set = set(stops if stopword_list is None else stopword_list)
    return " ".join(
        token for token in text.lower().split() if token not in stopword_set
    )

## Script of Special Chars

In [8]:
# # sometimes, query containing special chars like /,', or ? will produce errors
# # https://github.com/terrier-org/pyterrier/issues/62
# def remove_specials(text):
#   if str(text) == "nan":
#     return " "
#   text = text.replace("?", ".")
#   return "".join([x if (x not in ['/', '\'', '(', ')', ":", '"', '-']) else " " for x in text])

In [9]:
# sometimes, query containing special chars like /,', or ? will produce errors
# https://github.com/terrier-org/pyterrier/issues/62
def remove_specials(text):
  """Replace every run of non-alphanumeric characters with a single space.

  Underscores count as special characters too. Leading/trailing runs are
  also turned into a space (the result is not stripped).

  Args:
    text: Input string.

  Returns:
    The string with each run of special characters collapsed to one space.
  """
  # Raw string: '\W' inside a normal string literal is an invalid escape
  # sequence and triggers a warning on current Python versions.
  return re.sub(r'[\W_]+', ' ', text)

## Script of Normalization

In [10]:
# Script by Kak Syifa.

# Lookup table for abbreviation normalization; the code below reads its
# 'singkatan' and 'asli' columns.
df_kamus_baku = pd.read_csv(kamus_baku, index_col=0)

def normalisasi_singkatan(text):
    """Replace known abbreviations in `text` using `df_kamus_baku`.

    The text is lowercased and tokenized with `nltk.word_tokenize`. A token
    that occurs in the `singkatan` column is replaced by the `asli` value of
    its first matching row; all tokens are then joined with single spaces.
    """
    abbreviations = df_kamus_baku['singkatan']
    normalized_tokens = []

    for token in nltk.word_tokenize(text.lower()):
        if token in abbreviations.values:
            matches = df_kamus_baku.loc[abbreviations == token, 'asli']
            token = matches.values[0]
        normalized_tokens.append(token)

    return ' '.join(normalized_tokens)

def normalisasi_kata_ulang(text):
    """Rewrite informal reduplication as a hyphenated repeated word.

    Two shapes are handled after lowercasing:
      * a word followed by "2" or "2x"   -- e.g. "kira2"   -> "kira-kira"
      * a word repeated after whitespace -- e.g. "apa apa" -> "apa-apa"

    Args:
        text: Input string.

    Returns:
        The lowercased string with both shapes rewritten.
    """
    text = text.lower()

    # Substituting through the captured word avoids three problems of a
    # find-then-str.replace approach: stripping only the "2" of a "2x" suffix,
    # rewriting the same characters inside other words, and matching the tail
    # of a token such as "h2o2". [a-z] is used because [A-z] also matches
    # the punctuation between 'Z' and 'a' ([ \ ] ^ _ `).
    text = re.sub(r'\b([a-z]+)2x?\b', r'\1-\1', text)

    # The trailing \b keeps a word that is merely a prefix of the next word
    # apart ("di dia" must not become "di-dia"); \s+ accepts any whitespace.
    text = re.sub(r'\b([a-z]+)\s+\1\b', r'\1-\1', text)
    return text

def normalisasi(question):
    """Normalize `question`: expand abbreviations first, then fix reduplication."""
    return normalisasi_kata_ulang(normalisasi_singkatan(question))

# Preprocess Data

In [11]:
%%time

def _stripped_text(parent, path):
  """Return the stripped text of `parent.find(path)`, or None when the
  parent, the element, or the element's text is missing."""
  element = parent.find(path) if parent is not None else None
  if element is None or element.text is None:
    return None
  return element.text.strip()

xtree = et.parse("full_dataset.xml")
xroot = xtree.getroot()
rows = []

# One output row per child of the root element.
for node in xroot:
  keluhan = node.find("KELUHAN")
  jawaban = node.find("JAWABAN")

  rows.append({
      "docno": _stripped_text(node, "ID"),
      "url": _stripped_text(node, "URL"),
      # The helper guards on the text of TAG/ITEM itself, i.e. on the same
      # element whose text is read (not on the text of the TAG wrapper).
      "tag": _stripped_text(node, "TAG/ITEM"),
      "keluhan_judul": _stripped_text(keluhan, "JUDUL"),
      "keluhan_isi": _stripped_text(keluhan, "ISI"),
      "keluhan_waktu": _stripped_text(keluhan, "WAKTU"),
      "jawaban_isi": _stripped_text(jawaban, "ISI"),
      "jawaban_waktu": _stripped_text(jawaban, "WAKTU"),
  })

data = pd.DataFrame(rows, columns = ['docno', 'url', 'tag', 'keluhan_judul', 'keluhan_isi', 'keluhan_waktu', 'jawaban_isi', 'jawaban_waktu'])

CPU times: user 3.68 s, sys: 340 ms, total: 4.03 s
Wall time: 4.59 s


In [12]:
# Rows whose complaint title is missing (None after parsing), and their count.
keluhan_judul_kosong = data[data["keluhan_judul"].isna()]
len(keluhan_judul_kosong)

102

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86731 entries, 0 to 86730
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   docno          86731 non-null  object
 1   url            86731 non-null  object
 2   tag            86731 non-null  object
 3   keluhan_judul  86629 non-null  object
 4   keluhan_isi    86106 non-null  object
 5   keluhan_waktu  24405 non-null  object
 6   jawaban_isi    85289 non-null  object
 7   jawaban_waktu  24405 non-null  object
dtypes: object(8)
memory usage: 5.3+ MB


## Format

### Judul

In [14]:
# data = data[["docno", "keluhan_judul"]].drop_duplicates()
# data.fillna(' ', inplace=True)
# data["keluhan"] = data["keluhan_judul"]
# data = data.rename(columns = {"keluhan_judul": "keluhan_raw"})
# data = data[["docno", "keluhan_raw", "keluhan"]]

### Judul + isi

In [15]:
# Keep the id plus title/body, de-duplicate, and turn missing text into a
# single space. `keluhan` joins title and body with a space; `keluhan_raw`
# joins them with ". ".
data = (
    data[["docno", "keluhan_judul", "keluhan_isi"]]
    .drop_duplicates()
    .fillna(' ')
    .assign(
        keluhan=lambda frame: frame["keluhan_judul"] + " " + frame["keluhan_isi"],
        keluhan_raw=lambda frame: frame["keluhan_judul"] + ". " + frame["keluhan_isi"],
    )
    [["docno", "keluhan_raw", "keluhan"]]
)

## Remove Special Chars

In [16]:
# %%time

# keluhan, keluhan_raw = [], []

# for index, line in data.iterrows():
#   keluhan.append(remove_specials(str(line['keluhan'])) if line['keluhan'] is not None else None)

#   # VERSI RAW JUGA SPECIAL CHARS-NYA PREPROCESSED
#   keluhan_raw.append(remove_specials(str(line['keluhan_raw'])) if line['keluhan_raw'] is not None else None)

# data['keluhan'] = keluhan
# data['keluhan_raw'] = keluhan_raw

In [17]:
%%time

# Strip special characters from the `keluhan` column only; `keluhan_raw`
# is left as it is.
keluhan = [
    remove_specials(str(value)) if value is not None else None
    for value in data['keluhan']
]

data['keluhan'] = keluhan

CPU times: user 9.97 s, sys: 32.3 ms, total: 10 s
Wall time: 10.2 s


## Stem

In [18]:
# Sastrawi stemmer instance; its `stem` method is applied to the queries
# in the preprocessing-variation cell.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [19]:
# %%time

# keluhan = []

# for index, line in data.iterrows():
#   keluhan.append(stemmer.stem(line['keluhan']) if line['keluhan'] is not None else None)

# data['keluhan'] = keluhan

## Normalize

In [20]:
# %%time

# keluhan = []

# for index, line in data.iterrows():
#   keluhan.append(normalisasi(line['keluhan']) if line['keluhan'] is not None else None)

# data['keluhan'] = keluhan

## Remove Stopwords

In [21]:
%%time

# Lowercase each document and drop its stopwords.
keluhan = [
    remove_stopwords(value) if value is not None else None
    for value in data['keluhan']
]

data['keluhan'] = keluhan

CPU times: user 1min, sys: 116 ms, total: 1min
Wall time: 1min 2s


## Drop Duplicates

In [22]:
data.drop_duplicates(inplace=True)

In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86731 entries, 0 to 86730
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   docno        86731 non-null  object
 1   keluhan_raw  86731 non-null  object
 2   keluhan      86731 non-null  object
dtypes: object(3)
memory usage: 2.0+ MB


## Drop Missing Values

In [24]:
# Discard documents whose preprocessed text is missing or is one of the
# placeholder values '', ' ' or '.'.
data.dropna(subset=['keluhan'], inplace=True)

indexEmptyData = data.loc[data['keluhan'].isin(['', ' ', '.'])].index
data.drop(indexEmptyData, inplace=True)

data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 86723 entries, 0 to 86730
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   docno        86723 non-null  object
 1   keluhan_raw  86723 non-null  object
 2   keluhan      86723 non-null  object
dtypes: object(3)
memory usage: 2.6+ MB


# Preprocess Qrels

In [25]:
# Load the judgements and discard the first column of the CSV.
qrels = pd.read_csv('qrels_all.csv').pipe(
    lambda frame: frame.drop(columns=frame.columns[0])
)
qrels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6110 entries, 0 to 6109
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   qid     6110 non-null   object
 1   docno   6110 non-null   object
 2   qtitle  6110 non-null   object
 3   qbody   5967 non-null   object
 4   dtitle  6110 non-null   object
 5   dbody   6110 non-null   object
 6   label   6110 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 334.3+ KB


## Label

*   Nonrelevant --> 0
*   Relevant or partial --> 1



In [26]:
# Grade 2 is folded into 1; every other grade is kept unchanged.
label = [1 if grade == 2 else grade for grade in qrels['label']]

qrels["label"] = label

## Remove Special Chars

In [27]:
def _clean_qrels_field(value):
  """Return `value` stripped and without special characters; missing values give ''."""
  # str(NaN) is the literal "nan", which would otherwise end up as a
  # spurious word in the query/document text (qbody contains missing values).
  if pd.isna(value):
    return ""
  return remove_specials(str(value).strip())

qtitles = [_clean_qrels_field(value) for value in qrels['qtitle']]
qbodies = [_clean_qrels_field(value) for value in qrels['qbody']]
dtitles = [_clean_qrels_field(value) for value in qrels['dtitle']]
dbodies = [_clean_qrels_field(value) for value in qrels['dbody']]

# The original columns are kept; cleaned copies get a `_nospecials` suffix.
qrels["qtitle_nospecials"] = qtitles
qrels["qbody_nospecials"] = qbodies
qrels["dtitle_nospecials"] = dtitles
qrels["dbody_nospecials"] = dbodies

## Drop Duplicates

In [28]:
qrels.drop_duplicates(inplace=True)

In [29]:
qrels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6110 entries, 0 to 6109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   qid                6110 non-null   object
 1   docno              6110 non-null   object
 2   qtitle             6110 non-null   object
 3   qbody              5967 non-null   object
 4   dtitle             6110 non-null   object
 5   dbody              6110 non-null   object
 6   label              6110 non-null   int64 
 7   qtitle_nospecials  6110 non-null   object
 8   qbody_nospecials   6110 non-null   object
 9   dtitle_nospecials  6110 non-null   object
 10  dbody_nospecials   6110 non-null   object
dtypes: int64(1), object(10)
memory usage: 525.2+ KB


## Format

### Judul

In [30]:
# qrels.drop_duplicates(inplace=True)
# qrels.fillna(' ', inplace=True)
# qrels["query_nospecials"] = qrels["qtitle_nospecials"]
# qrels["doc_nospecials"] = qrels["dtitle_nospecials"]
# qrels = qrels.rename(columns = {"qtitle": "query_raw", "dtitle": "doc_raw"})
# qrels = qrels[["qid", "docno", "query_raw", "query_nospecials", "doc_raw", "doc_nospecials", "label"]]

### Judul + isi

In [31]:
# # VERSI RAW JUGA SPECIAL CHARS-NYA PREPROCESSED

# qrels.drop_duplicates(inplace=True)
# qrels.fillna(' ', inplace=True)

# qrels["query_nospecials"] = qrels["qtitle_nospecials"] + " " + qrels["qbody_nospecials"]
# qrels["query_raw"] = qrels["qtitle_nospecials"] + ". " + qrels["qbody_nospecials"]

# qrels["doc_nospecials"] = qrels["dtitle_nospecials"] + " " + qrels["dbody_nospecials"]
# qrels["doc_raw"] = qrels["dtitle_nospecials"] + ". " + qrels["dbody_nospecials"]

# qrels = qrels[["qid", "docno", "query_raw", "query_nospecials", "doc_raw", "doc_nospecials", "label"]]

# qrels.head()

In [32]:
# VERSI MASIH ADA YANG RAW (CONTAINING SPECIAL CHARS)

# Variant in which the *_raw fields keep their special characters:
# *_nospecials joins the cleaned title and body with a space, *_raw joins
# the untouched title and body with ". ".
qrels = (
    qrels
    .drop_duplicates()
    .fillna(' ')
    .assign(
        query_nospecials=lambda frame: frame["qtitle_nospecials"] + " " + frame["qbody_nospecials"],
        query_raw=lambda frame: frame["qtitle"] + ". " + frame["qbody"],
        doc_nospecials=lambda frame: frame["dtitle_nospecials"] + " " + frame["dbody_nospecials"],
        doc_raw=lambda frame: frame["dtitle"] + ". " + frame["dbody"],
    )
    [["qid", "docno", "query_raw", "query_nospecials", "doc_raw", "doc_nospecials", "label"]]
)

qrels.head()

Unnamed: 0,qid,docno,query_raw,query_nospecials,doc_raw,doc_nospecials,label
0,Q1,KD-14969,"Cara mengatasi pusing, lemas dan hidung berair...",Cara mengatasi pusing lemas dan hidung berair ...,mimisan dan polip. malam dok-dok hidung saya m...,mimisan dan polip malam dok dok hidung saya mi...,0
1,Q1,KD-10780,"Cara mengatasi pusing, lemas dan hidung berair...",Cara mengatasi pusing lemas dan hidung berair ...,nyeri kepala dan lendir tidak sedap dari hidun...,nyeri kepala dan lendir tidak sedap dari hidun...,1
2,Q1,AD-11170,"Cara mengatasi pusing, lemas dan hidung berair...",Cara mengatasi pusing lemas dan hidung berair ...,demam pusing dan bercak-bercak merah pada ...,demam pusing dan bercak bercak merah pada kuli...,1
3,Q1,KD-34954,"Cara mengatasi pusing, lemas dan hidung berair...",Cara mengatasi pusing lemas dan hidung berair ...,common cold. dok . bermula pada hari jumat kem...,common cold dok bermula pada hari jumat kemari...,1
4,Q1,KD-42528,"Cara mengatasi pusing, lemas dan hidung berair...",Cara mengatasi pusing lemas dan hidung berair ...,ingus dari hidung. selamat malam dok akhir-a...,ingus dari hidung selamat malam dok akhir akhi...,1


# Preprocess Queries

In [33]:
# Distinct (qid, query_raw, query_nospecials) rows taken from the judgements table.
queries = qrels[["qid", "query_raw", "query_nospecials"]].drop_duplicates()
queries.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45 entries, 0 to 5941
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   qid               45 non-null     object
 1   query_raw         45 non-null     object
 2   query_nospecials  45 non-null     object
dtypes: object(3)
memory usage: 1.4+ KB


## Preprocessing Variation

In [34]:
%%time
# Placeholder list; nothing in this cell appends to it.
query = []

# Single-step variants, each computed from the special-character-free query.
query_stemmed = [stemmer.stem(text) for text in queries['query_nospecials']]
query_nostopwords = [remove_stopwords(text) for text in queries['query_nospecials']]
query_normalized = [normalisasi(text) for text in queries['query_nospecials']]
queries['query_stemmed'] = query_stemmed
queries['query_nostopwords'] = query_nostopwords
queries['query_normalized'] = query_normalized

# Two-step variants, built on top of the single-step columns.
query_stemmed_nostopwords = [remove_stopwords(text) for text in queries['query_stemmed']]
query_stemmed_normalized = [normalisasi(text) for text in queries['query_stemmed']]
query_normalized_nostopwords = [remove_stopwords(text) for text in queries['query_normalized']]
queries['query_stemmed_nostopwords'] = query_stemmed_nostopwords
queries['query_stemmed_normalized'] = query_stemmed_normalized
queries['query_normalized_nostopwords'] = query_normalized_nostopwords

# Three-step variant: stemmed, then normalized, then stopwords removed.
query_stemmed_normalized_nostopwords = [
    remove_stopwords(text) for text in queries['query_stemmed_normalized']
]
queries['query_stemmed_normalized_nostopwords'] = query_stemmed_normalized_nostopwords

CPU times: user 687 ms, sys: 15.8 ms, total: 703 ms
Wall time: 704 ms


In [35]:
queries.head(1)

Unnamed: 0,qid,query_raw,query_nospecials,query_stemmed,query_nostopwords,query_normalized,query_stemmed_nostopwords,query_stemmed_normalized,query_normalized_nostopwords,query_stemmed_normalized_nostopwords
0,Q1,"Cara mengatasi pusing, lemas dan hidung berair...",Cara mengatasi pusing lemas dan hidung berair ...,cara atas pusing lemas dan hidung air malam do...,mengatasi pusing lemas hidung berair malam dok...,cara mengatasi pusing lemas dan hidung berair ...,pusing lemas hidung air malam dok 2 badan pusi...,cara atas pusing lemas dan hidung air malam do...,mengatasi pusing lemas hidung berair malam dok...,pusing lemas hidung air malam dok 2 badan pusi...


In [36]:
# Keep only the stopword-removed variant and expose it under the name `query`.
queries = queries[['qid', 'query_raw', 'query_nostopwords']].rename(
    columns={"query_nostopwords": "query"}
)

In [37]:
queries.head()

Unnamed: 0,qid,query_raw,query
0,Q1,"Cara mengatasi pusing, lemas dan hidung berair...",mengatasi pusing lemas hidung berair malam dok...
174,Q2,Telinga berdengung sebelah kiri. Selamat pagi ...,telinga berdengung sebelah kiri selamat pagi d...
308,Q3,Benjolan di sekitar kelamin. Sore dok..Maaf do...,benjolan kelamin sore dok maaf dok 4 mengalami...
472,Q4,Makanan untuk penderita penyakit lambung dan l...,makanan penderita penyakit lambung liver dok m...
624,Q5,Kebiasaan melamun atau berkhayal secara berleb...,kebiasaan melamun berkhayal dok pengidap malad...


## Drop Duplicates

In [38]:
queries.drop_duplicates(inplace=True)

## Drop Missing Values

In [39]:
# Discard queries whose text is missing or is one of the placeholder
# values '', ' ' or '.'.
queries.dropna(subset=['query'], inplace=True)

indexEmptyData = queries.loc[queries['query'].isin(['', ' ', '.'])].index
queries.drop(indexEmptyData, inplace=True)

queries.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45 entries, 0 to 5941
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   qid        45 non-null     object
 1   query_raw  45 non-null     object
 2   query      45 non-null     object
dtypes: object(3)
memory usage: 1.4+ KB


# Index

## Indexing

In [40]:
%%time
# input for PyTerrier indexer must be a dataframe containing "docno" and "text"

# Expose the raw complaint text under the column name "text".
collection = data.rename(columns = {"keluhan_raw": "text"})

# Start from an empty index directory on every run.
!rm -rf ./medical_ir_index
# Stemming and stopword removal are disabled in the indexer (both None).
# blocks=True requests positional information (the statistics printed
# further down report "Positions: true").
# NOTE(review): IndexingType(1) is a bare enum value -- presumably the classic
# indexing mode; confirm against the installed PyTerrier and prefer the named member.
pd_indexer = pt.DFIndexer("./medical_ir_index", \
                          type = pt.index.IndexingType(1), \
                          tokeniser = "UTFTokeniser", \
                          stemmer = None, \
                          stopwords = None, \
                          blocks = True)
# The preprocessed `keluhan` column is passed as the first argument and the
# whole frame (docno, text, keluhan) as the second.
index_ref = pd_indexer.index(collection["keluhan"], collection)

collection.info()

<class 'pandas.core.frame.DataFrame'>
Index: 86723 entries, 0 to 86730
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   docno    86723 non-null  object
 1   text     86723 non-null  object
 2   keluhan  86723 non-null  object
dtypes: object(3)
memory usage: 2.6+ MB
CPU times: user 1min 28s, sys: 2.02 s, total: 1min 30s
Wall time: 1min 30s


## Index Overview

In [41]:
collection.head()

Unnamed: 0,docno,text,keluhan
0,DS-1,Mengapa Keringat Badan Sangat Berlebihan?. sel...,keringat badan selamat malam dokter fathurrosi...
1,DS-2,Mengapa Lengan Atas Sakit Walaupun Tidak Melak...,lengan sakit aktivitas siang dokter 4 lengan s...
2,DS-3,Berapa Batas Usia Kandungan Untuk Melakukan US...,batas usia kandungan usg batas usia kandungan ...
3,DS-4,Apakah Katarak Pada Bayi Harus Disembuhkan Den...,katarak bayi disembuhkan operasi selamat pagi ...
4,DS-5,Mengapa Badan Panas dan Ngilu Selama Berhari-h...,badan panas ngilu berhari nama ferini intan lu...


In [42]:
# Open the index that was just built and print its collection statistics.
index_fact = pt.IndexFactory.of(index_ref)
print(index_fact.getCollectionStatistics().toString())

Number of documents: 86723
Number of terms: 74200
Number of postings: 1924462
Number of fields: 0
Number of tokens: 2575525
Field names: []
Positions:   true



# Evaluate Baseline

In [43]:
%%time

# Two lexical baselines over the same index, differing only in weighting model.
tf_idf = pt.BatchRetrieve(index_ref, wmodel = "TF_IDF")
bm25 = pt.BatchRetrieve(index_ref, wmodel = "BM25")

# `queries` supplies the topics (qid, query) and `qrels` the judgements
# (qid, docno, label). R, P and nDCG come from the star import of
# pyterrier.measures at the top of the notebook.
pt.Experiment(
    [tf_idf, bm25],
    topics=queries,
    qrels=qrels,
    eval_metrics = [R@10, R@5, P@10, P@5, "map", "recip_rank", nDCG@5],
    names=['TF_IDF', 'BM25']
)

CPU times: user 41.3 s, sys: 438 ms, total: 41.7 s
Wall time: 38.5 s


Unnamed: 0,name,R@10,R@5,P@10,P@5,map,recip_rank,nDCG@5
0,TF_IDF,0.180833,0.117926,0.671111,0.764444,0.461931,0.878519,0.770475
1,BM25,0.17427,0.113283,0.655556,0.737778,0.455594,0.857778,0.744891


# Export

In [44]:
# Rename for export: the raw text becomes `keluhan` and the preprocessed text
# `keluhan_preprocessed`.
# NOTE: not idempotent -- running this cell a second time would rename the raw
# column to `keluhan_preprocessed` as well.
data = data.rename(columns = {"keluhan": "keluhan_preprocessed", "keluhan_raw": "keluhan"})
data.head(10)

Unnamed: 0,docno,keluhan,keluhan_preprocessed
0,DS-1,Mengapa Keringat Badan Sangat Berlebihan?. sel...,keringat badan selamat malam dokter fathurrosi...
1,DS-2,Mengapa Lengan Atas Sakit Walaupun Tidak Melak...,lengan sakit aktivitas siang dokter 4 lengan s...
2,DS-3,Berapa Batas Usia Kandungan Untuk Melakukan US...,batas usia kandungan usg batas usia kandungan ...
3,DS-4,Apakah Katarak Pada Bayi Harus Disembuhkan Den...,katarak bayi disembuhkan operasi selamat pagi ...
4,DS-5,Mengapa Badan Panas dan Ngilu Selama Berhari-h...,badan panas ngilu berhari nama ferini intan lu...
5,DS-6,Mengapa Benjolan di Muka Tidak Kunjung Mengemp...,benjolan muka kunjung mengempis assalammualaik...
6,DS-7,Apa Perbedaan Anak Hiperaktif Dan Autis?. sela...,perbedaan anak hiperaktif autis selamat sore d...
7,DS-8,Apakah Hipertensi Bisa Kambuh Lagi?. selamat s...,hipertensi kambuh selamat sore dokter orang me...
8,DS-9,Tidur Miring Saat Hamil Apakah Berbahaya Bagi ...,tidur miring hamil berbahaya janin selamat sor...
9,DS-10,Apakah Amnesia Bisa Disembuhkan?. selamat sore...,amnesia disembuhkan selamat sore dokter salah ...


In [45]:
# Rename for export: the raw query becomes `query` and the preprocessed query
# `query_preprocessed`.
# NOTE: not idempotent -- running this cell a second time would rename the raw
# column to `query_preprocessed` as well.
queries = queries.rename(columns = {"query": "query_preprocessed", "query_raw": "query"})
queries.head()

Unnamed: 0,qid,query,query_preprocessed
0,Q1,"Cara mengatasi pusing, lemas dan hidung berair...",mengatasi pusing lemas hidung berair malam dok...
174,Q2,Telinga berdengung sebelah kiri. Selamat pagi ...,telinga berdengung sebelah kiri selamat pagi d...
308,Q3,Benjolan di sekitar kelamin. Sore dok..Maaf do...,benjolan kelamin sore dok maaf dok 4 mengalami...
472,Q4,Makanan untuk penderita penyakit lambung dan l...,makanan penderita penyakit lambung liver dok m...
624,Q5,Kebiasaan melamun atau berkhayal secara berleb...,kebiasaan melamun berkhayal dok pengidap malad...


In [46]:
# Serialize the three frames into the current working directory.
for pickle_name, frame in (
    ('nostops_data_23-03-24.pickle', data),
    ('nostops_qrels_23-03-24.pickle', qrels),
    ('nostops_queries_23-03-24.pickle', queries),
):
    with open(pickle_name, 'wb') as handle:
        pickle.dump(frame, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [47]:
queries.head(10)

Unnamed: 0,qid,query,query_preprocessed
0,Q1,"Cara mengatasi pusing, lemas dan hidung berair...",mengatasi pusing lemas hidung berair malam dok...
174,Q2,Telinga berdengung sebelah kiri. Selamat pagi ...,telinga berdengung sebelah kiri selamat pagi d...
308,Q3,Benjolan di sekitar kelamin. Sore dok..Maaf do...,benjolan kelamin sore dok maaf dok 4 mengalami...
472,Q4,Makanan untuk penderita penyakit lambung dan l...,makanan penderita penyakit lambung liver dok m...
624,Q5,Kebiasaan melamun atau berkhayal secara berleb...,kebiasaan melamun berkhayal dok pengidap malad...
796,Q6,Apakah depresi dapat kambuh?. Saya mhswi smst ...,depresi kambuh mhswi smst 4 salah poltekkes ko...
944,Q7,"Patah tulang dapat menyambung. Sore Dok, saya ...",patah tulang menyambung sore dok tulang yg pat...
1047,Q8,Darah atau flek saat awal kehamilan. Assalamua...,darah flek kehamilan assalamualaikum dok wanit...
1181,Q9,Penyebab warna kuning pada gigi. Assalamualaik...,penyebab warna kuning gigi assalamualaikumwara...
1288,Q10,"Jantung Berdebar, Apakah Tandanya Ada Masalah ...",jantung berdebar tandanya jantung dear dokter ...


# Precomputed Usage

In [48]:
# Locations of the exported pickles on Google Drive.
# NOTE(review): from here on `data`, `qrels` and `queries` are path strings,
# no longer the DataFrames built above; cells above cannot be re-run against
# these names without re-running the notebook from the top. The export cell
# writes the pickles to the working directory, so they presumably have to be
# copied to this Drive folder first -- confirm.
root = '/content/drive/MyDrive/TUGAS AKHIR/'
data = root + 'nostops_data_23-03-24.pickle'
qrels = root + 'nostops_qrels_23-03-24.pickle'
queries = root + 'nostops_queries_23-03-24.pickle'

In [50]:
pd.read_pickle(queries)

Unnamed: 0,qid,query,query_preprocessed
0,Q1,"Cara mengatasi pusing, lemas dan hidung berair...",mengatasi pusing lemas hidung berair malam dok...
174,Q2,Telinga berdengung sebelah kiri. Selamat pagi ...,telinga berdengung sebelah kiri selamat pagi d...
308,Q3,Benjolan di sekitar kelamin. Sore dok..Maaf do...,benjolan kelamin sore dok maaf dok 4 mengalami...
472,Q4,Makanan untuk penderita penyakit lambung dan l...,makanan penderita penyakit lambung liver dok m...
624,Q5,Kebiasaan melamun atau berkhayal secara berleb...,kebiasaan melamun berkhayal dok pengidap malad...
796,Q6,Apakah depresi dapat kambuh?. Saya mhswi smst ...,depresi kambuh mhswi smst 4 salah poltekkes ko...
944,Q7,"Patah tulang dapat menyambung. Sore Dok, saya ...",patah tulang menyambung sore dok tulang yg pat...
1047,Q8,Darah atau flek saat awal kehamilan. Assalamua...,darah flek kehamilan assalamualaikum dok wanit...
1181,Q9,Penyebab warna kuning pada gigi. Assalamualaik...,penyebab warna kuning gigi assalamualaikumwara...
1288,Q10,"Jantung Berdebar, Apakah Tandanya Ada Masalah ...",jantung berdebar tandanya jantung dear dokter ...
