# Setup

In [None]:
!conda install -y gdown

In [None]:
%%capture
!pip install python-terrier -q
!pip install --upgrade git+https://github.com/terrierteam/pyterrier_t5.git -q
!pip install -U sentence-transformers -q
!pip install --upgrade gensim

from sentence_transformers.util import cos_sim

from scipy import stats
from scipy.spatial import distance
from scipy.spatial.distance import cosine

import pickle
import random
import pyterrier as pt
import pandas as pd
import numpy as np
import torch
import json
import os
import re
import math
import nltk
nltk.download('punkt')

In [None]:
import os

# Set the TOKENIZERS_PARALLELISM switch to "false" for this process before any
# tokenizer is used further down.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [None]:
!gdown 1cfgOF6kP8brxI_dtMTwhMEBGwgWBHjjV # nostops_queries_23-03-2024

In [None]:
!gdown 1qKm6yxQ2KzSiGkNaJSYgxV4nC_MGPUka # nostops_data_23-03-2024

In [None]:
# Load the query table. The original text column is kept as "query_raw" and the
# preprocessed text takes over the plain "query" name.
# NOTE(review): read_pickle unpickles arbitrary objects -- only use a trusted file.
queries = (
    pd.read_pickle("nostops_queries_23-03-24.pickle")
    .rename(columns={"query": "query_raw", "query_preprocessed": "query"})
)
queries.head()

In [None]:
# Load the document table. The original text column is kept as "keluhan_raw" and
# the preprocessed text takes over the plain "keluhan" name.
# NOTE(review): read_pickle unpickles arbitrary objects -- only use a trusted file.
data = (
    pd.read_pickle("nostops_data_23-03-24.pickle")
    .rename(columns={"keluhan": "keluhan_raw", "keluhan_preprocessed": "keluhan"})
)
data.head()

In [None]:
# Use the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Seed Python's, NumPy's and torch's random number generators with the same value.
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(42)
torch.manual_seed(42)

# Sentence Transformer

In [None]:
from sentence_transformers import SentenceTransformer
sentence_transformer = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')


def encode_by_id(frame, id_column, text_column):
    """Embed one text column of ``frame`` and key the vectors by an id column.

    All texts are handed to ``SentenceTransformer.encode`` as a single list so the
    library can batch them, instead of issuing one ``encode`` call per row.
    NOTE(review): batched inputs are padded, so values may differ from the per-row
    results at floating-point noise level -- confirm this is acceptable downstream.

    Args:
        frame: DataFrame that holds both columns.
        id_column: Name of the column whose values become the dictionary keys.
        text_column: Name of the column with the texts to embed.

    Returns:
        dict mapping each id to its embedding (one row of the encoded matrix).
        When an id occurs more than once the last occurrence wins, as it did
        with the row-by-row loop.
    """
    ids = frame[id_column].tolist()
    if not ids:
        # Nothing to encode; avoid calling the model on an empty batch.
        return {}
    vectors = sentence_transformer.encode(frame[text_column].tolist())
    return dict(zip(ids, vectors))


# Document embeddings keyed by docno, query embeddings keyed by qid.
pminilm_doc = encode_by_id(data, 'docno', 'keluhan_raw')
pminilm_query = encode_by_id(queries, 'qid', 'query_raw')

# T5 Encoder Model

In [None]:
from transformers import AutoTokenizer, T5EncoderModel
t5_tokenizer = AutoTokenizer.from_pretrained("castorini/doc2query-t5-base-msmarco")
t5_model = T5EncoderModel.from_pretrained("castorini/doc2query-t5-base-msmarco").to(device)


def t5_mean_embedding(text):
    """Return the mean-pooled T5 encoder representation of a single text.

    The text is tokenized on its own (truncated to at most 512 tokens), run through
    the encoder without gradient tracking, and the last hidden states are averaged
    over the token axis. Only one sequence is encoded per call, so the plain mean
    is taken over exactly the tokens of that sequence.

    Args:
        text: The string to embed.

    Returns:
        1-D NumPy array holding the averaged last hidden state.
    """
    input_ids = t5_tokenizer(
        text, return_tensors='pt', truncation=True, max_length=512
    ).input_ids.to(device)
    with torch.no_grad():
        last_hidden_states = t5_model(input_ids=input_ids).last_hidden_state
    return torch.mean(last_hidden_states, dim=1).detach().cpu().numpy().flatten()


# Document embeddings keyed by docno, query embeddings keyed by qid. Both go
# through the same helper instead of two copies of the same loop body.
t5_doc = {
    line['docno']: t5_mean_embedding(line['keluhan_raw'])
    for _, line in data.iterrows()
}
t5_query = {
    line['qid']: t5_mean_embedding(line['query_raw'])
    for _, line in queries.iterrows()
}

# Pretrained IndoBERT

In [None]:
from transformers import AutoTokenizer, BertModel
bert_tokenizer = AutoTokenizer.from_pretrained("stevenwh/indobert-base-p2-finetuned-mer-80k")
bert_model = BertModel.from_pretrained("stevenwh/indobert-base-p2-finetuned-mer-80k").to(device)


def bert_pooled_embedding(text):
    """Return the BERT pooler output for a single text.

    The raw text is passed to the tokenizer, which adds the [CLS] / [SEP] special
    tokens itself. Wrapping the text in literal "[CLS] ... [SEP]" on top of that
    produced a doubled pair of special tokens and spent part of the 512-token
    budget on them.
    NOTE(review): this changes the resulting vectors compared with earlier exports;
    regenerate any artefacts that were built from the old embeddings.

    Args:
        text: The string to embed.

    Returns:
        1-D NumPy array with the pooler output of the only sequence in the batch.
    """
    inputs = bert_tokenizer(
        text, return_tensors='pt', truncation=True, max_length=512
    ).to(device)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.pooler_output[0].detach().cpu().numpy().flatten()


# Document embeddings keyed by docno, query embeddings keyed by qid. Both go
# through the same helper instead of two copies of the same loop body.
bert_doc = {
    line['docno']: bert_pooled_embedding(line['keluhan_raw'])
    for _, line in data.iterrows()
}
bert_query = {
    line['qid']: bert_pooled_embedding(line['query_raw'])
    for _, line in queries.iterrows()
}

# Export

In [None]:
# Write every embedding dictionary to its own pickle file, in a fixed order:
# sentence-transformer, T5, then BERT -- queries before documents for each model.
embedding_exports = (
    ('pminilm_query_26-03-24.pickle', pminilm_query),
    ('pminilm_data_26-03-24.pickle', pminilm_doc),
    ('t5_query_26-03-24.pickle', t5_query),
    ('t5_data_26-03-24.pickle', t5_doc),
    ('bert_query_26-03-24.pickle', bert_query),
    ('bert_data_26-03-24.pickle', bert_doc),
)
for target_path, embeddings in embedding_exports:
    with open(target_path, 'wb') as sink:
        pickle.dump(embeddings, sink, protocol=pickle.HIGHEST_PROTOCOL)