In [5]:
# Reference implementation: https://github.com/naver/splade
# Setup: inside the `sparse` folder, run `git clone https://github.com/naver/splade` to create the `splade` folder.

import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
from splade.splade.models.transformer_rep import Splade
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import json

In [7]:
# Container class for the settings that the rest of the notebook relies on.
class args:
    """Notebook-wide configuration, read as class attributes (e.g. ``args.TOPK``)."""

    # --- input data: directory and TSV file names ---
    DATA_PATH = "../data"
    DOCUMENT_NAME = "DOC_NQ_first64.tsv"
    QUERY_TRAIN_NAME = "GTQ_NQ_train.tsv"
    QUERY_DEV_NAME = "GTQ_NQ_dev.tsv"

    # --- model ---
    # Identifier handed to Splade(...) and AutoTokenizer.from_pretrained(...);
    # see the model list below for alternatives.
    MODEL_TYPE_OR_DIR = "naver/splade_v2_max"

    # --- inference ---
    TOPK = 1000
    # When True, run inference on only 100 dev queries instead of all of them,
    # to save time.
    SHORT_INFERENCE = True

    # --- outputs ---
    SAVE_PATH = "../data/inference"
    PRED_SAVE_NAME = "SPLADE_Baseline.json"
    METRIC_SAVE_NAME = "SPLADE_Result.json"

# SPLADE comes in several versions. This notebook uses splade_v2_max, the base v2 model.
##### v2
# model_type_or_dir = "naver/splade_v2_max"
# model_type_or_dir = "naver/splade_v2_distil"
##### v2++
# model_type_or_dir = "naver/splade-cocondenser-selfdistil"
# model_type_or_dir = "naver/splade-cocondenser-ensembledistil"

In [8]:
# Load the data used in this notebook and preprocess it. See BM25_Baseline for details.

#======================== LOAD AND PREPROCESS ========================================#

# Read the document file and the train/dev query files as tab-separated tables.
# dtype=str keeps every column (including the ids) as strings.
document_corpus = pd.read_csv(f"{args.DATA_PATH}/{args.DOCUMENT_NAME}", sep="\t", dtype=str)
query_train_corpus = pd.read_csv(f"{args.DATA_PATH}/{args.QUERY_TRAIN_NAME}", sep="\t", dtype=str)
query_dev_corpus = pd.read_csv(f"{args.DATA_PATH}/{args.QUERY_DEV_NAME}", sep="\t", dtype=str)

def clean_text(text):
    """Normalise a piece of text.

    Deletes newlines, double backticks, double quotes and single quotes
    (in that order), then lower-cases the result and trims surrounding
    whitespace.
    """
    # The removal order is fixed: deleting one token can join neighbouring
    # characters into a later token (a backtick on each side of a newline
    # becomes a double backtick once the newline is gone).
    for unwanted in ("\n", "``", '"', "'"):
        text = text.replace(unwanted, "")
    lowered = text.lower()
    return lowered.strip()

# Apply clean_text to the text column of every frame.
# Note that the document text is also stored under the 'query' column name.
document_corpus['query'] = document_corpus['query'].apply(clean_text)
query_train_corpus['query'] = query_train_corpus['query'].apply(clean_text)
query_dev_corpus['query'] = query_dev_corpus['query'].apply(clean_text)

# Convert each frame to a dict mapping oldid -> cleaned text.
# From here on these three names refer to dicts, not DataFrames.
# NOTE(review): rows sharing an oldid collapse to one entry (the last row wins) —
# confirm this is intended for the query sets.
document_corpus = dict(zip(document_corpus["oldid"], document_corpus['query'])) 
query_train_corpus = dict(zip(query_train_corpus["oldid"], query_train_corpus['query'])) 
query_dev_corpus = dict(zip(query_dev_corpus["oldid"], query_dev_corpus['query'])) 

# Map positional index (insertion order of document_corpus) -> document oldid.
index2oldid = {index: oldid for index, oldid in enumerate(document_corpus.keys())}

#======================== LOAD AND PREPROCESS ========================================#

In [9]:
# Load the SPLADE model and its tokenizer from Hugging Face.
# This model has been pretrained on the MS MARCO dataset.

model = Splade(args.MODEL_TYPE_OR_DIR, agg="max")
model.eval()
tokenizer = AutoTokenizer.from_pretrained(args.MODEL_TYPE_OR_DIR)
# Inverted vocabulary: token id -> token string (used to label output dimensions).
reverse_voc = {v: k for k, v in tokenizer.vocab.items()}

In [23]:
#모델 추론 및 시각화를 위해 함수 하나를 정의합니다

def return_rep_result(query=None,
                      doc=None,
                      model=model,
                      reverse_voc=reverse_voc):
    """Encode one query or document with SPLADE and list its active vocabulary terms.

    Exactly one text is encoded: ``doc`` when it is given, otherwise ``query``.
    Both kinds of input go through the model's document branch
    (``d_kwargs`` / ``"d_rep"``); the query branch was disabled in the
    original code.

    Args:
        query: Query text; used only when ``doc`` is None.
        doc: Document text; takes precedence over ``query``.
        model: Callable SPLADE model. Defaults to the notebook-level ``model``
            that existed when this function was defined.
        reverse_voc: Mapping from vocabulary index to token string.

    Returns:
        Tuple ``(input_rep, bow_rep, n_terms)`` where ``input_rep`` is the
        squeezed model output for the text, ``bow_rep`` is a list of
        ``(token, weight)`` pairs sorted by descending weight (weights rounded
        to 2 decimals), and ``n_terms`` is ``len(bow_rep)``.

    Raises:
        ValueError: If both ``query`` and ``doc`` are None.
    """
    if doc is None and query is None:
        raise ValueError("return_rep_result needs either `query` or `doc`")

    # Always read the document-side output, whichever kind of text was passed.
    rep = "d_rep"
    _input = doc if doc is not None else query

    # Run the model without tracking gradients.
    # The module-level `tokenizer` is used here; it is not a parameter.
    with torch.no_grad():
        # Sparse representation in vocabulary space; the original comment
        # gives its shape as (30522,) — assumed 1-D below.
        input_rep = model(d_kwargs=tokenizer(_input, return_tensors="pt"))[rep].squeeze()

    # Indices of the non-zero entries. torch.nonzero on a 1-D tensor returns
    # shape (k, 1); squeeze only the last axis. A bare squeeze() collapses
    # k == 1 to a 0-d tensor, which turned `col` into a plain int and made
    # the zip below raise TypeError.
    active_idx = torch.nonzero(input_rep).squeeze(-1)
    col = active_idx.cpu().tolist()
    weights = input_rep[active_idx].cpu().tolist()

    # Pair each active index with its weight, highest weight first, and
    # translate the index into its vocabulary token.
    ranked = sorted(zip(col, weights), key=lambda pair: pair[1], reverse=True)
    bow_rep = [(reverse_voc[idx], round(weight, 2)) for idx, weight in ranked]

    return input_rep, bow_rep, len(bow_rep)

In [24]:
# As an example, feed a single document through the model and inspect the output.
# Most of the ~30K dimensions are 0, which makes this fast compared with dense models.

doc = "Glass and Thermal Stress. Thermal Stress is created when one area of a glass pane gets hotter than an adjacent area. If the stress is too great then the glass will crack. The stress level at which the glass will break is governed by several factors."
sparse_rep, bow_rep, length = return_rep_result(doc=doc)

print("number of actual dimensions: ", length)
print("SPLADE BOW rep:\n", bow_rep)
print(sparse_rep[1980:2030]) # Most values are 0; only entries above a certain importance are positive.

number of actual dimensions:  78
SPLADE BOW rep:
 [('thermal', 2.46), ('glass', 2.39), ('stress', 2.32), ('crack', 1.86), ('stressed', 1.56), ('glasses', 1.43), ('pan', 1.43), ('cause', 1.33), ('break', 1.33), ('too', 1.2), ('create', 1.18), ('created', 1.05), ('window', 1.04), ('meaning', 1.04), ('generated', 1.02), ('hot', 0.99), ('area', 0.97), ('shatter', 0.96), ('heat', 0.92), ('formed', 0.92), ('caused', 0.92), ('when', 0.89), ('why', 0.87), ('happen', 0.83), ('collapse', 0.8), ('strike', 0.76), ('produced', 0.69), ('result', 0.68), ('hotter', 0.67), ('adjacent', 0.67), ('cooler', 0.65), ('factor', 0.64), ('determined', 0.64), ('do', 0.64), ('level', 0.63), ('because', 0.62), ('fracture', 0.6), ('material', 0.59), ('if', 0.56), ('materials', 0.56), ('difference', 0.55), ('and', 0.53), ('it', 0.52), ('than', 0.51), ('one', 0.51), ('factors', 0.51), ('temperature', 0.5), ('form', 0.48), ('occur', 0.47), ('related', 0.46), ('generate', 0.44), ('at', 0.44), ('later', 0.43), ('frame',

In [27]:
# For our dataset, NQ320K, run the same steps on a randomly chosen query and its
# document to check that the results look similar.

# Random position in [0, 100) among the training queries.
# NOTE(review): no seed is set, so a different example is picked on each run.
target = np.random.randint(0,100)
# query_train_corpus maps oldid -> query text, so each item is (docid, query).
answer_docid, train_query = list(query_train_corpus.items())[target]
answer_document = document_corpus[answer_docid]

# Encode the query and its answer document.
query_sparse_rep, query_bow_rep, query_length = return_rep_result(query=train_query)
doc_sparse_rep, doc_bow_rep, doc_length = return_rep_result(doc=answer_document)

print(f"Query: {train_query}")
print(f"Answer DocID: {answer_docid}")
print(f"Answer Document: {answer_document}")

print("\n","="*40,"\n")

# Compare the number of active terms and the weighted terms of both sides.
print(f"query len: {query_length}, document len: {doc_length}")
print("query    rep: ",query_bow_rep)
print("document rep: ",doc_bow_rep)

Query: who has the most title wins in wwe
Answer DocID: 65
Answer Document: list of wwe champions  the wwe championship is a professional wrestling world heavyweight championship in wwe , currently on the smackdown brand . it is the first world title established in wwe , having been introduced in 1963 as the world wide wrestling federation ( wwwf ) world heavyweight championship . the promotion was renamed world wrestling federation ( wwf ) in 1979 and ended its affiliation with the national wrestling alliance ( nwa ) in 1983 , with the title also renamed to reflect the changes . in 2001 , it was unified with the world championship ( formerly the wcw world heavyweight championship ) following the wwf s buyout of world championship wrestling ( wcw ) and became the undisputed wwf championship . in 2002 , the wwf was renamed world wrestling entertainment ( wwe ) and split its roster into two brands , raw and smackdown . the title , now renamed wwe championship , was then designated to the

In [None]:
# Peek at a handful of entries rather than echoing the whole corpus dict,
# which would flood the notebook output.
list(document_corpus.items())[:5]