## Model Download

Dapprima scarichiamo il modello: [PROP_step400k base](https://drive.google.com/file/d/1aw0s1UK8PvZCI9R8hA9b7kxoN0x35kRr/view?usp=sharing)

Poi scarichiamo i dati per **MS MARCO** dal sito ufficiale TREC [website](https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs.tsv.gz).

Ed infine scarichiamo BERT: https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz

**Il codice seguente è esemplificativo a supporto della spiegazione, non è creato per essere eseguito**

In [31]:
import torch
from tqdm import tqdm
import pandas as pd
import json

from pytorch_pretrain_bert.modeling import PROP, BertConfig
from pytorch_pretrain_bert.tokenization import BertTokenizer
from pytorch_pretrain_bert.optimization import BertAdam, warmup_linear

In [None]:
!head -n 2000 MSMARCO-Inputs/msmarco-docs.tsv > MSMARCO-Inputs/msmarco-docs-short.tsv

In [32]:
bert_type = "bert-base-uncased"
prop_path = "./prop_msmarco_step400k_base.bin"

In [33]:
model = PROP.from_pretrained(bert_type)
tokenizer = BertTokenizer.from_pretrained(bert_type)

In [40]:
df = pd.read_csv("./MSMARCO-Inputs/msmarco-docs-short.tsv", delimiter="\t", header=None)

with open("./preprocessed_data", "w") as preprocessed_data:
    for i in tqdm(range(len(df))):
        row = df.iloc[i]
        docid = row[0]
        doc_text = row[3]
        if pd.isna(doc_text):
            continue
        bert_tokenized_doc_text = tokenizer.tokenize(row[3])
        line = {"id": docid, "bert_tokenized_doc_text": bert_tokenized_doc_text}
        preprocessed_data.write(json.dumps(line)+"\n")


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:02<00:00, 70.50it/s]


In [None]:
!export INPUT_FILE=./preprocessed_data
!export Bert_MODEL_DIR=bert-base-uncased
!export OUTPUT=./output

!python prop_pretraining/multiprocessing_generate_word_sets.py \
    --train_corpus $INPUT_FILE  \
    --do_lower_case \
    --bert_model $Bert_MODEL_DIR \
    --output_dir $OUTPUT \
    --epochs_to_generate 1 \
    --possion_lambda 3 \
    --rop_num_per_doc 10 \
    --num_workers 20 \
    --reduce_memory

In [None]:
def convert_example_to_features(example, tokenizer, max_seq_length):
    label = example["label"]
    tokens = example["tokens"]
    segment_ids = example["segment_ids"]
    masked_lm_labels = example["masked_lm_labels"]
    masked_lm_positions = example["masked_lm_positions"]

    
    assert len(tokens) == len(segment_ids) <= max_seq_length  # The preprocessed data should be already truncated
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    masked_label_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels)

    input_array = np.zeros(max_seq_length, dtype=np.int)
    input_array[:len(input_ids)] = input_ids

    mask_array = np.zeros(max_seq_length, dtype=np.int)
    mask_array[:len(input_ids)] = 1

    segment_array = np.zeros(max_seq_length, dtype=np.int)
    segment_array[:len(segment_ids)] = segment_ids

    lm_label_array = np.full(max_seq_length, dtype=np.int, fill_value=-1)
    lm_label_array[masked_lm_positions] = masked_label_ids

    features = InputFeatures(input_ids=input_array,
                             input_mask=mask_array,
                             segment_ids=segment_array,
                             lm_label_ids=lm_label_array,
                             label=label
                             )
    return features

In [None]:
model.load_state_dict(torch.load(prop_path))
model.eval()

In [None]:
input_ids, input_mask, segment_ids, label, lm_label_ids
pred = model(input_ids, segment_ids, input_mask, lm_label_ids, label)

### Passage Reranking

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained("amberoad/bert-multilingual-passage-reranking-msmarco")

model = AutoModelForSequenceClassification.from_pretrained("amberoad/bert-multilingual-passage-reranking-msmarco")

In [33]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [50]:
from tqdm import tqdm
import pandas as pd
import json
import torch

In [51]:
df = pd.read_csv("./MSMARCO-Inputs/msmarco-docs-short.tsv", delimiter="\t", header=None)

In [62]:
for i in range(len(df)):
    row = df.iloc[i]
    query = row[2]
    text = row[3][:1000]
    encoded_input = torch.tensor(tokenizer.encode(text=query, text_pair=text, add_special_tokens=True, truncation=True)).unsqueeze(0)
    output = model(encoded_input)
    break
    

In [65]:
output

SequenceClassifierOutput(loss=None, logits=tensor([[ 3.9339, -2.9712]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)