# ==跟 Google Drive 連線==

In [None]:
# Mount Google Drive at /content/drive; the CSV paths and the checkpoint
# directory configured later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# ==看一下 GPU 資訊==

In [None]:
# Print the GPU model, driver/CUDA versions and current memory usage of this runtime.
!nvidia-smi

Sat Jan  8 11:48:19 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P8    28W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# ==安裝 Hugging Face Transformers==

In [None]:
# Install Hugging Face Transformers into the runtime (the recorded run resolved to version 4.15.0).
!pip install transformers

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 12.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 41.0 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 35.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 486 kB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 36.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Atte

# ==匯入需要的套件==

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.nn.utils.rnn as rnn_utils
import torch.optim as optim

from transformers import BertTokenizer
from transformers import BertForSequenceClassification

import tensorflow as tf
from tensorflow import keras

print(tf.version.VERSION)

import os
import shutil
import numpy as np
import pandas as pd
from tqdm import tqdm
import time
import random

2.7.0


# ==定義參數==

In [None]:
# Data settings (adjust the paths to your own Drive layout)
train_csv_path = '/content/drive/MyDrive/IR_hw6/Datas/train_queries_with_BM25.csv'
doc_csv_path = '/content/drive/MyDrive/IR_hw6/Datas/documents.csv'
test_csv_path = '/content/drive/MyDrive/IR_hw6/Datas/test_queries_with_BM25.csv'
# Input settings
max_seq_len = 512
num_neg_docs = 5 # number of negative samples generated for each positive sample

# Model settings
model_name = 'bert-base-uncased'
n_class = 2

# Training settings
learning_rate = 1e-5
batch_size = 12
epochs = 2
num_workers = 2

# Number of queries used for training and for validation
num_train_query = 95
num_dev_query = 25

# Directory where model checkpoints are saved (adjust the path as needed);
# the directory name encodes the negative-sampling ratio.
weights_dir = '/content/drive/MyDrive/IR_hw6/weights_NEG{}'.format(num_neg_docs)

# ==設定 Random Seed==

In [None]:
# Seed Python's `random`, NumPy and PyTorch from one shared value so that
# random draws (e.g. negative sampling via `random.choice`) start from a fixed state.
SEED = 8888
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f73b866fd70>

# ==設定要用哪張GPU==

In [None]:
# List every CUDA device visible to PyTorch, framed by separator lines.
separator = '-' * 30
print(separator)
print('All GPU:')
for gpu_index in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_name(gpu_index))
print(separator)

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
# To force CPU execution, assign torch.device("cpu") instead.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

------------------------------
All GPU:
Tesla K80
------------------------------
cuda:0


# ==讀入訓練資料==

In [None]:
# The first `num_train_query` rows are used for training and the
# `num_dev_query` rows that follow them are held out for validation.
data_df = pd.read_csv(train_csv_path)
train_data_df = data_df[:num_train_query]
dev_data_df = data_df[num_train_query:num_train_query+num_dev_query]

train_data_df

Unnamed: 0,query_id,query_text,pos_doc_ids,bm25_top1000_docs,bm25_top1000_scores
0,381,altern medicin,FBIS3-22608 FBIS3-22653 FBIS3-46348 FBIS3-6040...,FR940819-2-00077 FBIS4-35025 FR941122-2-00061 ...,15.822446161346797 15.180961820914565 14.94272...
1,644,exot anim import,FR940317-1-00207 FR940317-1-00208 FR940317-1-0...,LA070190-0086 FT921-16211 FR940628-1-00012 LA0...,22.126743667831445 21.61858839336181 21.482188...
2,677,lean tower of pisa,FT921-8941 FT922-11710 FT923-14708 FT923-14710...,LA102089-0186 FT943-5813 FT921-8941 FT941-2727...,34.26448694760154 33.24805915650945 32.3726333...
3,698,literaci rate africa,FT941-12258 FT942-5993 FT943-11957,FT943-5793 LA101890-0069 LA061990-0091 FBIS3-3...,20.326126551642087 18.868811067290643 18.06107...
4,373,encrypt equip export,FR940111-2-00007 FR940111-2-00008 FR940111-2-0...,FT933-10240 FT943-5307 FR940902-0-00008 FT943-...,25.967172532524998 23.009039311307234 22.97709...
...,...,...,...,...,...
90,337,viral hepat,FBIS3-22547 FBIS3-22556 FBIS3-22614 FBIS3-2262...,FR940527-1-00166 FBIS3-22685 FBIS3-22624 FBIS3...,32.3187191267442 30.778449056696267 28.8151993...
91,360,drug legal benefit,FBIS3-10289 FBIS3-10398 FBIS3-10996 FBIS3-1121...,LA032590-0032 LA031289-0044 FT943-12194 FBIS3-...,16.022772043683783 15.87243830742991 14.796182...
92,649,comput virus,FBIS3-40468 FBIS3-42979 FBIS3-43017 FBIS4-5044...,FT944-9024 FBIS4-50440 FBIS4-54715 LA101189-00...,23.171366575635773 23.09406263078247 22.894454...
93,375,hydrogen energi,FBIS3-20985 FBIS3-22869 FBIS3-24648 FBIS3-4038...,FBIS4-20560 FBIS4-66430 FBIS4-44635 FT944-1655...,20.626687935565684 20.12872077634161 20.002341...


In [None]:
dev_data_df

Unnamed: 0,query_id,query_text,pos_doc_ids,bm25_top1000_docs,bm25_top1000_scores
95,398,dismantl europ s arsen,FBIS3-16615 FBIS3-17892 FBIS3-19056 FBIS3-1994...,FT934-1218 FBIS3-39268 FT943-10873 FBIS3-59919...,22.316174026113885 19.94567063881363 19.464154...
96,683,czechoslovakia breakup,FBIS3-20851 FBIS3-30249 FBIS3-30856 FBIS3-4911...,FBIS4-32410 FBIS3-21555 LA081789-0098 LA112389...,20.396065544688177 16.88329613293826 14.520760...
97,387,radioact wast,FBIS3-15634 FBIS3-1564 FBIS3-17250 FBIS3-19535...,FBIS4-64772 FBIS4-67522 FR940919-0-00103 FR940...,21.36187632757415 21.267090779562537 21.154831...
98,319,new fuel sourc,FBIS3-19535 FBIS3-20666 FBIS3-20668 FBIS3-2067...,FBIS4-67889 FR940406-2-00138 FBIS3-29355 FBIS3...,13.74872146079385 13.620818087242752 13.560692...
99,611,kurd germani violenc,FBIS3-18686 FBIS3-19768 FBIS3-19811 FBIS3-1988...,FBIS3-20083 FBIS3-20057 FBIS3-20295 FBIS4-1822...,28.649956448857615 28.22656258250029 25.640005...
100,443,u s invest africa,FBIS3-1408 FBIS3-43135 FBIS3-43189 FBIS3-43485...,FT932-11100 FT924-6026 FBIS4-47993 FBIS4-18481...,13.891306248401968 13.64215715687617 13.159654...
101,448,ship loss,FT911-1248 FT921-11935 FT922-15088 FT922-1727 ...,FT943-1368 FT934-3853 FT931-11047 FT921-6247 F...,14.330006180850999 14.307175400947706 14.00773...
102,691,clear cut forest,FBIS3-22240 FBIS3-22442 FBIS4-67351 FR940127-1...,FBIS3-22240 FT922-4550 LA101890-0053 LA100990-...,20.7056077577736 19.777580960469823 19.0764510...
103,445,women clergi,FT921-11838 FT921-11847 FT921-11857 FT921-4820...,FT931-11616 FT942-2917 FT943-15291 LA020390-00...,22.386429898267224 21.530488033244307 20.62948...
104,419,recycl automobil tire,FBIS3-24648 FBIS3-42464 FBIS3-43002 FBIS3-4304...,FBIS3-41645 LA040190-0174 FBIS4-67400 FR940916...,28.02024572876838 23.90712475239058 22.5986694...


In [None]:
# Load the document collection; the preview below shows `doc_id` and `doc_text` columns.
documents_df = pd.read_csv(doc_csv_path) 
documents_df

Unnamed: 0,doc_id,doc_text
0,FBIS3-22608,languag f p 105 thai f articl type bfn text th...
1,FBIS3-22653,languag f p 105 ukrainian f articl type bfn f ...
2,FBIS3-46348,languag f p 105 english f articl type bfn f p ...
3,FBIS3-60406,languag f p 105 english f articl type bfn text...
4,FBIS4-20452,93 pp 290 296 f p 103 40091007d f f p 104 beij...
...,...,...
99995,FT911-4300,ft 19 apr 91 intern compani news pfizer advanc...
99996,FBIS3-42705,languag f p 105 russian f articl type cso f p ...
99997,FT942-11913,ft 04 may 94 world news in brief lubber seek b...
99998,FR940705-0-00015,execut order 12372 thi final rule is exclud fr...


In [None]:
import json

# Map each document ID (first column) to its text (second column); both are
# coerced to str. If an ID repeats, the last occurrence wins.
documents_dict = {
    str(record[0]): str(record[1])
    for record in documents_df.to_numpy()
}

print('Num documents: ', len(documents_dict))
print(json.dumps(documents_dict)[:100])

Num documents:  100000
{"FBIS3-22608": "languag f p 105 thai f articl type bfn text the deputi public health minist dr udom


# ==產生訓練正負樣本==

In [None]:
# Flat list of every known document ID, in dictionary insertion order.
documents_ID = [*documents_dict]

print('Num document IDs: ', len(documents_ID))
print(json.dumps(documents_ID)[:100])

Num document IDs:  100000
["FBIS3-22608", "FBIS3-22653", "FBIS3-46348", "FBIS3-60406", "FBIS4-20452", "FBIS4-45369", "FBIS4-51


* ### 隨機產生不在原本 pos_doc_list 裡的 doc 做負樣本

In [None]:
def gen_neg_docs(pos_docs_list, bm25_top1000_docs_list, documents_ID, num_neg_docs):
    """Sample negative (non-relevant) document IDs for one query.

    ``len(pos_docs_list) * num_neg_docs`` distinct IDs are produced by
    rejection sampling. Candidates are drawn from the BM25 list while the
    1-based draw index is below ``len(bm25_top1000_docs_list) - len(pos_docs_list)``;
    after that they are drawn from the whole collection.

    Args:
        pos_docs_list: IDs of the documents that are relevant to the query.
        bm25_top1000_docs_list: BM25 candidate IDs for the query. Must be
            non-empty, because its first element seeds the sampler.
        documents_ID: IDs of every document in the collection.
        num_neg_docs: Number of negatives to generate per positive.

    Returns:
        A list of unique document IDs, none of which is in ``pos_docs_list``.
        The first BM25 candidate is returned first whenever it is not a positive.

    Raises:
        IndexError: If ``bm25_top1000_docs_list`` is empty.
    """
    # Sets make the membership tests O(1); the list keeps the draw order.
    # The accept/reject decisions (and therefore the sequence of
    # random.choice calls) are exactly the same as with list membership.
    positives = set(pos_docs_list)
    chosen = set()
    neg_docs_list = []

    # Draw indices below this bound can still be served from the BM25 list.
    bm25_draw_limit = len(bm25_top1000_docs_list) - len(pos_docs_list)

    neg_doc = bm25_top1000_docs_list[0]  # initial candidate; replaced below if rejected
    for i in range(1, len(pos_docs_list) * num_neg_docs + 1):
        while neg_doc in positives or neg_doc in chosen:
            if i < bm25_draw_limit:
                neg_doc = random.choice(bm25_top1000_docs_list)  # hard negative from BM25
            else:
                neg_doc = random.choice(documents_ID)  # random negative from the collection

        neg_docs_list.append(neg_doc)
        chosen.add(neg_doc)

    return neg_docs_list

In [None]:
# Every (query, document, label) training example, positives and negatives alike.
train_pos_neg_data = []

for row_idx in tqdm(range(len(train_data_df))):
    query_row = train_data_df.iloc[row_idx]
    query_text = query_row['query_text']
    relevant_ids = query_row['pos_doc_ids'].split()        # gold relevant documents
    bm25_ids = query_row['bm25_top1000_docs'].split()      # BM25 candidate list

    # Positive examples: one per gold relevant document.
    train_pos_neg_data.extend(
        {'query': query_text, 'doc': documents_dict[doc_key], 'relevant': 1}
        for doc_key in relevant_ids
    )

    # Negative examples: sampled documents that are not in the gold list.
    sampled_ids = gen_neg_docs(
        pos_docs_list=relevant_ids,
        bm25_top1000_docs_list=bm25_ids,
        documents_ID=documents_ID,
        num_neg_docs=num_neg_docs
    )
    train_pos_neg_data.extend(
        {'query': query_text, 'doc': documents_dict[doc_key], 'relevant': 0}
        for doc_key in sampled_ids
    )

print('Num Train Data: ', len(train_pos_neg_data))

100%|██████████| 95/95 [00:01<00:00, 90.28it/s] 

Num Train Data:  36072





# ==定義 BERT 使用的 tokenizer==

In [None]:
# Load the tokenizer that belongs to the pretrained model named by `model_name`,
# then print its vocabulary size.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name
)
print("字典大小：", len(tokenizer.vocab))

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

字典大小： 30522


In [None]:
# Peek at the first 300 characters of the vocabulary (token -> id mapping) serialized as JSON.
print(json.dumps(tokenizer.vocab)[:300])

{"[PAD]": 0, "[unused0]": 1, "[unused1]": 2, "[unused2]": 3, "[unused3]": 4, "[unused4]": 5, "[unused5]": 6, "[unused6]": 7, "[unused7]": 8, "[unused8]": 9, "[unused9]": 10, "[unused10]": 11, "[unused11]": 12, "[unused12]": 13, "[unused13]": 14, "[unused14]": 15, "[unused15]": 16, "[unused16]": 17, 


# ==定義訓練 DataSet==

In [None]:
class TrainDataset(Dataset):
    """Dataset of labelled (query, document) pairs encoded as BERT inputs.

    Each item is laid out as ``[CLS] query [SEP] document [SEP]``. The document
    is truncated so that the whole sequence has at most ``max_seq_len`` tokens.

    Args:
        pos_neg_data: Sequence of dicts with the keys ``'query'``, ``'doc'``
            and ``'relevant'`` (the class label).
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_seq_len: Upper bound on the encoded sequence length.
    """

    def __init__(self, pos_neg_data, tokenizer, max_seq_len):
        self.pos_neg_data = pos_neg_data
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __len__(self):
        """Return the number of (query, document) pairs."""
        return len(self.pos_neg_data)

    def __getitem__(self, idx):
        """Encode the pair at ``idx``.

        Returns:
            ``(tokens_tensor, segments_tensor, masks_tensor, label_tensor)``,
            all ``torch.long``. Segment ids are 0 for the query part and 1
            for the document part; the mask is all ones (padding is added
            later, at batching time).
        """
        sample = self.pos_neg_data[idx]

        # Query tokens wrapped in [CLS] ... [SEP]
        query_wordpiece = ['[CLS]'] + self.tokenizer.tokenize(sample['query']) + ['[SEP]']
        query_len = len(query_wordpiece)

        # Document tokens, truncated so that query + document + the final
        # [SEP] never exceed max_seq_len.
        doc_wordpiece = self.tokenizer.tokenize(
            sample['doc'])[:(self.max_seq_len-query_len-1)] + ['[SEP]']
        doc_len = len(doc_wordpiece)

        wordpiece = query_wordpiece + doc_wordpiece
        tokens_tensor = torch.tensor(
            self.tokenizer.convert_tokens_to_ids(wordpiece), dtype=torch.long
        )

        segments_tensor = torch.tensor(
            [0]*query_len + [1]*doc_len, dtype=torch.long
        )

        masks_tensor = torch.tensor(
            [1]*(query_len+doc_len), dtype=torch.long
        )

        # Read the label from this dataset's own data, not from a notebook
        # global, so the class works for any list passed to the constructor.
        label_tensor = torch.tensor(
            sample['relevant'], dtype=torch.long
        )

        return tokens_tensor, segments_tensor, masks_tensor, label_tensor

In [None]:
def collate_fn(data):
    """Batch training samples, zero-padding the sequences to a common length.

    Args:
        data: List of ``(tokens, segments, masks, label)`` tuples.

    Returns:
        ``(tokens, segments, masks, labels)`` where the first three are
        batch-first padded tensors and ``labels`` is a 1-D tensor.
    """
    token_seqs, segment_seqs, mask_seqs, labels = zip(*data)

    def pad_batch(sequences):
        # Zero-pad on the right; output shape is (batch, longest_sequence).
        return rnn_utils.pad_sequence(list(sequences), batch_first=True)

    return (
        pad_batch(token_seqs),
        pad_batch(segment_seqs),
        pad_batch(mask_seqs),
        torch.tensor(list(labels)),
    )

In [None]:
# Wrap the sampled query/document pairs in a Dataset and serve them in shuffled
# batches; `collate_fn` pads every batch to the length of its longest sequence.
train_set = TrainDataset(
    pos_neg_data=train_pos_neg_data, 
    tokenizer=tokenizer,
    max_seq_len=max_seq_len
)
train_loader = DataLoader(
    dataset=train_set, 
    batch_size=batch_size, 
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=num_workers
)

print('Num Train Data: ', len(train_set))

Num Train Data:  36072


# ==初始化 BERT 模型==

In [None]:
# Load the pretrained encoder with a classification head of `n_class` outputs
# and move the model to the selected device.
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_name, 
    num_labels=n_class
).to(device)

model.config

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [None]:
# Summarize the model's top-level modules. The `bert` sub-module is expanded
# one level (names only); every other module is printed with its full repr.
rule = '-' * 20
print(rule)
print('{:10} | {:}'.format('Name', 'Module'))
print(rule)
for name, module in model.named_children():
    if name == 'bert':
        entries = [child_name for child_name, _ in module.named_children()]
    else:
        entries = [module]
    for entry in entries:
        print('{:11}: {}'.format(name, entry))

--------------------
Name       | Module
--------------------
bert       : embeddings
bert       : encoder
bert       : pooler
dropout    : Dropout(p=0.1, inplace=False)
classifier : Linear(in_features=768, out_features=2, bias=True)


# ==訓練模型==

* ## 定義優化方法

In [None]:
# AdamW over all model parameters, so the encoder is fine-tuned together with the classifier head.
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

* ## 定義訓練函數

In [None]:
def train_one_epoch(train_loader, model, epoch, num_epochs, optimizer, device):
    """Run one optimization pass over ``train_loader``.

    Args:
        train_loader: Yields ``(tokens, segments, masks, labels)`` batches.
        model: Model whose first output is the loss when ``labels`` is given.
        epoch: Current epoch number (used only for the progress bar text).
        num_epochs: Total number of epochs (used only for the progress bar text).
        optimizer: Optimizer stepping the model parameters.
        device: Device the batch tensors are moved to.

    Returns:
        The batch-size-weighted sum of batch losses divided by
        ``len(train_loader.dataset)``.
    """
    model.train()  # enable training-mode layers such as dropout
    total_loss = 0.0

    progress = tqdm(enumerate(train_loader), total=len(train_loader))
    for _, batch in progress:
        input_ids, token_type_ids, attention_mask, labels = (
            tensor.to(device) for tensor in batch
        )

        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()

        with torch.set_grad_enabled(True):
            # Forward pass; with labels supplied the first output is the loss.
            loss = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels,
            )[0]

            # Back-propagate and update the parameters.
            loss.backward()
            optimizer.step()

        batch_loss = loss.item()
        # Weight by batch size so a shorter final batch is averaged correctly.
        total_loss += batch_loss * input_ids.size(0)

        progress.set_description(f"[Train Epoch {epoch}/{num_epochs}]")
        progress.set_postfix(loss=batch_loss)

    return total_loss / len(train_loader.dataset)

In [None]:
def train_model(model, train_loader, optimizer, device, num_epochs, file_path):
    """Train for ``num_epochs`` epochs, saving a checkpoint after each one.

    After every epoch the model's ``state_dict`` is written to
    ``<file_path>/checkpoint_<epoch>.pth`` and the elapsed time and epoch
    loss are printed.

    Args:
        model: Model to train (updated in place).
        train_loader: Training batches, forwarded to ``train_one_epoch``.
        optimizer: Optimizer, forwarded to ``train_one_epoch``.
        device: Device, forwarded to ``train_one_epoch``.
        num_epochs: Number of epochs to run.
        file_path: Existing directory that receives the checkpoints.
    """
    started_at = time.time()
    heavy_rule = '=' * 70
    light_rule = '-' * 70

    def elapsed_text(template):
        # Fill a "{minutes} {seconds}" template with the time since start.
        seconds = time.time() - started_at
        return template.format(seconds // 60, seconds % 60)

    for epoch in range(1, num_epochs + 1):
        for line in (heavy_rule, 'Epoch {}/{}'.format(epoch, num_epochs), heavy_rule):
            print(line)

        epoch_loss = train_one_epoch(
            train_loader=train_loader,
            model=model,
            epoch=epoch,
            num_epochs=num_epochs,
            optimizer=optimizer,
            device=device
        )

        print(elapsed_text('Time elapsed {:.0f}m {:.0f}s'))
        print('Loss: {:.4f}'.format(epoch_loss))
        print(light_rule)

        # Persist this epoch's weights.
        checkpoint_path = os.path.join(file_path, 'checkpoint_{}.pth'.format(epoch))
        torch.save(model.state_dict(), checkpoint_path)

        print(light_rule)
        print('*' * 70)
        print('\n\n\n')

    print(elapsed_text('Training complete in {:.0f}m {:.0f}s'))

In [None]:
# Create the checkpoint directory from scratch.
# WARNING: an existing directory is deleted first, so checkpoints from a
# previous run with the same `weights_dir` are lost.
if os.path.exists(weights_dir):
    shutil.rmtree(weights_dir)
    
os.mkdir(weights_dir)


# Train the model; one checkpoint per epoch is written into `weights_dir`.
train_model(
    model=model, 
    train_loader=train_loader, 
    optimizer=optimizer, 
    device=device, 
    num_epochs=epochs, 
    file_path=weights_dir
)

Epoch 1/2


[Train Epoch 1/2]: 100%|██████████| 3280/3280 [1:51:12<00:00,  2.03s/it, loss=0.0953]


Time elapsed 111m 13s
Loss: 0.2999
----------------------------------------------------------------------
----------------------------------------------------------------------
**********************************************************************




Epoch 2/2


[Train Epoch 2/2]: 100%|██████████| 3280/3280 [1:51:37<00:00,  2.04s/it, loss=0.258]


Time elapsed 222m 53s
Loss: 0.1984
----------------------------------------------------------------------
----------------------------------------------------------------------
**********************************************************************




Training complete in 222m 56s


In [None]:
# This cell used to hold a TensorFlow/Keras snippet (model.save_weights,
# create_model, model.evaluate on test_images) that cannot run against the
# PyTorch model trained above: it raised AttributeError and stopped
# "Restart & Run All". The weights are already persisted per epoch by
# train_model() via torch.save(model.state_dict(), ...), so this cell now only
# verifies that every expected checkpoint file exists and lists the directory.
saved_checkpoints = sorted(os.listdir(weights_dir))
missing_checkpoints = [
    'checkpoint_{}.pth'.format(epoch_idx)
    for epoch_idx in range(1, epochs + 1)
    if 'checkpoint_{}.pth'.format(epoch_idx) not in saved_checkpoints
]
assert not missing_checkpoints, 'Missing checkpoints: {}'.format(missing_checkpoints)
saved_checkpoints

AttributeError: ignored

# ==驗證模型==

* ### 載入驗證資料

In [None]:
# One record per (dev query, BM25 candidate document) pair to be scored by BERT.
dev_data = []

for row_idx in range(len(dev_data_df)):
    query_row = dev_data_df.iloc[row_idx]
    dev_data.extend(
        {
            'query_id': query_row['query_id'],
            'query_text': query_row['query_text'],
            'doc_id': candidate_id,
            'doc_text': documents_dict[candidate_id]
        }
        for candidate_id in query_row['bm25_top1000_docs'].split()
    )
print(dev_data[0]) 
print('Num Dev Data: ', len(dev_data))

* ### 定義驗證 Dataset

In [None]:
class EvalDataset(Dataset):
    """Dataset of unlabelled (query, document) pairs encoded as BERT inputs.

    Each record must provide ``'query_id'``, ``'query_text'``, ``'doc_id'``
    and ``'doc_text'``. Items are laid out as ``[CLS] query [SEP] document [SEP]``
    with the document truncated to keep the total within ``max_seq_len``.
    """

    def __init__(self, data, tokenizer, max_seq_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __len__(self):
        """Return the number of (query, document) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(query_id, doc_id, tokens, segments, masks)`` for ``idx``.

        The three tensors are ``torch.long``; segment ids are 0 for the query
        part and 1 for the document part, and the mask is all ones.
        """
        record = self.data[idx]
        query_id = record['query_id']

        # Query part: [CLS] + query tokens + [SEP]
        query_tokens = ['[CLS]'] + self.tokenizer.tokenize(record['query_text']) + ['[SEP]']
        num_query = len(query_tokens)

        doc_id = record['doc_id']

        # Document part: keep only as many tokens as fit next to the query
        # and the closing [SEP].
        doc_budget = self.max_seq_len - num_query - 1
        doc_tokens = self.tokenizer.tokenize(record['doc_text'])[:doc_budget] + ['[SEP]']
        num_doc = len(doc_tokens)

        token_ids = self.tokenizer.convert_tokens_to_ids(query_tokens + doc_tokens)
        segment_ids = [0] * num_query + [1] * num_doc
        attention = [1] * (num_query + num_doc)

        return (
            query_id,
            doc_id,
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(segment_ids, dtype=torch.long),
            torch.tensor(attention, dtype=torch.long),
        )

In [None]:
def eval_collate_fn(data):
    """Batch evaluation samples, zero-padding the sequences to a common length.

    Args:
        data: List of ``(query_id, doc_id, tokens, segments, masks)`` tuples.

    Returns:
        ``(query_ids, doc_ids, tokens, segments, masks)`` where the ids are
        plain lists and the tensors are batch-first and padded.
    """
    query_ids, doc_ids, token_seqs, segment_seqs, mask_seqs = zip(*data)

    def pad_batch(sequences):
        # Zero-pad on the right; output shape is (batch, longest_sequence).
        return rnn_utils.pad_sequence(list(sequences), batch_first=True)

    return (
        list(query_ids),
        list(doc_ids),
        pad_batch(token_seqs),
        pad_batch(segment_seqs),
        pad_batch(mask_seqs),
    )

In [None]:
# Serve the dev pairs in their original order (shuffle=False); the loader is
# used for prediction only.
eval_set = EvalDataset(
    data=dev_data, 
    tokenizer=tokenizer,
    max_seq_len=max_seq_len
)
eval_loader = DataLoader(
    dataset=eval_set,
    batch_size=batch_size, 
    shuffle=False, 
    collate_fn=eval_collate_fn,
    num_workers=num_workers
)

print('Num Eval Data: ', len(eval_set))

* ### 定義預測函數

In [None]:
def get_predict_result(data_loader, data_df, model, device):
    """Score every (query, document) pair served by ``data_loader``.

    Args:
        data_loader: Yields ``(query_ids, doc_ids, tokens, segments, masks)``
            batches (see ``eval_collate_fn``).
        data_df: DataFrame with a ``query_id`` column; one result bucket is
            created per value in that column.
        model: Classifier whose first output holds per-class scores with the
            relevant class at index 1.
        device: Device the batch tensors are moved to.

    Returns:
        ``{query_id (int): {doc_id: score}}`` where ``score`` is the raw model
        output for class index 1 as a Python float.
    """
    predict_result = {int(key): {} for key in data_df['query_id'].to_numpy()}

    model.eval()  # inference mode: disables dropout
    loop = tqdm(enumerate(data_loader), total=len(data_loader))
    loop.set_description("[Predict]")
    for batch_idx, (query_id, doc_id, tokens_tensor, segments_tensor, masks_tensor) in loop:

        tokens_tensor = tokens_tensor.to(device)
        segments_tensor = segments_tensor.to(device)
        masks_tensor = masks_tensor.to(device)

        # No gradients are needed for prediction.
        with torch.no_grad():
            outputs = model(
                input_ids=tokens_tensor,
                token_type_ids=segments_tensor,
                attention_mask=masks_tensor,
            )[0]

        # Move the class-1 column to the host once per batch instead of once
        # per sample; tolist() yields plain Python floats.
        relevance_scores = outputs[:, 1].cpu().tolist()

        for qid, did, score in zip(query_id, doc_id, relevance_scores):
            # Normalize the key the same way the buckets were created above.
            predict_result[int(qid)][did] = score

    return predict_result

* ### 進行預測

In [None]:
# Load the fine-tuned weights saved after epoch 1 and score the dev pairs.
# NOTE(review): training runs for `epochs` = 2, so a checkpoint_2.pth also
# exists — confirm that evaluating the epoch-1 checkpoint is intentional.
checkpoint = torch.load(os.path.join(weights_dir, 'checkpoint_1.pth'), map_location=device)
model.load_state_dict(checkpoint)
model.to(device)

bert_predict_result = get_predict_result(
    data_loader=eval_loader,
    data_df=dev_data_df,
    model=model, 
    device=device
)

* ### 拿到答案

In [None]:
# Gold answers: dev query id -> list of relevant document IDs.
ground_true = {
    int(query_row['query_id']): query_row['pos_doc_ids'].split()
    for query_row in (dev_data_df.iloc[row_idx] for row_idx in range(len(dev_data_df)))
}

print('Num Answers: ', len(ground_true))

* ### 拿到 BM25 的預測分數

In [None]:
# BM25 scores for the dev queries: query id -> {doc id: score}.
bm25_predict_result = {int(key): {} for key in dev_data_df['query_id'].to_numpy()}

for row_idx in range(len(dev_data_df)):
    query_row = dev_data_df.iloc[row_idx]
    scores_by_doc = bm25_predict_result[int(query_row['query_id'])]
    candidate_ids = query_row['bm25_top1000_docs'].split()
    raw_scores = query_row['bm25_top1000_scores'].split()

    for position, candidate_id in enumerate(candidate_ids):
        score = float(raw_scores[position])
        # A NaN score is stored as 0 so it cannot poison the later ranking.
        scores_by_doc[candidate_id] = 0 if np.isnan(score) else score

print('Num BM25 Results: ', len(bm25_predict_result))

* ### 把 BM25, BERT 預測結果結合在一起，並進行重新排序

In [None]:
def get_ranking_result(alpha, bm25_predict_result, bert_predict_result):
    """Re-rank each query's documents by ``BM25 + alpha * BERT``.

    Args:
        alpha: Weight applied to the BERT score.
        bm25_predict_result: ``{query_id: {doc_id: bm25_score}}``.
        bert_predict_result: ``{query_id: {doc_id: bert_score}}``; must contain
            every document listed for the query in ``bm25_predict_result``.

    Returns:
        ``{query_id: [doc_id, ...]}`` with documents ordered from the highest
        to the lowest combined score.
    """
    reranked = {}

    for query_id, bm25_by_doc in bm25_predict_result.items():
        doc_ids = list(bm25_by_doc.keys())

        bm25_scores = np.array([bm25_by_doc[doc_id] for doc_id in doc_ids])
        bert_scores = np.array(
            [bert_predict_result[query_id][doc_id] for doc_id in doc_ids]
        )

        # Linear interpolation of the two scores.
        final_score = bm25_scores + alpha * bert_scores

        # argsort is ascending, so reverse it to get best-first order.
        best_first = final_score.argsort()[::-1]
        reranked[query_id] = [doc_ids[position] for position in best_first]

    return reranked

In [None]:
# Install ml_metrics, which provides the `mapk` function imported by the MAP cell below.
!pip install ml_metrics

* ### 計算重新排序後的 MAP 分數

In [None]:

from ml_metrics import mapk
def get_MAP(ground_true, ranking_result, k=1000):
    """Compute Mean Average Precision at ``k`` over all queries.

    ``mapk`` expects two parallel lists of lists (one inner list per query).
    The queries are therefore aligned explicitly by query id, and every query
    is scored rather than only the first one.

    Args:
        ground_true: ``{query_id: [relevant_doc_id, ...]}`` gold answers.
        ranking_result: ``{query_id: [doc_id, ...]}`` ranked predictions
            (BM25 + alpha * BERT); must contain every key of ``ground_true``.
        k: Rank cut-off passed to ``mapk``.

    Returns:
        The MAP@k score as a float.

    Raises:
        KeyError: If a query in ``ground_true`` has no ranking.
    """
    query_ids = list(ground_true.keys())
    actual = [ground_true[query_id] for query_id in query_ids]
    predicted = [ranking_result[query_id] for query_id in query_ids]

    MAP = mapk(actual, predicted, k=k)
    return MAP

* ### 找出最佳的 BERT Weight Alpha

In [None]:
def find_best_alpha(bm25_predict_result, bert_predict_result, ground_true):
    """Grid-search the BERT weight that maximizes MAP.

    Alphas from 0.0 (inclusive) to 10.0 (exclusive) in steps of 0.01 are
    tried in increasing order; the first alpha reaching the highest MAP wins.

    Returns:
        ``(best_alpha, best_MAP)``; both stay 0 if no alpha scores above 0.
    """
    best_alpha, best_MAP = 0, 0
    candidate_alphas = np.arange(0.0, 10.0, 0.01)

    for alpha in tqdm(candidate_alphas):
        MAP = get_MAP(
            ground_true=ground_true,
            ranking_result=get_ranking_result(
                alpha=alpha,
                bm25_predict_result=bm25_predict_result,
                bert_predict_result=bert_predict_result
            )
        )

        # Only a strict improvement replaces the current best.
        if not MAP > best_MAP:
            continue
        best_alpha, best_MAP = alpha, MAP

    return best_alpha, best_MAP

In [None]:
# Search the BERT weight on the dev queries and report the best setting.
best_alpha, best_MAP = find_best_alpha(
    bm25_predict_result=bm25_predict_result, 
    bert_predict_result=bert_predict_result,
    ground_true=ground_true
)

print('Best Alpha: ', best_alpha)
print('Best MAP: ', best_MAP)

In [None]:
test_data_df = pd.read_csv(test_csv_path)

# One record per (test query, BM25 candidate document) pair to be scored by BERT.
test_data = []

for row_idx in range(len(test_data_df)):
    query_row = test_data_df.iloc[row_idx]
    test_data.extend(
        {
            'query_id': query_row['query_id'],
            'query_text': query_row['query_text'],
            'doc_id': candidate_id,
            'doc_text': documents_dict[candidate_id]
        }
        for candidate_id in query_row['bm25_top1000_docs'].split()
    )
print(test_data[0]) 
print('Num test Data: ', len(test_data))

In [None]:
# Serve the test pairs in their original order (shuffle=False) for prediction.
test_set = EvalDataset(
    data=test_data, 
    tokenizer=tokenizer,
    max_seq_len=max_seq_len
)
test_loader = DataLoader(
    dataset=test_set,
    batch_size=batch_size, 
    shuffle=False, 
    collate_fn=eval_collate_fn,
    num_workers=num_workers
)

print('Num Test Data: ', len(test_set))

In [None]:
# Score the test pairs with the currently loaded model.
# NOTE: this rebinds `bert_predict_result`, replacing the dev-set scores
# computed earlier; re-run the dev prediction cell if those are needed again.
bert_predict_result = get_predict_result(
    data_loader=test_loader,
    data_df=test_data_df,
    model=model, 
    device=device
)

In [None]:
# BM25 scores for the test queries: query id -> {doc id: score}.
# This rebinds `bm25_predict_result`, replacing the dev-set scores.
bm25_predict_result = {int(key): {} for key in test_data_df['query_id'].to_numpy()}

for row_idx in range(len(test_data_df)):
    query_row = test_data_df.iloc[row_idx]
    scores_by_doc = bm25_predict_result[int(query_row['query_id'])]
    candidate_ids = query_row['bm25_top1000_docs'].split()
    raw_scores = query_row['bm25_top1000_scores'].split()

    for position, candidate_id in enumerate(candidate_ids):
        score = float(raw_scores[position])
        # A NaN score is stored as 0 so it cannot poison the later ranking.
        scores_by_doc[candidate_id] = 0 if np.isnan(score) else score

print('Num BM25 Results: ', len(bm25_predict_result))

In [None]:
# Re-rank the test queries with a fixed BERT weight of 4.25.
# NOTE(review): 4.25 is hard-coded instead of using `best_alpha` from the dev
# search — presumably copied from an earlier run; confirm it is still the
# intended value.
combined_prediction = get_ranking_result(4.25, bm25_predict_result, bert_predict_result)
type(combined_prediction)
list(combined_prediction.items())

In [None]:
# One submission line per query: "<query_id>," followed by each ranked
# document ID plus a single space (the trailing space is stripped on write).
str_list = [
    str(query_key) + "," + "".join(ranked_doc + " " for ranked_doc in ranked_docs)
    for query_key, ranked_docs in combined_prediction.items()
]
str_list

In [None]:
# Write the submission CSV. The context manager guarantees the file is flushed
# and closed, even if a write fails; the handle was previously left open.
out_path = "/content/drive/MyDrive/IR_hw6/submission.csv"
with open(out_path, "w") as f:
    f.write("Query,RetrievedDocuments\n")
    for str1 in str_list:
        # Drop the trailing space left after the last document ID.
        f.write(str1[:-1])
        f.write("\n")