# Baseline
python: 3.8.*

Download our starter pack (3~5 min)

In [None]:
cd /content

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
### 第一次跑要下載資料到Google Drive


# !git clone https://github.com/s109703023/NCKU-AICUP2023-baseline.git
# !cd /content/drive/MyDrive/NCKU-AICUP2023-baseline/
# !gdown --folder 1tdf_i6XNYT6jpOA6F_lqU4mRRh1xYPcl
# !mv baseline/* ./

In [None]:
cd /content/drive/MyDrive/NCKU-AICUP2023-baseline

In [None]:
ls 

In [None]:
### 確定資料夾在NCKU、裡面有/data這樣

### 直接寫ls 才會藍藍的 (?)

In [None]:
ls data

In [None]:
%pip install -r requirements.txt

In [None]:
cd NCKU-AICUP2023-baseline

notebook1
## PART 1. Document retrieval
難度：★★★★☆

### Document retrieval 流程
- Step 1: 使用 Constituency Parser 找出 claim 中的 Noun Phrases (NPs）
- Step 2: 從 Wikipedia API 中取出和 NP 相符合的頁面名稱
- Step 3: 保留出現在句子 index 最靠前的五篇文章作為相關文章

Prepare the environment and import all the libraries we need.

In [None]:
# built-in libs
import json
import pickle
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Set, Tuple, Union

# 3rd party libs
import hanlp
import opencc
import pandas as pd
import wikipedia
from hanlp.components.pipeline import Pipeline
from pandarallel import pandarallel

# our own libs
from utils import load_json

pandarallel.initialize(progress_bar=True, verbose=0, nb_workers=10)
wikipedia.set_lang("zh")

Preload the data.

In [None]:
# Public train / test splits, read through the project helper `load_json`.
TRAIN_DATA = load_json("data/public_train.jsonl")
TEST_DATA = load_json("data/public_test.jsonl")
# OpenCC converters: Traditional -> Simplified and Simplified -> Traditional.
# They are used together by `do_st_corrections` for a round-trip normalisation.
CONVERTER_T2S = opencc.OpenCC("t2s.json")
CONVERTER_S2T = opencc.OpenCC("s2t.json")

Data class for type hinting

In [None]:
@dataclass
class Claim:
    """Type-hinting placeholder for a claim; wraps the claim text."""
    data: str

@dataclass
class AnnotationID:
    """Type-hinting placeholder for the first field of an evidence tuple."""
    id: int

@dataclass
class EvidenceID:
    """Type-hinting placeholder for the second field of an evidence tuple."""
    id: int

@dataclass
class PageTitle:
    """Type-hinting placeholder for the Wikipedia page title (evidence[2])."""
    title: str

@dataclass
class SentenceID:
    """Type-hinting placeholder for the sentence index in a page (evidence[3])."""
    id: int

@dataclass
class Evidence:
    """Type-hinting placeholder for the `evidence` field of a sample.

    The annotation describes a list of evidence sets, each a list of
    (annotation id, evidence id, page title, sentence id) tuples.
    """
    data: List[List[Tuple[AnnotationID, EvidenceID, PageTitle, SentenceID]]]

### Helper function

For the sake of consistency, we first convert Traditional Chinese to Simplified Chinese and then convert the result back to Traditional Chinese.  This is because some errors occur when converting Traditional Chinese directly to Traditional Chinese.

In [None]:
def do_st_corrections(text: str) -> str:
    """Normalise `text` with a Traditional -> Simplified -> Traditional round trip.

    Args:
        text: the string to normalise.

    Returns:
        The output of `CONVERTER_S2T` applied to the output of `CONVERTER_T2S`.
    """
    return CONVERTER_S2T.convert(CONVERTER_T2S.convert(text))

We use constituency parsing to split a sentence into its constituents (phrases labelled by syntactic category) and extract the noun phrases.  In the later stages, we will use these noun phrases as queries to search for relevant documents.  

![con](https://www.baeldung.com/wp-content/uploads/sites/4/2020/06/constituency_parse_tree-1.png)

##### 現在正在調這裡

In [None]:
def get_nps_hanlp(
    predictor: Pipeline,
    d: Dict[str, Union[int, Claim, Evidence]],
) -> List[str]:
    """Extract every noun phrase (NP) of a claim from its constituency tree.

    Args:
        predictor: callable pipeline whose output has a "con" entry holding
            the constituency tree of the input sentence.
        d: one sample; only `d["claim"]` is read.

    Returns:
        One string per subtree labelled "NP" (its leaves concatenated and
        passed through `do_st_corrections`), in the order `subtrees` yields them.
    """
    parse_tree = predictor(d["claim"])["con"]

    noun_phrases = []
    for node in parse_tree.subtrees(lambda t: t.label() == "NP"):
        surface = "".join(node.leaves())
        noun_phrases.append(do_st_corrections(surface))

    return noun_phrases

In [None]:
## add: filter that drops redundant (shorter, already-covered) entries
def filter_nps(nps:List[str])-> List[str]:
    """Drop every phrase that is a substring of the most recently kept phrase.

    NOTE(review): a later cell defines `filter_nps` again with an extra
    `bound` parameter; whichever cell ran last is the one in effect.

    Args:
        nps: noun phrases in parser order.

    Returns:
        The first phrase followed by each phrase that is not contained in
        the phrase kept just before it. An empty input gives an empty list.
    """
    if not nps:
        return []
    kept = nps[:1]
    for phrase in nps:
        # Only the last kept phrase is consulted, not the whole list.
        if phrase in kept[-1]:
            continue
        kept.append(phrase)
    return kept

In [None]:
import re
def filter_nps(nps:List[str], bound:int = 5)-> List[str]:
    """Remove short noun phrases that are already covered by a kept phrase.

    The first phrase and every phrase of at least `bound` characters are
    always kept. A shorter phrase is dropped when it is a substring of any
    phrase kept so far; otherwise it is kept with punctuation stripped.

    Args:
        nps: noun phrases in parser order.
        bound: minimum length at which a phrase is kept unconditionally.
            It defaults to 5 so that the one-argument call form of the
            earlier `filter_nps` definition keeps working.

    Returns:
        The filtered list of phrases. It never contains an empty string
        produced by punctuation stripping.
    """
    if not nps:
        return []
    result = [nps[0]]
    for phrase in nps[1:]:
        if len(phrase) >= bound:
            result.append(phrase)
            continue
        if any(phrase in kept for kept in result):
            continue
        cleaned = re.sub(r'[^\w\s]', '', phrase)
        # A phrase made only of punctuation would become an empty search query.
        if cleaned:
            result.append(cleaned)
    return result

In [None]:
## quick manual check of filter_nps on a hand-written list of noun phrases
list1 = ['天衛三軌道在天王星內部的磁層', '天衛三軌道', '天衛', '軌道', '天王星內部', '磁層', '《仲夏夜之夢》作者緹坦妮雅', '《仲夏夜之夢》作者', '《仲夏夜之夢》', '仲夏夜', '夢', '作者', '緹坦妮雅']
print(filter_nps(list1,5))

#### 評估 Document Retrieval

\begin{align*}
\textrm{Precision}&=\frac{\textrm{relevant} \cap \textrm{retrieved}}{\textrm{retrieved}} \\
&=\frac{\textrm{抽取到的文章與正確文章的交集}}{\textrm{抽取到的文章}}
\end{align*}

\begin{align*}
\textrm{Recall}&=\frac{\textrm{relevant} \cap \textrm{retrieved}}{\textrm{relevant}} \\
&=\frac{\textrm{抽取到的文章與正確文章的交集}}{\textrm{正確文章}}
\end{align*}

<img src="https://i.imgur.com/1q3c6Zw.png" alt="example" width="60%">

In [None]:
def calculate_precision(
    data: List[Dict[str, Union[int, Claim, Evidence]]],
    predictions: pd.Series,
) -> None:
    """Print the macro-averaged precision of the retrieved Wikipedia pages.

    Samples labelled "NOT ENOUGH INFO" are skipped. For each remaining
    sample the precision is |predicted ∩ gold| / |predicted|; a sample with
    no predicted page contributes 0 but is still counted.

    Args:
        data: samples; `label` and `evidence` are read. `evidence[2]` of each
            evidence tuple is taken as the gold page title.
        predictions: positionally aligned with `data`; each element must
            support `.intersection` and `len` (e.g. a set of page titles).

    Returns:
        None. The result is printed as "Precision: <value>". When no sample
        is scored, 0.0 is printed instead of raising ZeroDivisionError.
    """
    precision = 0.0
    count = 0

    for i, d in enumerate(data):
        if d["label"] == "NOT ENOUGH INFO":
            continue

        # Ground-truth page titles; evidence[2] is the Wikipedia page title.
        gt_pages = {
            evidence[2]
            for evidence_set in d["evidence"]
            for evidence in evidence_set
        }

        predicted_pages = predictions.iloc[i]
        if len(predicted_pages) != 0:
            hits = predicted_pages.intersection(gt_pages)
            precision += len(hits) / len(predicted_pages)

        count += 1

    # Macro precision; guard against data with no scorable sample.
    macro_precision = precision / count if count else 0.0
    print(f"Precision: {macro_precision}")


def calculate_recall(
    data: List[Dict[str, Union[int, Claim, Evidence]]],
    predictions: pd.Series,
) -> None:
    """Print the macro-averaged recall of the retrieved Wikipedia pages.

    Samples labelled "NOT ENOUGH INFO" are skipped. For each remaining
    sample the recall is |predicted ∩ gold| / |gold|.

    Args:
        data: samples; `label` and `evidence` are read. `evidence[2]` of each
            evidence tuple is taken as the gold page title.
        predictions: positionally aligned with `data`; each element must
            support `.intersection` (e.g. a set of page titles).

    Returns:
        None. The result is printed as "Recall: <value>". When no sample is
        scored, 0.0 is printed instead of raising ZeroDivisionError.
    """
    recall = 0.0
    count = 0

    for i, d in enumerate(data):
        if d["label"] == "NOT ENOUGH INFO":
            continue

        gt_pages = {
            evidence[2]
            for evidence_set in d["evidence"]
            for evidence in evidence_set
        }
        predicted_pages = predictions.iloc[i]
        # NOTE(review): a sample without any gold page contributes 0 here
        # instead of dividing by zero; confirm this is the desired convention.
        if len(gt_pages) != 0:
            hits = predicted_pages.intersection(gt_pages)
            recall += len(hits) / len(gt_pages)
        count += 1

    macro_recall = recall / count if count else 0.0
    print(f"Recall: {macro_recall}")

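Save the retrieved pages in JSONL format.  Note that `num_pred_doc` only determines the output file name (`data/{mode}_doc{num_pred_doc}.jsonl`); the actual cap on the number of retrieved documents is the `par` hyperparameter inside `get_pred_pages` (currently 7), which can be adjusted based on your objective.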

In [None]:
def save_doc(
    data: List[Dict[str, Union[int, Claim, Evidence]]],
    predictions: pd.Series,
    mode: str = "train",
    num_pred_doc: int = 5,
) -> None:
    """Write every sample, with its predicted pages attached, as one JSON line.

    Args:
        data: samples. Each dict is modified in place: a `predicted_pages`
            key holding a list is added.
        predictions: positionally aligned with `data`; each element is
            converted with `list(...)`.
        mode: used in the output file name.
        num_pred_doc: used in the output file name only.

    Returns:
        None. The file `data/{mode}_doc{num_pred_doc}.jsonl` is overwritten.
    """
    out_path = f"data/{mode}_doc{num_pred_doc}.jsonl"
    with open(out_path, "w", encoding="utf8") as f:
        for row_idx, record in enumerate(data):
            record["predicted_pages"] = list(predictions.iloc[row_idx])
            json_line = json.dumps(record, ensure_ascii=False)
            f.write(json_line + "\n")

### Main function for document retrieval

In [None]:
def get_pred_pages(series_data: pd.Series) -> Set[str]:
    """Retrieve candidate Wikipedia page titles for one claim.

    For every noun phrase of the claim, the phrase is sent to
    `wikipedia.search`, the returned titles are normalised with
    `do_st_corrections`, and titles that differ only by a bracketed suffix
    are collapsed to the first one. A title is kept when it (or one of a few
    simple variants of it) occurs literally in the claim.

    Args:
        series_data: one row with the keys `claim` (the claim text) and
            `hanlp_results` (the list of noun phrases of that claim).

    Returns:
        A set of page titles. If more than `par` titles matched, only the
        `par` titles whose matched text occurs earliest in the claim are
        kept. If nothing matched, the search results collected for the first
        noun phrase are returned instead.
    """
    results = []
    tmp_muji = []
    # wiki page title -> index of its matched text inside the claim
    mapping = {}
    claim = series_data["claim"]
    nps = series_data["hanlp_results"]
    first_wiki_term = []

    # NOTE(review): the loop variable `np` is local to this function, but it
    # reuses the conventional numpy alias; renaming it would be less confusing.
    for i, np in enumerate(nps):
        # Simplified Traditional Chinese Correction
        wiki_search_results = [
            do_st_corrections(w) for w in wikipedia.search(np)
        ]

        # Remove the wiki page's description in brackets
        wiki_set = [re.sub(r"\s\(\S+\)", "", w) for w in wiki_search_results]
        wiki_df = pd.DataFrame({
            "wiki_set": wiki_set,
            "wiki_results": wiki_search_results
        })

        # Elements in wiki_set --> index
        # Extracting only the first element is one way to avoid extracting
        # too many of the similar wiki pages
        grouped_df = wiki_df.groupby("wiki_set", sort=False).first()
        candidates = grouped_df["wiki_results"].tolist()
        # muji refers to wiki_set
        muji = grouped_df.index.tolist()

        for prefix, term in zip(muji, candidates):
            # NOTE(review): `tmp_muji` is filled with `new_term` (the text
            # matched in the claim) but checked against `prefix` (the title
            # without its bracketed suffix) — confirm this is intended.
            if prefix not in tmp_muji:
                matched = False

                # Take at least one term from the first noun phrase
                if i == 0:
                    first_wiki_term.append(term)

                # Walrus operator :=
                # https://docs.python.org/3/whatsnew/3.8.html#assignment-expressions
                # Through these filters, we are trying to figure out if the term
                # is within the claim
                # `or` short-circuits, so `new_term` ends up holding the first
                # variant that was found in the claim.
                if (((new_term := term) in claim) or
                    ((new_term := term.replace("·", "")) in claim) or
                    ((new_term := term.split(" ")[0]) in claim) or
                    ((new_term := term.replace("-", " ")) in claim)):
                    matched = True

                elif "·" in term:
                    # Try each part of a "·"-separated title on its own.
                    splitted = term.split("·")
                    for split in splitted:
                        if (new_term := split) in claim:
                            matched = True
                            break

                if matched:
                    # post-processing
                    term = term.replace(" ", "_")
                    term = term.replace("-", "")
                    results.append(term)
                    mapping[term] = claim.find(new_term)
                    tmp_muji.append(new_term)

    # 7 is a hyperparameter
    par = 7
    if len(results) > par:
        # Every stored position comes from text found in the claim.
        assert -1 not in mapping.values()
        # Keep the `par` titles whose matched text appears earliest.
        results = sorted(mapping, key=mapping.get)[:par]
    elif len(results) < 1:
        # NOTE(review): these fallback titles skip the post-processing above
        # (spaces are not replaced by underscores) — confirm downstream use.
        results = first_wiki_term

    return set(results)

#### Groupby pages
- 如果有多個後綴類型的pages，只取第一個配對到的
![groupby](https://imgur.com/FRE4BVv.png)

### Step 1. Get noun phrases from the HanLP constituency parsing tree

Setup [HanLP](https://github.com/hankcs/HanLP) predictor (1 min)

In [None]:
# Two-stage HanLP pipeline: the first component writes its output under
# "tok"; the second reads "tok" and writes the result under "con", which is
# what `get_nps_hanlp` consumes.
predictor = (hanlp.pipeline().append(
    hanlp.load("FINE_ELECTRA_SMALL_ZH"),
    output_key="tok",
).append(
    hanlp.load("CTB9_CON_ELECTRA_SMALL"),
    output_key="con",
    input_key="tok",
))

We will skip this step, which creates the parsing trees, during the in-class demo.

##### 刪


In [None]:
# Cache of the noun phrases extracted from the training claims.
hanlp_file = f"data/hanlp_con_results.pkl"
if Path(hanlp_file).exists():
    # Reuse the cached result instead of running the parser again.
    with open(hanlp_file, "rb") as f:
        hanlp_results = pickle.load(f)
else:
    hanlp_results = [get_nps_hanlp(predictor, d) for d in TRAIN_DATA]
    # hanlp_results = [filter_nps(get_nps_hanlp(predictor, d),5) for d in TRAIN_DATA]
    # Persist the result so later runs take the branch above.
    with open(hanlp_file, "wb") as f:
        pickle.dump(hanlp_results, f)

Get pages via wiki online api

In [None]:
# Cache of the predicted pages for the training set; this is the same path
# `save_doc(..., mode="train")` writes with its default `num_pred_doc`.
doc_path = f"data/train_doc5.jsonl"
if Path(doc_path).exists():
    # One JSON object per line; only `predicted_pages` is needed here.
    with open(doc_path, "r", encoding="utf8") as f:
        predicted_results = pd.Series([
            set(json.loads(line)["predicted_pages"])
            for line in f
        ])
else:
    train_df = pd.DataFrame(TRAIN_DATA)
    train_df.loc[:, "hanlp_results"] = hanlp_results
    # Row-wise retrieval; each call queries the online Wikipedia search.
    predicted_results = train_df.apply(get_pred_pages, axis=1)
    save_doc(TRAIN_DATA, predicted_results, mode="train")

In [None]:
## debug: compare raw and filtered noun phrases for the first 10 training claims
for d in TRAIN_DATA[:10]:
  print(get_nps_hanlp(predictor, d))
  print(filter_nps(get_nps_hanlp(predictor, d),5))
  print()

### Step 2. Calculate our results

##### 結果！！

<table>
    <thead>
        <tr>
<th> No. </th><th> Version </th><th> Precision </th><th> Recall </th><th> 說明 </th>
        </tr>
    </thead>
    <tbody>
        <tr>
<td>1.</td><td>原始版本</td><td>0.25093</td><td>0.80733</td><td></td>
        </tr><tr>
<td>2.</td><td>filter_nps</td><td>0.41740</td><td>0.76800</td><td>濾掉所有短的重複字串</td>
        </tr><tr>
<td>3.</td><td>filter_nps</td><td>0.37931</td><td>0.80656</td><td>保留長度五個字以上的字串</td>
        </tr>
    </tbody>
</table>

In [None]:
calculate_precision(TRAIN_DATA, predicted_results)
calculate_recall(TRAIN_DATA, predicted_results)

### Step 3. Repeat the same process on test set
Create parsing tree

In [None]:
# Cache of the noun phrases extracted from the TEST claims.
# The cache must be read from and written to `hanlp_test_file`. Using the
# training cache path here would load training noun phrases for the test set
# and would overwrite the training cache with test results.
hanlp_test_file = "data/hanlp_con_test_results.pkl"
if Path(hanlp_test_file).exists():
    with open(hanlp_test_file, "rb") as f:
        hanlp_results = pickle.load(f)
else:
    hanlp_results = [get_nps_hanlp(predictor, d) for d in TEST_DATA]
    # hanlp_results = [filter_nps(get_nps_hanlp(predictor, d)) for d in TEST_DATA]
    with open(hanlp_test_file, "wb") as f:
        pickle.dump(hanlp_results, f)

Get pages via wiki online api

In [None]:
# Cache of the predicted pages for the test set; this is the same path
# `save_doc(..., mode="test")` writes with its default `num_pred_doc`.
test_doc_path = f"data/test_doc5.jsonl"
if Path(test_doc_path).exists():
    with open(test_doc_path, "r", encoding="utf8") as f:
        test_results = pd.Series(
            [set(json.loads(line)["predicted_pages"]) for line in f])
else:
    test_df = pd.DataFrame(TEST_DATA)
    test_df.loc[:, "hanlp_results"] = hanlp_results
    # Unlike the training cell, this uses pandarallel's `parallel_apply`.
    test_results = test_df.parallel_apply(get_pred_pages, axis=1)
    save_doc(TEST_DATA, test_results, mode="test")

notebook2
## PART 2. Sentence retrieval
難度：★★★☆☆ (最好 ／‵Д′)／~ ╧╧ )


### 選擇Training Data (依照Part1 生成的1,2,3)

In [None]:
# local libs
from utils import (
    generate_evidence_to_wiki_pages_mapping,
    jsonl_dir_to_df,
    load_json,
    load_model,
    save_checkpoint,
    set_lr_scheduler,
)

In [None]:
Use_Data_From_Part_1 = "3"  #@param {type:"string"}

In [None]:
### Training data from Part 1

### always in the form: data/train_doc5_data{x}.jsonl

File_From_Part_1 = f"data/train_doc5_data{Use_Data_From_Part_1}.jsonl"  

print("Use Training Data ",File_From_Part_1)

DOC_DATA = load_json(File_From_Part_1)   


## 缺檔案時報錯



### Sentence retrieval 前置作業

內含:

* 調 Train/Dev 比例

* 轉換Wiki的code (3min)




使用方式 : 

- 蓋起來直接執行就好

In [None]:
# built-in libs
from pathlib import Path
from typing import Dict, List, Set, Tuple, Union

# third-party libs
import numpy as np
import pandas as pd
from pandarallel import pandarallel
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_scheduler,
)

from dataset import BERTDataset, Dataset



pandarallel.initialize(progress_bar=True, verbose=0, nb_workers=10)

Global variable

In [None]:
SEED = 42

TRAIN_DATA = load_json("data/public_train.jsonl")
TEST_DATA = load_json("data/public_test.jsonl")

# Label <-> integer id mappings.
LABEL2ID: Dict[str, int] = {
    "supports": 0,
    "refutes": 1,
    "NOT ENOUGH INFO": 2,
}
ID2LABEL: Dict[int, str] = {v: k for k, v in LABEL2ID.items()}

# The stratification labels must come from the very list that is being split:
# `train_test_split` requires `stratify` to have one label per element of
# DOC_DATA, in the same order. Deriving them from TRAIN_DATA only works while
# both files happen to have identical length and ordering.
_y = [LABEL2ID[data["label"]] for data in DOC_DATA]
# GT means Ground Truth
TRAIN_GT, DEV_GT = train_test_split(
    DOC_DATA,
    test_size=0.2,
    random_state=SEED,
    shuffle=True,
    stratify=_y,
)


Preload wiki database (1 min)

In [None]:
# Load the wiki dump and build the lookup used below as
# mapping[page_title][sentence_index_as_str] -> sentence text.
wiki_pages = jsonl_dir_to_df("data/wiki-pages")
mapping = generate_evidence_to_wiki_pages_mapping(wiki_pages)
# The DataFrame is no longer needed once the mapping exists.
del wiki_pages

### Helper function

內含:

* 調整input樣子的code (是否要擴增資料集)

使用方式:

* 蓋起來直接執行

In [None]:
import json

In [None]:
def evidence_macro_precision(
    instance: Dict,
    top_rows: pd.DataFrame,
) -> Tuple[float, float]:
    """Precision of the retrieved sentences for a single claim.

    Adapted from fever-scorer:
    https://github.com/sheffieldnlp/fever-scorer/blob/master/src/fever/scorer.py

    Args:
        instance (dict): one ground-truth row; must contain `label`, `claim`
            and `evidence`.
        top_rows (pd.DataFrame): top-ranked predictions; must contain the
            columns `claim` and `predicted_evidence`.

    Returns:
        Tuple[float, float]:
        [1]: this claim's precision (1.0 when nothing was retrieved for it)
        [2]: 1.0 if the claim is scored, 0.0 for "NOT ENOUGH INFO" claims
    """
    # NEI claims carry no evidence, so they are excluded from the average.
    if instance["label"].upper() == "NOT ENOUGH INFO":
        return 0.0, 0.0

    # e[2] is the page title, e[3] is the sentence index
    gold_pairs = [
        [e[2], e[3]]
        for group in instance["evidence"]
        for e in group
        if e[3] is not None
    ]
    same_claim = top_rows["claim"] == instance["claim"]
    retrieved = top_rows[same_claim]["predicted_evidence"].tolist()

    n_retrieved = float(len(retrieved))
    n_correct = float(sum(1 for pair in retrieved if pair in gold_pairs))

    if n_retrieved > 0:
        return n_correct / n_retrieved, 1.0
    return 1.0, 1.0

Calculate recall for sentence retrieval

In [None]:
def evidence_macro_recall(
    instance: Dict,
    top_rows: pd.DataFrame,
) -> Tuple[float, float]:
    """Calculate recall for sentence retrieval
    This function is modified from fever-scorer.
    https://github.com/sheffieldnlp/fever-scorer/blob/master/src/fever/scorer.py

    Args:
        instance (dict): a row of the dev set (dev.jsonl) or test set (test.jsonl)
        top_rows (pd.DataFrame): our predictions with the top probabilities

        IMPORTANT!!!
        instance (dict) should have the key of `evidence`.
        top_rows (pd.DataFrame) should have a column `predicted_evidence`.

    Returns:
        Tuple[float, float]:
        [1]: 1.0 if at least one complete evidence group was retrieved (or
             there is no evidence to retrieve), else 0.0
        [2]: 1.0 if the claim is scored, 0.0 for "NOT ENOUGH INFO" claims
    """
    # Only claims that are NOT "NOT ENOUGH INFO" are scored.
    if instance["label"].upper() == "NOT ENOUGH INFO":
        return 0.0, 0.0

    evidence_groups = instance["evidence"]

    # If there's no evidence to predict, return 1.
    # The emptiness check must look at the evidence groups themselves, not
    # at the keys of the `instance` dict.
    if len(evidence_groups) == 0 or all(
            len(eg) == 0 for eg in evidence_groups):
        return 1.0, 1.0

    claim = instance["claim"]
    predicted_evidence = top_rows[top_rows["claim"] ==
                                  claim]["predicted_evidence"].tolist()

    for evidence_group in evidence_groups:
        evidence = [[e[2], e[3]] for e in evidence_group]
        # We only want to score complete groups of evidence. Incomplete
        # groups are worthless.
        if all(item in predicted_evidence for item in evidence):
            return 1.0, 1.0
    return 0.0, 1.0

Calculate the scores of sentence retrieval

In [None]:
def evaluate_retrieval(
    probs: np.ndarray,
    df_evidences: pd.DataFrame,
    ground_truths: pd.DataFrame,
    top_n: int = 5,
    cal_scores: bool = True,
    save_name: str = None,
    shut_up = False
) -> Union[Dict[str, float], None]:
    """Calculate the scores of sentence retrieval

    Args:
        probs (np.ndarray): probabilities of the candidate retrieved sentences,
            aligned row by row with `df_evidences`
        df_evidences (pd.DataFrame): the candidate evidence sentences paired
            with claims; needs the columns `claim`, `text` and
            `predicted_evidence`. A `prob` column is added to it in place.
        ground_truths: iterated element by element, each element being used
            as a dict with `claim`, `label` and `evidence`.
            NOTE(review): the annotation says DataFrame, but iterating a
            DataFrame yields column names — this looks like it expects the
            list of dicts loaded from dev.jsonl / test.jsonl; confirm.
        top_n (int, optional): the number of the retrieved sentences kept per
            claim. Defaults to 5.
        cal_scores (bool, optional): whether to compute the scores.
        save_name (str, optional): if given, every ground-truth instance is
            written with its `predicted_evidence` to `data/{save_name}`.
        shut_up (bool, optional): if False, up to 30 top rows are printed.

    Returns:
        Dict[str, float]: F1 score, precision, and recall when `cal_scores`
        is True, otherwise None.
    """
    df_evidences["prob"] = probs

    # Select the `top_n` most probable rows per claim by row position. This
    # keeps every column (including `claim`) regardless of how the installed
    # pandas version treats grouping columns inside `groupby(...).apply`.
    candidates = df_evidences.reset_index(drop=True)
    top_positions = (
        candidates.groupby("claim")["prob"]
        .nlargest(top_n)
        .index.get_level_values(-1)
    )
    top_rows = candidates.loc[top_positions].reset_index(drop=True)

    if not shut_up:
        print("top_rows=")
        # Never index past the end when fewer than 30 rows were retrieved.
        for i in range(min(30, len(top_rows))):
            print(top_rows["claim"][i], top_rows["text"][i])

    scores = None
    if cal_scores:
        macro_precision = 0
        macro_precision_hits = 0
        macro_recall = 0
        macro_recall_hits = 0

        for instance in ground_truths:
            macro_prec = evidence_macro_precision(instance, top_rows)
            macro_precision += macro_prec[0]
            macro_precision_hits += macro_prec[1]

            macro_rec = evidence_macro_recall(instance, top_rows)
            macro_recall += macro_rec[0]
            macro_recall_hits += macro_rec[1]

        pr = (macro_precision /
              macro_precision_hits) if macro_precision_hits > 0 else 1.0
        rec = (macro_recall /
               macro_recall_hits) if macro_recall_hits > 0 else 0.0
        # F1 is defined as 0 when both precision and recall are 0.
        f1 = 2.0 * pr * rec / (pr + rec) if (pr + rec) > 0 else 0.0
        scores = {"F1 score": f1, "Precision": pr, "Recall": rec}

    if save_name is not None:
        # write doc7_sent5 file
        # The text is written un-escaped (ensure_ascii=False), so the
        # encoding is fixed explicitly instead of depending on the platform.
        with open(f"data/{save_name}", "w", encoding="utf8") as f:
            for instance in ground_truths:
                claim = instance["claim"]
                predicted_evidence = top_rows[
                    top_rows["claim"] == claim]["predicted_evidence"].tolist()
                instance["predicted_evidence"] = predicted_evidence
                f.write(json.dumps(instance, ensure_ascii=False) + "\n")

    return scores

Inference script to get probabilities for the candidate evidence sentences

In [None]:
def get_predicted_probs(
    model: nn.Module,
    dataloader: Dataset,
    device: torch.device,
) -> np.ndarray:
    """Run inference and collect the class-1 probability of every sample.

    Args:
        model: a model whose output exposes `.logits`
            (e.g. one from HuggingFace Transformers)
        dataloader: iterable of batches, each a dict of tensors
        device: device every batch tensor is moved to before the forward pass

    Returns:
        np.ndarray: softmax probability of class index 1 for each sample, in
        the order the dataloader yields them
    """
    model.eval()
    positive_probs = []

    with torch.no_grad():
        for raw_batch in tqdm(dataloader):
            on_device = {
                name: tensor.to(device)
                for name, tensor in raw_batch.items()
            }
            logits = model(**on_device).logits
            class_probs = torch.softmax(logits, dim=1)
            positive_probs += class_probs[:, 1].tolist()

    return np.array(positive_probs)

SentRetrievalBERTDataset class for the paired sentences

Please refer to [PyTorch Dataset tutorial](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html)

In [None]:
class SentRetrievalBERTDataset(BERTDataset):
    """BERTDataset variant that encodes (claim, candidate sentence) pairs."""

    def __getitem__(
        self,
        idx: int,
        **kwargs,
    ) -> Tuple[Dict[str, torch.Tensor], int]:
        """Tokenise row `idx` as a sentence pair.

        The claim and the text are encoded together, padded / truncated to
        `self.max_length`, and every tokenizer output is turned into a
        tensor. If the row has a `label`, it is added under the key `labels`.

        NOTE(review): the annotation says Tuple, but a single dict of tensors
        is what is returned.
        """
        row = self.data.iloc[idx]
        claim_text = row["claim"]
        candidate_text = row["text"]

        # claim [SEP] text
        encoded = self.tokenizer(
            claim_text,
            candidate_text,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
        )

        tensors = {}
        for key, value in encoded.items():
            tensors[key] = torch.tensor(value)

        if "label" in row:
            tensors["labels"] = torch.tensor(row["label"])

        return tensors

In [None]:
import random

def pair_with_wiki_sentences(
    mapping: Dict[str, Dict[int, str]],
    df: pd.DataFrame,
    negative_ratio: float,
) -> pd.DataFrame:
    """Build augmented (claim, sentence, label) training pairs.

    Only for creating train sentences. Rows labelled "NOT ENOUGH INFO" are
    skipped.

    Positives (label 1), per gold evidence sentence that is non-empty:
      * the sentence itself;
      * every non-empty piece obtained by splitting it on ",";
    and, per evidence set, randomly composed concatenations of its sentences
    (each distinct concatenation is added once).

    Negatives (label 0): non-empty sentences of the predicted pages that are
    not gold evidence, each kept with probability `negative_ratio`.

    Args:
        mapping: page title -> {sentence index (as str) -> sentence text}.
        df: rows with `label`, `claim`, `evidence` and `predicted_pages`.
        negative_ratio: sampling probability of each negative candidate.

    Returns:
        DataFrame with the columns `claim`, `text` and `label`.
    """
    claims = []
    sentences = []
    labels = []

    # positive
    for i in range(len(df)):
        if df["label"].iloc[i] == "NOT ENOUGH INFO":
            continue

        claim = df["claim"].iloc[i]
        evidence_sets = df["evidence"].iloc[i]
        for evidence_set in evidence_sets:
            sents = []
            for evidence in evidence_set:
                # evidence[2] is the page title
                page = evidence[2].replace(" ", "_")
                # the only page with weird name
                if page == "臺灣海峽危機#第二次臺灣海峽危機（1958）":
                    continue
                # evidence[3] is in form of int however, mapping requires str
                sent_idx = str(evidence[3])
                sentence = mapping[page][sent_idx]
                if sentence == "":
                    continue

                claims.append(claim)
                sentences.append(sentence)
                labels.append(1)
                sents.append(sentence)

                # Split on "," to create more samples. The loop variable must
                # not reuse the row index `i`, and empty pieces (e.g. after a
                # trailing comma) must not become positive samples.
                # NOTE(review): this is the ASCII comma; confirm whether the
                # full-width "，" was intended for Chinese text.
                for fragment in sentence.split(","):
                    if fragment == "":
                        continue
                    claims.append(claim)
                    sentences.append(fragment)
                    labels.append(1)

            # Randomly compose sentences of this evidence set to create more
            # samples. The chance of stopping grows every round.
            escape_rate = 0.2
            seen_compositions = set()

            while True:
                if random.random() < escape_rate:
                    break

                composed_sentence = [
                    sub_sentence for sub_sentence in sents
                    if random.random() >= 0.5
                ]

                if len(composed_sentence) != 0:
                    try_sentence = "".join(composed_sentence)
                    if try_sentence not in seen_compositions:
                        seen_compositions.add(try_sentence)
                        claims.append(claim)
                        sentences.append(try_sentence)
                        labels.append(1)

                escape_rate = escape_rate + random.random() * 0.08

    # negative
    for i in range(len(df)):
        if df["label"].iloc[i] == "NOT ENOUGH INFO":
            continue
        claim = df["claim"].iloc[i]

        # Gold (page, sentence index) pairs, normalised exactly like the
        # candidate pairs below: underscores in the title and the sentence
        # index as str. Without this normalisation the membership test never
        # matches and gold evidence is sampled as a negative.
        evidence_set = {
            (evidence[2].replace(" ", "_"), str(evidence[3]))
            for evidences in df["evidence"].iloc[i]
            for evidence in evidences
        }
        predicted_pages = df["predicted_pages"].iloc[i]
        for page in predicted_pages:
            page = page.replace(" ", "_")
            try:
                page_sent_id_pairs = [
                    (page, sent_idx) for sent_idx in mapping[page].keys()
                ]
            except KeyError:
                # The page is not in the wiki database.
                continue

            for pair in page_sent_id_pairs:
                if pair in evidence_set:
                    continue
                text = mapping[page][pair[1]]
                # `negative_ratio` controls not adding too many negative samples
                if text != "" and np.random.rand(1) <= negative_ratio:
                    claims.append(claim)
                    sentences.append(text)
                    labels.append(0)

    return pd.DataFrame({"claim": claims, "text": sentences, "label": labels})


def pair_with_wiki_sentences_eval(
    mapping: Dict[str, Dict[int, str]],
    df: pd.DataFrame,
    is_testset: bool = False,
) -> pd.DataFrame:
    """Pair every claim with all sentences of its predicted pages.

    Only for creating dev and test sentences.

    Args:
        mapping: page title -> {sentence index (as str) -> sentence text}.
        df: rows with `claim` and `predicted_pages` (plus `evidence` unless
            `is_testset` is True).
        is_testset: when True the `evidence` column of the result is None.

    Returns:
        DataFrame with the columns `claim`, `text`, `evidence` and
        `predicted_evidence` (a `[page title, sentence index as int]` pair).
        Pages missing from `mapping` and empty sentences are skipped.
    """
    claims = []
    sentences = []
    evidence = []
    predicted_evidence = []

    for i in range(len(df)):
        # Rows are addressed by position throughout, so the function also
        # works when `df` does not carry a default 0..n-1 index.
        claim = df["claim"].iloc[i]
        predicted_pages = df["predicted_pages"].iloc[i]

        for page in predicted_pages:
            page = page.replace(" ", "_")
            try:
                page_sentences = mapping[page]
            except KeyError:
                # The page is not in the wiki database.
                continue

            for sentence_id, text in page_sentences.items():
                if text == "":
                    continue
                claims.append(claim)
                sentences.append(text)
                if not is_testset:
                    evidence.append(df["evidence"].iloc[i])
                predicted_evidence.append([page, int(sentence_id)])

    return pd.DataFrame({
        "claim": claims,
        "text": sentences,
        "evidence": evidence if not is_testset else None,
        "predicted_evidence": predicted_evidence,
    })

In [None]:
def pair_with_wiki_sentences_by_jerry(
    mapping: Dict[str, Dict[int, str]],
    df: pd.DataFrame,
    negative_ratio: float,
) -> pd.DataFrame:
    """Build (claim, sentence, label) training pairs without augmentation.

    Only for creating train sentences. Rows labelled "NOT ENOUGH INFO" are
    skipped.

    Positives (label 1): one sample per evidence set, made of its sentences
    joined with a space.
    Negatives (label 0): non-empty sentences of the predicted pages that are
    not gold evidence, each kept with probability `negative_ratio`.

    Args:
        mapping: page title -> {sentence index (as str) -> sentence text}.
        df: rows with `label`, `claim`, `evidence` and `predicted_pages`.
        negative_ratio: sampling probability of each negative candidate.

    Returns:
        DataFrame with the columns `claim`, `text` and `label`.
    """
    claims = []
    sentences = []
    labels = []

    # positive
    for i in range(len(df)):
        if df["label"].iloc[i] == "NOT ENOUGH INFO":
            continue

        claim = df["claim"].iloc[i]
        evidence_sets = df["evidence"].iloc[i]
        for evidence_set in evidence_sets:
            sents = []
            for evidence in evidence_set:
                # evidence[2] is the page title
                page = evidence[2].replace(" ", "_")
                # the only page with weird name
                if page == "臺灣海峽危機#第二次臺灣海峽危機（1958）":
                    continue
                # evidence[3] is in form of int however, mapping requires str
                sent_idx = str(evidence[3])
                sents.append(mapping[page][sent_idx])

            whole_evidence = " ".join(sents)

            claims.append(claim)
            sentences.append(whole_evidence)
            labels.append(1)

    # negative
    for i in range(len(df)):
        if df["label"].iloc[i] == "NOT ENOUGH INFO":
            continue
        claim = df["claim"].iloc[i]

        # Gold (page, sentence index) pairs, normalised exactly like the
        # candidate pairs below: underscores in the title and the sentence
        # index as str. Without this normalisation the membership test never
        # matches and gold evidence is sampled as a negative.
        evidence_set = {
            (evidence[2].replace(" ", "_"), str(evidence[3]))
            for evidences in df["evidence"].iloc[i]
            for evidence in evidences
        }
        predicted_pages = df["predicted_pages"].iloc[i]
        for page in predicted_pages:
            page = page.replace(" ", "_")
            try:
                page_sent_id_pairs = [
                    (page, sent_idx) for sent_idx in mapping[page].keys()
                ]
            except KeyError:
                # The page is not in the wiki database.
                continue

            for pair in page_sent_id_pairs:
                if pair in evidence_set:
                    continue
                text = mapping[page][pair[1]]
                # `negative_ratio` controls not adding too many negative samples
                if text != "" and np.random.rand(1) <= negative_ratio:
                    claims.append(claim)
                    sentences.append(text)
                    labels.append(0)

    return pd.DataFrame({"claim": claims, "text": sentences, "label": labels})


def pair_with_wiki_sentences_eval(
    mapping: Dict[str, Dict[str, str]],
    df: pd.DataFrame,
    is_testset: bool = False,
) -> pd.DataFrame:
    """Pair each claim with every non-empty sentence of its predicted pages.

    Only for creating dev and test sentences: no sampling is applied, every
    candidate sentence of every predicted page is kept.

    Args:
        mapping: Page title -> {sentence id -> sentence text}. Sentence ids
            are looked up as given and converted with `int()` for the output.
        df: One row per claim. Must provide the "claim" and "predicted_pages"
            columns, plus "evidence" unless `is_testset` is True.
        is_testset: When True the "evidence" column is not read and the
            returned "evidence" column is filled with None.

    Returns:
        A DataFrame with one row per (claim, candidate sentence) and the
        columns "claim", "text", "evidence" and "predicted_evidence", where
        "predicted_evidence" is `[page_title, sentence_id]`.
    """
    claims = []
    sentences = []
    evidence = []
    predicted_evidence = []

    for i in range(len(df)):
        claim = df["claim"].iloc[i]

        # Positional access like every other column here; label-based
        # `df["predicted_pages"][i]` breaks on a non-default index.
        predicted_pages = df["predicted_pages"].iloc[i]
        for page in predicted_pages:
            # Titles are looked up with underscores instead of spaces.
            page = page.replace(" ", "_")
            try:
                page_sentences = mapping[page]
            except KeyError:
                # The predicted page is not in the wiki mapping: skip it.
                continue

            for sentence_id, text in page_sentences.items():
                if text == "":
                    continue
                claims.append(claim)
                sentences.append(text)
                if not is_testset:
                    evidence.append(df["evidence"].iloc[i])
                predicted_evidence.append([page, int(sentence_id)])

    return pd.DataFrame({
        "claim": claims,
        "text": sentences,
        "evidence": evidence if not is_testset else None,
        "predicted_evidence": predicted_evidence,
    })

### 1-1. Train By Bert : 超參數


Hyperparams

In [None]:
MODEL_NAME = "bert-base-chinese"  #@param {type:"string"}
NUM_EPOCHS = 20  #@param {type:"integer"}
LR = 1e-7  #@param {type:"number"}
TRAIN_BATCH_SIZE = 64  #@param {type:"integer"}
TEST_BATCH_SIZE = 64  #@param {type:"integer"}
NEGATIVE_RATIO = 0.058  #@param {type:"number"}
VALIDATION_STEP = 300  #@param {type:"integer"}
TOP_N = 5  #@param {type:"integer"}

### 填入yes(要擴)/no
EXTEND_OR_NOT = "yes" #@param {type:"string"} 


In [None]:
# Choose how the training pairs are built:
#   "yes"      -> pair_with_wiki_sentences (with data extension, takes longer)
#   otherwise  -> pair_with_wiki_sentences_by_jerry (no data extension)
if EXTEND_OR_NOT == "yes":
    build_train_pairs = pair_with_wiki_sentences
else:
    build_train_pairs = pair_with_wiki_sentences_by_jerry

train_df = build_train_pairs(
    mapping,
    pd.DataFrame(TRAIN_GT),
    NEGATIVE_RATIO,
)
counts = train_df["label"].value_counts()
print("Now using the following train data with 0 (Negative) and 1 (Positive)")
print(counts)

# The dev candidates are built the same way for both settings.
dev_evidences = pair_with_wiki_sentences_eval(mapping, pd.DataFrame(DEV_GT))

### 請確定0,1比例正常 (by 調整Negative Rate)

### 1-2.Train By Bert : 前置

直接執行

In [None]:
# Every hyperparameter that identifies this run is baked into the directory
# name, so logs and checkpoints of different runs never collide.
EXP_DIR = f"sent_retrieval/e{NUM_EPOCHS}_bs{TRAIN_BATCH_SIZE}_" + f"{LR}_neg{NEGATIVE_RATIO}_top{TOP_N}_data{Use_Data_From_Part_1}_{EXTEND_OR_NOT}"
LOG_DIR = "logs/" + EXP_DIR
CKPT_DIR = "checkpoints/" + EXP_DIR

HANG_MODEL_NAME= f"e{NUM_EPOCHS}_bs{TRAIN_BATCH_SIZE}_" + f"{LR}_neg{NEGATIVE_RATIO}_top{TOP_N}"

# exist_ok=True makes the cell safe to re-run without a separate
# exists() check (which leaves a gap between the check and the creation).
Path(LOG_DIR).mkdir(parents=True, exist_ok=True)
Path(CKPT_DIR).mkdir(parents=True, exist_ok=True)

print("儲存路徑 : ",EXP_DIR)

Dataloader things. Please refer to [PyTorch Dataset tutorial](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html)


In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_dataset = SentRetrievalBERTDataset(train_df, tokenizer=tokenizer)
val_dataset = SentRetrievalBERTDataset(dev_evidences, tokenizer=tokenizer)

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=TRAIN_BATCH_SIZE,
)
eval_dataloader = DataLoader(val_dataset, batch_size=TEST_BATCH_SIZE)

Save your memory.

In [None]:
del train_df

Trainer

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")
print("Use Device : ",device)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.to(device)

optimizer = AdamW(model.parameters(), lr=LR)
num_training_steps = NUM_EPOCHS * len(train_dataloader)
lr_scheduler = set_lr_scheduler(optimizer, num_training_steps)

writer = SummaryWriter(LOG_DIR)

For training, please refer to [PyTorch Optimization tutorial](https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html)

Please make sure that you are using gpu when training (5 min)

### 1-3.Train By Bert : Training

每隔Validate會在/NCKU/checkpoints/Sent裡面放入模型

Q:要規定只能選最好嗎?

In [None]:
progress_bar = tqdm(range(num_training_steps))
current_steps = 0

# Dev recall a checkpoint has to beat before it is written to CKPT_DIR;
# it is raised to the new best recall each time a checkpoint is saved.
best_v_acc = 0.7

print("Epoch : ",NUM_EPOCHS)
print("There are ",len(train_dataloader)," batchs in one epoch")
print(" ")

for epoch in range(NUM_EPOCHS):
    model.train()

    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        writer.add_scalar("training_loss", loss.item(), current_steps)

        current_steps += 1

        # Validate every VALIDATION_STEP optimizer steps.
        if current_steps % VALIDATION_STEP == 0:
            print("Start validation")
            probs = get_predicted_probs(model, eval_dataloader, device)

            val_results = evaluate_retrieval(
                probs=probs,
                df_evidences=dev_evidences,
                ground_truths=DEV_GT,
                top_n=TOP_N,
                shut_up=True  # shut_up=True keeps the evaluation output short
            )
            print(val_results)

            # log each metric separately to TensorBoard
            for metric_name, metric_value in val_results.items():
                writer.add_scalar(
                    f"dev_{metric_name}",
                    metric_value,
                    current_steps,
                )

            # Keep only checkpoints that improve on the best recall so far.
            if val_results["Recall"] > best_v_acc:
                best_v_acc = val_results["Recall"]
                save_checkpoint(model, CKPT_DIR, current_steps)
                print("[New Record] at ",current_steps)

            # NOTE(review): get_predicted_probs is defined outside this cell;
            # if it puts the model in eval mode, the remaining batches of the
            # epoch would otherwise train with dropout disabled. Switching
            # back to training mode is harmless either way.
            model.train()

print("Finished training!")

### 1-4.Train By Bert : 統整Acc與幫資料存檔

除非要生成給Part3 的Train與Validate

不然不要執行


In [None]:
print("CKPT_DIR=",CKPT_DIR)

In [None]:
### 可以改動選Model的資料夾 ### Like checkpoints/sent_retrieval/資料夾名

# CKPT_DIR = "checkpoints/sent_retrieval/e20_bs64_1e-05_neg0.057_top5_data3_yes"  #@param {type:"string"}


In [None]:
### 決定給Part3的檔案名稱

MODEL_PARA=CKPT_DIR.split("/")
MODEL_PARA=MODEL_PARA[-1]
print("Model 參數: ",MODEL_PARA)


### 如果是中途插入，要填入參數

# TOP_N = 5  #@param {type:"integer"}
# Use_Data_From_Part_1 = "3"  #@param {type:"string"}

###


print("儲存檔名(Train) : ",f"train_doc5sent{TOP_N}_data{Use_Data_From_Part_1}_{MODEL_PARA}.jsonl")
print("儲存檔名(Valid) : ",f"dev_doc5sent{TOP_N}_data{Use_Data_From_Part_1}_{MODEL_PARA}.jsonl")

In [None]:
## 如果是中途插入Model (不Train) ，要先跑建立Model的Code


MODEL_NAME = "bert-base-chinese"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("使用裝置 : ",device)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.to(device)


In [90]:
ls checkpoints/sent_retrieval/e20_bs64_1e-05_neg0.057_top5_data3_yes

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
model.1000.pt  model.1600.pt  model.250.pt  model.600.pt
model.1200.pt  model.1800.pt  model.500.pt


In [92]:
#選Validate Model


ckpt_name = "model.1800.pt"  #@param {type:"string"}
model = load_model(model, ckpt_name, CKPT_DIR)
print("Start final evaluations and write prediction files.")



Start final evaluations and write prediction files.


In [93]:

import json

train_evidences = pair_with_wiki_sentences_eval(
    mapping=mapping,
    df=pd.DataFrame(TRAIN_GT),
)
train_set = SentRetrievalBERTDataset(train_evidences, tokenizer)
train_dataloader = DataLoader(train_set, batch_size=TEST_BATCH_SIZE)

print("Start calculating training scores")
probs = get_predicted_probs(model, train_dataloader, device)
train_results = evaluate_retrieval(
    probs=probs,
    df_evidences=train_evidences,
    ground_truths=TRAIN_GT,
    top_n=TOP_N,
    save_name=f"train_doc5sent{TOP_N}_data{Use_Data_From_Part_1}_{MODEL_PARA}.jsonl",
    shut_up=True
)
print(f"Training scores => {train_results}")

dev_evidences = pair_with_wiki_sentences_eval(mapping, pd.DataFrame(DEV_GT))

val_dataset = SentRetrievalBERTDataset(dev_evidences, tokenizer=tokenizer)

eval_dataloader = DataLoader(val_dataset, batch_size=TEST_BATCH_SIZE)


print("Start validation")
probs = get_predicted_probs(model, eval_dataloader, device)
val_results = evaluate_retrieval(
    probs=probs,
    df_evidences=dev_evidences,
    ground_truths=DEV_GT,
    top_n=TOP_N,
    save_name=f"dev_doc5sent{TOP_N}_data{Use_Data_From_Part_1}_{MODEL_PARA}.jsonl",
    shut_up=True
)

print(f"Validation scores => {val_results}")

Start calculating training scores


  0%|          | 0/14 [00:00<?, ?it/s]

Training scores => {'F1 score': 0.0018042384929349957, 'Precision': 0.9990970654627539, 'Recall': 0.0009029345372460496}
Start validation


  0%|          | 0/13 [00:00<?, ?it/s]

Validation scores => {'F1 score': 0.005998759732188201, 'Precision': 0.9977537103890894, 'Recall': 0.0030084235860409147}


### 1-5.Train By Bert : Testing and Make File

In [None]:
import json

In [None]:
ls data

In [None]:
### 選Part 1 執行結果的檔案
### 改右邊就好
### 不要/data

Data_From_Part_1_Test = "2"  #@param {type:"string"}

file_name= f"test_doc5_data{Data_From_Part_1_Test}.jsonl"

print("Use file ",file_name)

test_data = load_json("data/"+file_name)

In [None]:
# 如果Model路徑不甚滿意可以調

DIR_FOR_MODEL="checkpoints/sent_retrieval/e20_bs64_2e-05_neg0.03_top5_data2" #@param {type:"string"}
CKPT_DIR = DIR_FOR_MODEL

# 並配合 LS 服用

!ls "checkpoints/sent_retrieval/e20_bs64_2e-05_neg0.12_top5/"

### 決定給Part3的檔案名稱

MODEL_PARA=CKPT_DIR.split("/")
MODEL_PARA=MODEL_PARA[-1]
print("Model 參數: ",MODEL_PARA)


### 如果是中途插入，要填入參數

# TOP_N = 5  #@param {type:"integer"}
# Use_Data_From_Part_1 = "3"  #@param {type:"string"}

###


print("儲存檔名(Test) : ",f"test_doc5sent{TOP_N}_data{Data_From_Part_1_Test}_{MODEL_PARA}.jsonl")

In [None]:
### 選擇Model

ckpt_name = "model.10.pt"  #@param {type:"string"}
model = load_model(model, ckpt_name, CKPT_DIR)


In [None]:
import json


test_evidences = pair_with_wiki_sentences_eval(
    mapping,
    pd.DataFrame(test_data),
    is_testset=True,
)
test_set = SentRetrievalBERTDataset(test_evidences, tokenizer)
test_dataloader = DataLoader(test_set, batch_size=TEST_BATCH_SIZE)

print("Start predicting the test data")
probs = get_predicted_probs(model, test_dataloader, device)
evaluate_retrieval(
    probs=probs,
    df_evidences=test_evidences,
    ground_truths=test_data,
    top_n=TOP_N,
    cal_scores=False,
    save_name=f"test_doc5sent{TOP_N}_data{Data_From_Part_1_Test}_{MODEL_PARA}.jsonl",
    shut_up=True
)

In [None]:
print("Saved name : ",f"test_doc5sent{TOP_N}_data{Data_From_Part_1_Test}_{MODEL_PARA}.jsonl")

notebook3
## PART 3. Claim verification
難度：★★☆☆☆

### 選擇Training Data (依照Part1 生成的1,2,3)

In [None]:
# local libs
from utils import (
    generate_evidence_to_wiki_pages_mapping,
    jsonl_dir_to_df,
    load_json,
    load_model,
    save_checkpoint,
    set_lr_scheduler,
)

In [None]:
# 配合 LS 服用

!ls "checkpoints/sent_retrieval/e20_bs64_2e-05_neg0.03_top5_data2"

In [None]:
Use_Data_From_Part_1 = "3"  #@param {type:"string"}

Par2_Number_of_Sent = "5"  #@param {type:"string"}

Part2_Model_Path = "checkpoints/sent_retrieval/e20_bs64_2e-05_neg0.03_top5_data2"  #@param {type:"string"}

CKPT_DIR = Part2_Model_Path


### 決定給Part3的檔案名稱

MODEL_PARA=CKPT_DIR.split("/")
MODEL_PARA=MODEL_PARA[-1]
print("Model 參數: ",MODEL_PARA)

###


print("使用檔名(Train) : ",f"data/train_doc5sent{Par2_Number_of_Sent}_data{Use_Data_From_Part_1}_{MODEL_PARA}.jsonl")
print("使用檔名(Valid) : ",f"data/dev_doc5sent{Par2_Number_of_Sent}_data{Use_Data_From_Part_1}_{MODEL_PARA}.jsonl")

In [None]:
### Training / validation data written by Part 2 (sentence retrieval)

### always of the form: data/{train,dev}_doc5sent{n}_data{x}_{model}.jsonl

# `Path` is used below, but this notebook's main import cell only runs
# further down; import it here so a fresh Restart & Run All does not fail.
from pathlib import Path

Train_File_From_Part_2 = f"data/train_doc5sent{Par2_Number_of_Sent}_data{Use_Data_From_Part_1}_{MODEL_PARA}.jsonl"
Valid_File_From_Part_2 = f"data/dev_doc5sent{Par2_Number_of_Sent}_data{Use_Data_From_Part_1}_{MODEL_PARA}.jsonl"

# Pickle caches of the evidence-joined DataFrames, one per jsonl file above.
Train_Pkl_From_Part_2 = f"data/train_doc5sent{Par2_Number_of_Sent}_data{Use_Data_From_Part_1}_{MODEL_PARA}.pkl"
Valid_Pkl_From_Part_2 = f"data/dev_doc5sent{Par2_Number_of_Sent}_data{Use_Data_From_Part_1}_{MODEL_PARA}.pkl"

TRAIN_DATA = load_json(Train_File_From_Part_2)
DEV_DATA = load_json(Valid_File_From_Part_2)

TRAIN_PKL_FILE = Path(Train_Pkl_From_Part_2)
DEV_PKL_FILE = Path(Valid_Pkl_From_Part_2)

## 缺檔案時報錯

### 前置
- Step 1: 將證據句和 claim 丟入 BERT 判斷 Supports/Refutes/Not Enough Info。

內含wiki轉換(3min)

直接執行

In [None]:
import pickle
from pathlib import Path
from typing import Dict, Tuple

import numpy as np
import pandas as pd
from pandarallel import pandarallel
from tqdm.auto import tqdm

import torch
from sklearn.metrics import accuracy_score
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_scheduler,
)

from dataset import BERTDataset


pandarallel.initialize(progress_bar=True, verbose=0, nb_workers=4)

Global variables

In [None]:
SEED = 42

LABEL2ID: Dict[str, int] = {
    "supports": 0,
    "refutes": 1,
    "NOT ENOUGH INFO": 2,
}
ID2LABEL: Dict[int, str] = {v: k for k, v in LABEL2ID.items()}



In [None]:
wiki_pages = jsonl_dir_to_df("data/wiki-pages")
mapping = generate_evidence_to_wiki_pages_mapping(wiki_pages,)
del wiki_pages

### mapping : {
  #  "page_name":{
      # "第n句":"文字"
  #  } 
###}

### Helper function

內可選資料集擴充與否

直接執行

In [None]:
class AicupTopkEvidenceBERTDataset(BERTDataset):
    """AICUP dataset with top-k evidence sentences."""

    def __getitem__(
        self,
        idx: int,
        **kwargs,
    ) -> Dict[str, torch.Tensor]:
        """Tokenize the claim together with its evidence for row `idx`.

        Args:
            idx: Positional row index into `self.data`.

        Returns:
            The tokenizer output as tensors, plus a "labels" tensor when the
            row has a "label" field.
        """
        item = self.data.iloc[idx]
        claim = item["claim"]
        evidence = item["evidence_list"]

        # `evidence_list` may be one already-joined string instead of a list.
        # Wrap it first: taking len() of the raw string would count its
        # characters, not its evidence entries, and give a wrong pad count.
        if isinstance(evidence, str):
            evidence = [evidence]

        # In case there are less than topk evidence sentences
        pad = ["[PAD]"] * (self.topk - len(evidence))

        # Build a new list: `evidence += pad` would extend the list object
        # stored inside the DataFrame and silently alter the dataset.
        evidence = [*evidence, *pad]

        # The two extra "[SEP]" entries are kept exactly as the original
        # input format, so existing checkpoints see the same layout.
        concat_claim_evidence = " [SEP] ".join([claim,"[SEP]","[SEP]", *evidence])

        concat = self.tokenizer(
            concat_claim_evidence,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
        )
        concat_ten = {k: torch.tensor(v) for k, v in concat.items()}

        # Rows without a label (e.g. the test set) get no "labels" entry.
        if "label" in item:
            concat_ten["labels"] = torch.tensor(LABEL2ID[item["label"]])

        return concat_ten

Evaluation function

In [None]:
def run_evaluation(model: torch.nn.Module, dataloader: DataLoader, device):
    """Evaluate `model` on `dataloader` and report mean loss and accuracy.

    Args:
        model: Model whose output exposes `.loss` and `.logits`.
        dataloader: Yields dict batches that include a "labels" tensor.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        {"val_loss": summed batch loss / number of batches,
         "val_acc": accuracy over all samples}
    """
    model.eval()

    total_loss = 0
    gold_labels = []
    predicted_labels = []
    with torch.no_grad():
        for cpu_batch in tqdm(dataloader):
            # Collect the gold labels before the batch is moved to `device`.
            gold_labels += cpu_batch["labels"].tolist()

            device_batch = {
                name: tensor.to(device) for name, tensor in cpu_batch.items()
            }
            result = model(**device_batch)
            total_loss += result.loss.item()
            predicted_labels += torch.argmax(result.logits, dim=1).tolist()

    accuracy = accuracy_score(gold_labels, predicted_labels)
    mean_loss = total_loss / len(dataloader)

    return {"val_loss": mean_loss, "val_acc": accuracy}

Prediction

In [None]:
def run_predict(model: torch.nn.Module, test_dl: DataLoader, device) -> list:
    """Predict a class id for every sample in `test_dl`.

    Args:
        model: Model whose output exposes `.logits`.
        test_dl: Yields dict batches of tensors accepted by `model(**batch)`.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        The argmax class id of each sample, in dataloader order.
    """
    model.eval()

    preds = []
    # Inference only: without no_grad() every forward pass records autograd
    # state that is never used (run_evaluation already guards this way).
    with torch.no_grad():
        for batch in tqdm(test_dl,
                          total=len(test_dl),
                          leave=False,
                          desc="Predicting"):
            batch = {k: v.to(device) for k, v in batch.items()}
            pred = model(**batch).logits
            pred = torch.argmax(pred, dim=1)
            preds.extend(pred.tolist())
    return preds

In [None]:
def join_with_topk_evidence(
    df: pd.DataFrame,
    mapping: dict,
    mode: str = "train",
    topk: int = 5,
) -> pd.DataFrame:
    """join_with_topk_evidence join the dataset with topk evidence.

    Adds an `evidence_list` column to `df` (the frame is modified in place and
    also returned) by looking up sentence text in `mapping`.

    Note:
        After extraction, the dataset will be like this:
               id     label         claim                           evidence            evidence_list
        0    4604  supports       高行健...     [[[3393, 3552, 高行健, 0], [...  [高行健 （ ）江西赣州出...
        ..    ...       ...            ...                                ...                     ...
        945  2095  supports       美國總...  [[[1879, 2032, 吉米·卡特, 16], [...  [卸任后 ， 卡特積極參與...
        停各种战争及人質危機的斡旋工作 ， 反对美国小布什政府攻打伊拉克...

        [946 rows x 5 columns]

    Args:
        df (pd.DataFrame): The dataset. The "evidence" column is normalised
            when present; "predicted_evidence" is required when mode is
            "eval", "evidence" is required otherwise.
        mapping (dict): Page title -> {sentence id (str) -> sentence text}.
            Missing pages or sentences resolve to the empty string.
        mode (str, optional): "eval" builds the evidence from
            "predicted_evidence"; any other value builds it from the
            "evidence" column. Defaults to "train".
        topk (int, optional): Number of predicted sentences kept in "eval"
            mode. Not used in the other mode. Defaults to 5.

    Returns:
        pd.DataFrame: The same `df` with the `evidence_list` column added.
            In "eval" mode each value is a single str: the first `topk`
            predicted sentences joined by spaces ("" if the row's
            "predicted_evidence" is not a list).
            Otherwise each value is a list holding at most one str: the
            sentences of the first evidence set joined by spaces ([] if the
            row's "evidence" is not a list).
    """

    # format evidence column to List[List[Tuple[str, str, str, str]]]:
    # a single evidence entry becomes [[x]], a single evidence set becomes
    # [x], and an already fully nested value is left unchanged.
    if "evidence" in df.columns:
        df["evidence"] = df["evidence"].parallel_map(
            lambda x: [[x]] if not isinstance(x[0], list) else [x]
            if not isinstance(x[0][0], list) else x) ### to a 3-level nested list

    print(f"Extracting evidence_list for the {mode} mode ...")
    if mode == "eval":
        # extract evidence: each predicted entry is a (page title, sentence
        # index) pair; the index is converted to str for the mapping lookup.
        df["evidence_list"] = df["predicted_evidence"].parallel_map(lambda x: " ".join([
            mapping.get(evi_id, {}).get(str(evi_idx), "")
            for evi_id, evi_idx in x  # for each evidence list
        ][:topk]) if isinstance(x, list) else "")  ### keep the first topk sentences, joined into one string
        # Preview the first five rows.
        print(df["evidence_list"][:5])
    else:
        # extract evidence: join the sentences of every gold evidence set,
        # then keep only the first set ([:1]).
        df["evidence_list"] = df["evidence"].parallel_map(lambda x: [
            " ".join([  # join evidence
                mapping.get(evi_id, {}).get(str(evi_idx), "")
                for _, _, evi_id, evi_idx in evi_list
            ]) if isinstance(evi_list, list) else ""
            for evi_list in x  # for each evidence list
        ][:1] if isinstance(x, list) else [])

    return df

### 1-1.Train By Bert : 超參數 

Hyperparams

In [None]:
MODEL_NAME = "bert-base-chinese"  #@param {type:"string"}
TRAIN_BATCH_SIZE = 16  #@param {type:"integer"}
TEST_BATCH_SIZE = 16  #@param {type:"integer"}
LR = 1e-5  #@param {type:"number"}
NUM_EPOCHS = 20  #@param {type:"integer"}
MAX_SEQ_LEN = 256  #@param {type:"integer"}
EVIDENCE_TOPK = 5  #@param {type:"integer"}
VALIDATION_STEP = 25  #@param {type:"integer"}


Experiment Directory

### 1-2.Train By Bert : input

In [None]:


# The run's hyperparameters are baked into the directory name so logs and
# checkpoints of different runs never collide.
EXP_DIR = f"claim_verification/e{NUM_EPOCHS}_bs{TRAIN_BATCH_SIZE}_" + f"{LR}_top{EVIDENCE_TOPK}_data{Use_Data_From_Part_1}"
LOG_DIR = "logs/" + EXP_DIR
CKPT_DIR = "checkpoints/" + EXP_DIR

# exist_ok=True makes the cell safe to re-run without a separate
# exists() check (which leaves a gap between the check and the creation).
Path(LOG_DIR).mkdir(parents=True, exist_ok=True)
Path(CKPT_DIR).mkdir(parents=True, exist_ok=True)

In [None]:
if not TRAIN_PKL_FILE.exists():
    train_df = join_with_topk_evidence(
        pd.DataFrame(TRAIN_DATA),
        mapping,
        topk=EVIDENCE_TOPK,
    )
    train_df.to_pickle(TRAIN_PKL_FILE, protocol=4)
else:
    with open(TRAIN_PKL_FILE, "rb") as f:
        train_df = pickle.load(f)

if not DEV_PKL_FILE.exists():
    dev_df = join_with_topk_evidence(
        pd.DataFrame(DEV_DATA),
        mapping,
        mode="eval",
        topk=EVIDENCE_TOPK,
    )
    dev_df.to_pickle(DEV_PKL_FILE, protocol=4)
else:
    with open(DEV_PKL_FILE, "rb") as f:
        dev_df = pickle.load(f)

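### train_df["evidence_list"] is a list holding one string: the sentences of the first gold evidence set joined by spaces
### dev_df["evidence_list"] (mode="eval") is a single string: the top-k predicted sentences joined by spaces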


### 每一個Model因為都有一個新的Training Data，所以要對應一個新的PKL

In [None]:
counts = train_df["label"].value_counts()
print("Now using the following train data with 0 (Negative) and 1 (Positive)")
print(counts)

In [None]:
torch.cuda.empty_cache()

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_dataset = AicupTopkEvidenceBERTDataset(
    train_df,
    tokenizer=tokenizer,
    max_length=MAX_SEQ_LEN,
)
val_dataset = AicupTopkEvidenceBERTDataset(
    dev_df,
    tokenizer=tokenizer,
    max_length=MAX_SEQ_LEN,
)

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=TRAIN_BATCH_SIZE,
)
eval_dataloader = DataLoader(val_dataset, batch_size=TEST_BATCH_SIZE)

### Step 3. Training

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABEL2ID),
)
model.to(device)

optimizer = AdamW(model.parameters(), lr=LR)
num_training_steps = NUM_EPOCHS * len(train_dataloader)
lr_scheduler = set_lr_scheduler(optimizer, num_training_steps)

writer = SummaryWriter(LOG_DIR)

Training (30 mins)

In [None]:
progress_bar = tqdm(range(num_training_steps))
current_steps = 0

strict=False

# Validation accuracy a checkpoint has to beat before it is saved; it is
# raised to the new best accuracy each time a checkpoint is written.
if strict:
    best_val_acc=0.5
else :
    best_val_acc=0.4

for epoch in range(NUM_EPOCHS):
    model.train()

    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        writer.add_scalar("training_loss", loss.item(), current_steps)

        current_steps += 1

        # Validate every VALIDATION_STEP optimizer steps.
        if current_steps % VALIDATION_STEP == 0:
            print("Start validation")
            val_results = run_evaluation(model, eval_dataloader, device)

            # log each metric separately to TensorBoard
            for metric_name, metric_value in val_results.items():
                print(f"{metric_name}: {metric_value}")
                writer.add_scalar(f"{metric_name}", metric_value, current_steps)

            if val_results['val_acc'] > best_val_acc:
                best_val_acc=val_results['val_acc']
                save_checkpoint(
                    model,
                    CKPT_DIR,
                    current_steps,
                    mark=f"val_acc={val_results['val_acc']:.4f}",
                )

            # run_evaluation() switches the model to eval mode; switch back,
            # otherwise the rest of the epoch trains with dropout disabled.
            model.train()

print("Finished training!")

### Step 4. Make your submission

In [None]:
OUTPUT_FILENAME = "submission.jsonl" #@param {type:"string"}


#### 存檔名稱

In [None]:
Use_Data_From_Part_1 = "3"  #@param {type:"string"}

Par2_Number_of_Sent = "5"  #@param {type:"string"}

Part2_Model_Path = "checkpoints/sent_retrieval/e20_bs64_2e-05_neg0.03_top5_data2"  #@param {type:"string"}

CKPT_DIR = Part2_Model_Path


### 決定給Part3的檔案名稱

MODEL_PARA=CKPT_DIR.split("/")
MODEL_PARA=MODEL_PARA[-1]
print("Model 參數: ",MODEL_PARA)

###


print("使用檔名(Test) : ",f"data/test_doc5sent{Par2_Number_of_Sent}_data{Use_Data_From_Part_1}_{MODEL_PARA}.jsonl")

In [None]:
### 選 Part 2 出來的資料集
TEST_DATA = load_json("data/test_doc5sent5_data3.jsonl") #@param {type:"string"}

### 蛤
TEST_PKL_FILE = Path("data/test_doc5sent5.pkl")

In [None]:

if not TEST_PKL_FILE.exists():
    test_df = join_with_topk_evidence(
        pd.DataFrame(TEST_DATA),
        mapping,
        mode="eval",
        topk=EVIDENCE_TOPK,
    )
    test_df.to_pickle(TEST_PKL_FILE, protocol=4)
else:
    with open(TEST_PKL_FILE, "rb") as f:
        test_df = pickle.load(f)

test_dataset = AicupTopkEvidenceBERTDataset(
    test_df,
    tokenizer=tokenizer,
    max_length=MAX_SEQ_LEN,
)
test_dataloader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE)

In [None]:
print(test_df.shape)
test_df.head()

In [None]:
print(test_df.iloc[0]["evidence_list"])
print(len(test_df.iloc[0]["evidence_list"]))

Prediction

In [None]:
### 選Model的路徑

CKPT_DIR="checkpoints/claim_verification/e20_bs32_7e-05_top5" #@param {type:"string"}

### 可以配合 ls 選Model

In [None]:
ls "checkpoints/claim_verification/e20_bs32_7e-05_top5"

In [None]:
### 選Model名稱與 Testing

ckpt_name = "val_acc=0.5466_model.150.pt"  #@param {type:"string"}
model = load_model(model, ckpt_name, CKPT_DIR)
predicted_label = run_predict(model, test_dataloader, device)

In [None]:
### 存Testing檔

predict_dataset = test_df.copy()
predict_dataset["predicted_label"] = list(map(ID2LABEL.get, predicted_label))
predict_dataset[["id", "predicted_label", "predicted_evidence"]].to_json(
    OUTPUT_FILENAME+"_data3",
    orient="records",
    lines=True,
    force_ascii=False,
)

### Step 5. XGBoost (先略過)

In [None]:
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer
import numpy as np

In [None]:
### for XGBoost
from sklearn.decomposition import PCA

def Prepare_XGBoost_Data(dataframe):
    """Add a "concate" column: "[CLS] <claim> [SEP] <evidence_list>".

    The frame is modified in place and nothing is returned. Both the "claim"
    and "evidence_list" values must be strings.

    Args:
        dataframe: DataFrame with "claim" and "evidence_list" columns.
    """
    # Iterate over the values directly: label-based `dataframe["claim"][i]`
    # only works when the index happens to be 0..n-1.
    dataframe["concate"] = [
        "[CLS] " + claim + " [SEP] " + evidence
        for claim, evidence in zip(dataframe["claim"], dataframe["evidence_list"])
    ]


def Go_Through_SBERT(dataframe,dimention): 
    """Embed the "concate" column and reduce the embeddings with PCA.

    A new PCA is fitted on every call, so the outputs of two separate calls
    are projected with independently fitted components.

    Args:
        dataframe: DataFrame with the "concate" and "label" columns.
        dimention: Number of PCA components to keep.

    Returns:
        Tuple of (PCA-reduced embeddings, numpy array of the "label" column).
    """
    # Load the encoder by name through SentenceTransformer.
    # NOTE(review): 'bert-base-chinese' looks like a plain BERT checkpoint
    # rather than an SBERT-trained one - confirm this is intended.
    model = SentenceTransformer('bert-base-chinese')    
    sentences=dataframe["concate"]
    embeddings = model.encode(sentences)

    # return embeddings,np.array(dataframe["label"])

    # Fit the PCA on this call's embeddings only, then project them.
    pca = PCA(n_components=dimention)
    embeddings_reduced = pca.fit_transform(embeddings)

    return embeddings_reduced , np.array(dataframe["label"])

def Go_Through_SBERT_test(dataframe): 
    """Embed the "concate" column for unlabeled data.

    Unlike Go_Through_SBERT, no PCA is applied here: the embeddings are
    returned exactly as the encoder produced them.

    Args:
        dataframe: DataFrame with the "concate" column.

    Returns:
        The embeddings returned by `model.encode`.
    """
    # Load the encoder by name through SentenceTransformer.
    # NOTE(review): 'bert-base-chinese' looks like a plain BERT checkpoint
    # rather than an SBERT-trained one - confirm this is intended.
    model = SentenceTransformer('bert-base-chinese')    
    sentences=dataframe["concate"]
    embeddings = model.encode(sentences)

    return embeddings

In [None]:
if not TRAIN_PKL_FILE.exists():
    train_df = join_with_topk_evidence(
        pd.DataFrame(TRAIN_DATA),
        mapping,
        topk=EVIDENCE_TOPK,
    )
    train_df.to_pickle(TRAIN_PKL_FILE, protocol=4)
else:
    with open(TRAIN_PKL_FILE, "rb") as f:
        train_df = pickle.load(f)

if not DEV_PKL_FILE.exists():
    dev_df = join_with_topk_evidence(
        pd.DataFrame(DEV_DATA),
        mapping,
        mode="eval",
        topk=EVIDENCE_TOPK,
    )
    dev_df.to_pickle(DEV_PKL_FILE, protocol=4)
else:
    with open(DEV_PKL_FILE, "rb") as f:
        dev_df = pickle.load(f)

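### train_df["evidence_list"] is a list holding one string: the sentences of the first gold evidence set joined by spaces
### dev_df["evidence_list"] (mode="eval") is a single string: the top-k predicted sentences joined by spaces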

In [None]:
print(train_df.shape)
print(dev_df.shape)

In [None]:
result = train_df["evidence_list"].map(lambda x: x[0])
train_df["evidence_list"]=result
train_df.head()["evidence_list"]

In [None]:
### Make Piece of XGBoost data

Prepare_XGBoost_Data(train_df)

train_df.head()
###

In [None]:
### Make Piece of XGBoost data

new_df=train_df.iloc[:].reset_index(drop=True)


X_train,Y_train=Go_Through_SBERT(new_df,350)
print(type(X_train),type(Y_train))
print(X_train.shape,Y_train.shape)
Sbert_df=pd.DataFrame(X_train)

print(X_train[0,:10])

###

In [None]:
print(train_df["label"].unique())

In [None]:
temp_y=pd.Series(Y_train)
temp_y=temp_y.map({'NOT ENOUGH INFO':0,'refutes':1,'supports':2})
# print(temp_y[:20])
temp_y.to_csv("Part_3_trainning_3_y.csv")

Y_train=temp_y.values

In [None]:
print(Y_train)

In [None]:
### Make Piece of XGBoost data

Sbert_df.to_csv("Part_3_trainning_3_x.csv")

###

In [None]:
from xgboost import XGBClassifier

# 建立XGBClassifier模型
xgboostModel = XGBClassifier(n_estimators=150, learning_rate= 0.2,max_depth=5) #max_depth
# 使用訓練資料訓練模型
xgboostModel.fit(X_train, Y_train)


In [None]:
print('訓練集: ',xgboostModel.score(X_train,Y_train))

In [None]:
dev_df.iloc[2]["evidence_list"]

In [None]:
# Flatten `evidence_list` to one string per row. Values that are already a
# string are kept as they are: " ".join() on a str would insert a space
# between every character (and re-running this cell would do so again).
result = dev_df["evidence_list"].map(
    lambda x: x if isinstance(x, str) else " ".join(x))
dev_df["evidence_list"]=result
# dev_df.head()["evidence_list"]
print(dev_df.iloc[:10]["evidence_list"].values)

In [None]:
### Make Piece of XGBoost data

Prepare_XGBoost_Data(dev_df)

dev_df.head()
###

In [None]:
### Make Piece of XGBoost data

new_df=dev_df.iloc[:].reset_index(drop=True)


X_test,Y_test=Go_Through_SBERT(new_df,350)
print(X_test.shape,Y_test.shape)
Sbert_df=pd.DataFrame(X_test)

print(X_test[0,:10])

###

In [None]:
temp_y=pd.Series(Y_test)
temp_y=temp_y.map({'NOT ENOUGH INFO':0,'refutes':1,'supports':2})
# print(temp_y[:20])
temp_y.to_csv("Part_3_testing_3_y.csv")

Y_test=temp_y.values

In [None]:
### Make Piece of XGBoost data

Sbert_df.to_csv("Part_3_testing_3_x.csv")

###

In [None]:
print('訓練集: ',xgboostModel.score(X_test,Y_test))

In [None]:
from xgboost import XGBClassifier
import pickle

best_n_tree=0
best_max_depth=0
best_model="None"
best_test_acc = 0.43

for n_tree in [25,30,40,45,50,75,100,110,125,150,180,210]:
  for depth in [3,4,5,6,7,8]:
    xgboostModel = XGBClassifier(n_estimators=n_tree, learning_rate= 0.2,max_depth=depth) #max_depth
    xgboostModel.fit(X_train, Y_train)

    test_score=xgboostModel.score(X_test,Y_test)

    print('Train: ',xgboostModel.score(X_train,Y_train),' Test: ',test_score, " With n_tree=",n_tree," ;depth=",depth,end="   ")

    if test_score > best_test_acc:
      best_model=xgboostModel
      best_n_tree=n_tree
      best_max_depth=depth
      best_test_acc=test_score
      print("[ New Record ]",end=" ")
      with open('xgboost_model_'+str(n_tree)+'_'+str(depth)+'.pkl', 'wb') as f:
        pickle.dump(xgboostModel, f)
    

    print("")

print("\n\nBest model:")
print("Best n_tree : ", best_n_tree)
print("Best max_depth : ", best_max_depth)

In [None]:
pwd

In [None]:
!ls | grep pkl

In [None]:
# 載入模型

import pickle

with open('xgboost_model_40_5.pkl', 'rb') as f:
    xgboostModel = pickle.load(f)

In [None]:
# A single `evidence_list` cell is a plain str (or list), which has no
# `.values` attribute; print the cell itself.
print(test_df.iloc[0]["evidence_list"])
test_df.head()

In [None]:
# Flatten `evidence_list` to one string per row. Values that are already a
# string are kept as they are: " ".join() on a str would insert a space
# between every character (and re-running this cell would do so again).
result = test_df["evidence_list"].map(
    lambda x: x if isinstance(x, str) else " ".join(x))
test_df["evidence_list"]=result
# dev_df.head()["evidence_list"]
print(test_df.iloc[:10]["evidence_list"].values)

In [None]:
### Make Piece of XGBoost data

Prepare_XGBoost_Data(test_df)

test_df.head()
###

In [None]:
### Make Piece of XGBoost data

new_df=test_df.iloc[:].reset_index(drop=True)


X_test=Go_Through_SBERT_test(test_df)
print(X_test.shape)
Sbert_df=pd.DataFrame(X_test)

print(X_test[0,:10])

###

In [None]:
### Make Piece of XGBoost data

Sbert_df.to_csv("Part_3_testing_3_x_data1.csv")

###

In [None]:
proba = xgboostModel.predict_proba(X_test)
print(proba.shape)
 



In [None]:
print(proba[:10,:])