## 論文閱讀問答機器人 - 索引階段 Index Phase
步驟一: 索引階段 Index Phase

In [2]:
# import platform

# system_type = platform.system()

# Directory the paper archive is extracted into and that is scanned for .tex files.
target_dir = './paper'
# Directory where the chunk texts and their embeddings are saved at the end.
data_dir = './data'
# Token threshold used when merging lines into chunks (see process_segments).
chunk_size = 300

### 解壓縮 GPT-4 的論文檔案

In [None]:
# Windows
!rmdir /s /q paper
!mkdir -p paper
!tar -xzvf "arXiv-2303.08774v6.tar.gz" -C ./paper

In [16]:
%%bash

# Linux
# Start from a clean ./paper directory, then unpack the paper archive into it.
rm -rf ./paper
mkdir -p paper
tar -xzvf "arXiv-2303.08774v6.tar.gz" -C ./paper

### 1. 走訪資料夾中所有的 LaTeX 文件 (.tex)

In [6]:
import os

def iter_tex(data_dir):
    """Yield the path of every ``.tex`` file found under ``data_dir``.

    The directory tree is walked recursively with ``os.walk``; each yielded
    path is the walked directory joined with the file name.
    """
    for root, _subdirs, names in os.walk(data_dir):
        yield from (
            os.path.join(root, name) for name in names if name.endswith(".tex")
        )

# Iterate over the .tex files and print the full path of each one
for full_path in iter_tex(target_dir):
    print(full_path)

./paper\main.tex
./paper\safety.tex


### 2. 讀取每份文件的內容並且開始切割區塊 (Chunk)
其實這個切 Chunk 的步驟有相當多細節可以講究，但這邊我們先簡單依照換行符號進行切割即可，並且將排版用的雙空格換成單空格。

In [3]:
def get_segments(full_path):
    """Read a UTF-8 text file and return its lines with space runs collapsed.

    The whole file content is stripped of leading/trailing whitespace, every
    run of two or more spaces is reduced to a single space, and the result is
    split on newline characters.
    """
    with open(full_path, mode="rt", encoding="UTF-8") as tex_file:
        content = tex_file.read().strip()

    # After strip() the text neither starts nor ends with a space, so dropping
    # the empty pieces produced by consecutive spaces and re-joining with one
    # space collapses every run of spaces to exactly one.
    collapsed = " ".join(piece for piece in content.split(" ") if piece)
    return collapsed.split("\n")

# Iterate over the .tex files and print the segments split from each one
for full_path in iter_tex(target_dir):
    segments = get_segments(full_path)
    print(segments)


['\\documentclass{article}', '\\PassOptionsToPackage{numbers,compress}{natbib}', '', '\\usepackage[final]{neurips_2021}', '', '\\usepackage[utf8]{inputenc} %', '\\usepackage[T1]{fontenc} %', '\\usepackage[hidelinks]{hyperref} %', '\\usepackage{url} %', '\\usepackage{booktabs} %', '\\usepackage{amsfonts} %', '\\usepackage{nicefrac} %', '\\usepackage{microtype} %', '\\usepackage{xcolor} %', '\\usepackage{graphicx}', '\\usepackage{longtable}', '\\usepackage{caption}', '\\usepackage{mdframed}', '\\usepackage{subcaption}', '\\usepackage{multirow}', '\\usepackage{placeins}', '\\usepackage{multicol}', '\\usepackage{makecell}', '\\usepackage[normalem]{ulem} %', '\\usepackage{wrapfig}', '\\usepackage[percent]{overpic}', '\\usepackage{lipsum}', '\\usepackage{csquotes}', '\\usepackage[OT2,T1]{fontenc}', '\\usepackage[english]{babel}', '\\usepackage{devanagari}', '\\usepackage{tablefootnote}', '\\usepackage{pdfpages}', '\\captionsetup[table]{skip=8pt}', '', '\\usepackage{macros}', '', '\\newmdenv[

### 3. 處理 Special Token 問題，並計算每個區塊有多少 Tokens
* 使用 tiktoken 套件提供的 Tokenizer 來計算
* 因為在 GPT-4 的論文中，剛好用到了 Tokenizer 內建的 Special Token，如果直接 Encode 會跳錯誤，所以我們要設定 `disallowed_special=()` 的參數
    * 這個機制是為了避免使用者透過 **提示注入攻擊 (Prompt Injection)** 來「越獄」語言模型的設計。
    * 因為語言模型通常都對這種 Special Token 非常敏感，當 Special Token 的出現不正常時，就有可能破壞語言模型原本的行為。


#### 了解整篇論文共有多少 token

In [None]:
import os
import tiktoken

# Load tiktoken's cl100k_base encoding.
tk = tiktoken.get_encoding("cl100k_base")

# Print the total token count of each .tex file.
for full_path in iter_tex(target_dir):
    with open(full_path, "rt", encoding="UTF-8") as fp:
        txt = fp.read()
        # disallowed_special=() encodes special-token text as ordinary text instead of raising.
        print(len(tk.encode(txt, disallowed_special=())))

31309
2156


#### 比較差異 (Special Token)

In [4]:
import tiktoken
import traceback

tk = tiktoken.get_encoding("cl100k_base")

# By default encode() refuses text that matches a special token.
try:
    print(tk.encode("<|endofprompt|>"))
except ValueError as e:
    # Only the expected "disallowed special token" error is caught here;
    # any other failure should surface instead of being printed as if expected.
    print(str(e))
    # traceback.print_exc()

print(tk.encode("<|endofprompt|>", disallowed_special=()))
# Encoded as ordinary text - [27, 91, 408, 1073, 41681, 91, 29]

print(tk.encode("<|endofprompt|>", allowed_special="all"))
# Encoded as the special token - [100276]

Encountered text corresponding to disallowed special token '<|endofprompt|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endofprompt|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endofprompt|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.

[27, 91, 408, 1073, 41681, 91, 29]
[100276]


#### 計算文本 Token 數量

In [5]:
def calc_tokens(tk: tiktoken.Encoding, seg: str):
    """Return the number of tokens ``seg`` encodes to with the encoder ``tk``.

    ``disallowed_special=()`` makes the encoder treat the whole text as
    ordinary text, so nothing in it is encoded as a special token.
    """
    return len(tk.encode(seg, disallowed_special=()))


# Pair every line with its token count and print the [count, text] pairs of each file
for full_path in iter_tex(target_dir):
    segments = get_segments(full_path)
    segments = [[calc_tokens(tk, seg), seg] for seg in segments]
    print(segments)

[[6, '\\documentclass{article}'], [13, '\\PassOptionsToPackage{numbers,compress}{natbib}'], [0, ''], [13, '\\usepackage[final]{neurips_2021}'], [0, ''], [11, '\\usepackage[utf8]{inputenc} %'], [10, '\\usepackage[T1]{fontenc} %'], [11, '\\usepackage[hidelinks]{hyperref} %'], [7, '\\usepackage{url} %'], [8, '\\usepackage{booktabs} %'], [8, '\\usepackage{amsfonts} %'], [8, '\\usepackage{nicefrac} %'], [8, '\\usepackage{microtype} %'], [7, '\\usepackage{xcolor} %'], [7, '\\usepackage{graphicx}'], [7, '\\usepackage{longtable}'], [6, '\\usepackage{caption}'], [8, '\\usepackage{mdframed}'], [7, '\\usepackage{subcaption}'], [7, '\\usepackage{multirow}'], [7, '\\usepackage{placeins}'], [7, '\\usepackage{multicol}'], [7, '\\usepackage{makecell}'], [11, '\\usepackage[normalem]{ulem} %'], [7, '\\usepackage{wrapfig}'], [9, '\\usepackage[percent]{overpic}'], [7, '\\usepackage{lipsum}'], [7, '\\usepackage{csquotes}'], [12, '\\usepackage[OT2,T1]{fontenc}'], [8, '\\usepackage[english]{babel}'], [9, '\\

段落非常零散！

1. 零散的 Chunk 會造成上下文語意被嚴重截斷，效果通常不會很好。
    * 將零散的小 Chunk 合併為一個大的 Chunk，但又不會大到模型塞不下。
2. 如何決定一個 Chunk 有多大呢？
    * 根據**模型能力與可負擔的成本**而有變化。 
    * 另外太多或太長的 Chunk 模型未必處理得來，有時反而會因為雜訊太多而影響回答的品質。
    * 筆者希望每個 Request 平均只需要消耗 2000 個 Tokens :
        * 預留 500 Tokens 給模型輸出。
        * 剩下 1500 給搜尋到的 Chunks 分。
        * 假設我們每次取 5 個 Chunks 來用。這樣每個 Chunk 就是 1500 / 5 = 300 個 Tokens 可用。


#### 按照剛剛算出來的 Chunk Size 來設計區塊合併的演算法

In [6]:
def process_segments(segments: list[tuple[int, str]], chunk_size) -> list[str]:
    """Greedily merge consecutive segments into chunks below ``chunk_size``.

    Walking the segments in order, a segment is appended to the current chunk
    (joined with a newline) while the summed token counts stay strictly below
    ``chunk_size``; otherwise it starts a new chunk. A single segment that is
    already at or above ``chunk_size`` is kept as its own chunk, not split.

    Args:
        segments: ``(token_count, text)`` pairs in document order. Tuples and
            two-item lists are both accepted; the input is never modified.
        chunk_size: Exclusive upper bound for the summed token counts of one
            merged chunk.

    Returns:
        The merged chunk texts, each stripped of surrounding whitespace.
    """
    segments = list(segments)
    print(f"Original Segments: {len(segments)}")

    chunks = []
    chunk_len = 0
    chunk_parts = []
    for seg_len, seg_txt in segments:
        # Extend the current chunk while the combined length stays below the limit.
        # NOTE: the newline used to join segments is not included in the count.
        if chunk_parts and chunk_len + seg_len < chunk_size:
            chunk_len += seg_len
            chunk_parts.append(seg_txt)
            continue

        # Limit reached: close the current chunk and start a new one here.
        if chunk_parts:
            chunks.append("\n".join(chunk_parts).strip())
        chunk_len = seg_len
        chunk_parts = [seg_txt]

    if chunk_parts:
        chunks.append("\n".join(chunk_parts).strip())

    print(f"Processed Segments: {len(chunks)}")
    return chunks


# Merge each file's token-counted lines into chunks and print the merged chunks
for full_path in iter_tex(target_dir):
    segments = get_segments(full_path)
    segments = [[calc_tokens(tk, seg), seg] for seg in segments]
    segments = process_segments(segments, chunk_size)
    print(segments)

Original Segments: 1248
Processed Segments: 117
['\\documentclass{article}\n\\PassOptionsToPackage{numbers,compress}{natbib}\n\n\\usepackage[final]{neurips_2021}\n\n\\usepackage[utf8]{inputenc} %\n\\usepackage[T1]{fontenc} %\n\\usepackage[hidelinks]{hyperref} %\n\\usepackage{url} %\n\\usepackage{booktabs} %\n\\usepackage{amsfonts} %\n\\usepackage{nicefrac} %\n\\usepackage{microtype} %\n\\usepackage{xcolor} %\n\\usepackage{graphicx}\n\\usepackage{longtable}\n\\usepackage{caption}\n\\usepackage{mdframed}\n\\usepackage{subcaption}\n\\usepackage{multirow}\n\\usepackage{placeins}\n\\usepackage{multicol}\n\\usepackage{makecell}\n\\usepackage[normalem]{ulem} %\n\\usepackage{wrapfig}\n\\usepackage[percent]{overpic}\n\\usepackage{lipsum}\n\\usepackage{csquotes}\n\\usepackage[OT2,T1]{fontenc}\n\\usepackage[english]{babel}\n\\usepackage{devanagari}\n\\usepackage{tablefootnote}\n\\usepackage{pdfpages}\n\\captionsetup[table]{skip=8pt}\n\n\\usepackage{macros}\n\n\\newmdenv[\n font=\\ttfamily\\small,

### 4. 觀察處理完的 Chunk 是否正確

In [12]:
# Windows
# Recreate an empty "segments" folder; dump_segments writes its files there.
!rmdir /s /q segments
!mkdir segments

In [None]:
%%bash

# Linux
# Recreate an empty "segments" directory; dump_segments writes its files there.
rm -rf segments
mkdir -p segments

In [17]:
# 用來取得隨機檔名
from tempfile import NamedTemporaryFile as NTF

def dump_segments(segments, out_dir="./segments"):
    """Write chunks to a new, randomly named text file for manual inspection.

    Each chunk is framed by ``=== Chunk i Begin ===`` / ``=== Chunk i End ===``
    marker lines. The path of the created file is printed.

    Args:
        segments: Iterable of chunk strings to dump.
        out_dir: Directory the dump file is created in; it is created when
            missing. Defaults to ``./segments``.
    """
    import os  # local import: keeps the function independent of earlier cells

    os.makedirs(out_dir, exist_ok=True)

    # Explicit UTF-8 matches how the .tex files are read; the platform default
    # encoding may be unable to represent every character in the chunks.
    with NTF("wt", dir=out_dir, delete=False, encoding="UTF-8") as fp:
        print(fp.name)
        for i, seg in enumerate(segments):
            fp.write(f"=== Chunk {i} Begin ===\n")
            fp.write(f"{seg}\n")
            fp.write(f"=== Chunk {i} End ===\n\n")

# Collect the chunks of every .tex file, dumping them per file and then all together.
chunks = []
for full_path in iter_tex(target_dir):
    segments = [[calc_tokens(tk, seg), seg] for seg in get_segments(full_path)]
    segments = process_segments(segments, chunk_size)
    dump_segments(segments)
    chunks += segments

print(chunks)
dump_segments(chunks)

Original Segments: 1248
Processed Segments: 117
d:\Documents\學習\LLM\project\latex_paper_qa\segments\tmp6n7mjib8
Original Segments: 117
Processed Segments: 8
d:\Documents\學習\LLM\project\latex_paper_qa\segments\tmpryqyio7d
['\\documentclass{article}\n\\PassOptionsToPackage{numbers,compress}{natbib}\n\n\\usepackage[final]{neurips_2021}\n\n\\usepackage[utf8]{inputenc} %\n\\usepackage[T1]{fontenc} %\n\\usepackage[hidelinks]{hyperref} %\n\\usepackage{url} %\n\\usepackage{booktabs} %\n\\usepackage{amsfonts} %\n\\usepackage{nicefrac} %\n\\usepackage{microtype} %\n\\usepackage{xcolor} %\n\\usepackage{graphicx}\n\\usepackage{longtable}\n\\usepackage{caption}\n\\usepackage{mdframed}\n\\usepackage{subcaption}\n\\usepackage{multirow}\n\\usepackage{placeins}\n\\usepackage{multicol}\n\\usepackage{makecell}\n\\usepackage[normalem]{ulem} %\n\\usepackage{wrapfig}\n\\usepackage[percent]{overpic}\n\\usepackage{lipsum}\n\\usepackage{csquotes}\n\\usepackage[OT2,T1]{fontenc}\n\\usepackage[english]{babel}\n\\

### 5. 將這些切好的 Chunks 投入 OpenAI Embedding API

In [None]:
import numpy as np

import openai
from dotenv import load_dotenv

load_dotenv()  # .env

def create_embeddings(chunks):
    """Embed ``chunks`` with ``text-embedding-ada-002`` and return a NumPy array.

    All chunks are sent in a single ``openai.Embedding.create`` request. The
    ``embedding`` entry of every item in the response's ``data`` list is
    collected, in response order, into one array whose shape is printed.

    NOTE(review): ``openai.Embedding`` looks like the pre-1.0 SDK interface;
    confirm the installed ``openai`` version still provides it.
    """
    response = openai.Embedding.create(
        input=chunks,
        model="text-embedding-ada-002",
    )
    vectors = np.array([record["embedding"] for record in response["data"]])

    print(f"Embedding Shape: {vectors.shape}")

    return vectors

### 6. 將 Chunks 與 Embeddings 存起來，完成我們的索引階段

In [None]:
import json

def dump_data(chunks, embs, data_dir):
    """Persist the chunk texts and their embeddings under ``data_dir``.

    Writes ``chunks.json`` (UTF-8, non-ASCII characters kept as-is) and
    ``embs.npy`` (via ``np.save``). ``data_dir`` is created when it does not
    exist yet, so the function also works on a fresh checkout.

    Args:
        chunks: JSON-serialisable chunk texts.
        embs: Array of embeddings to store with ``np.save``.
        data_dir: Target directory for both files.
    """
    import os  # local import: keeps this cell independent of earlier ones

    os.makedirs(data_dir, exist_ok=True)

    with open(os.path.join(data_dir, "chunks.json"), "wt", encoding="UTF-8") as fp:
        json.dump(chunks, fp, ensure_ascii=False)
    np.save(os.path.join(data_dir, "embs.npy"), embs)

# Embed all collected chunks, then save the chunks and their embeddings under data_dir.
embs = create_embeddings(chunks)
dump_data(chunks, embs, data_dir)