Commit

LIST INDEX OUT OF RANGE
Fixes #8
3Alan committed May 3, 2023
1 parent 917492e commit 4c69c6a
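
Judging from the diff, the crash came from the old `custom_loader.py` indexing `headings[-1]` on documents that contain no `h1`-`h3` headings; the rewritten loader falls back to the document's first element and walks sibling nodes chunk by chunk instead.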
Showing 18 changed files with 1,947 additions and 1,910 deletions.
3 changes: 2 additions & 1 deletion client/package.json
@@ -25,7 +25,8 @@
     "query-string": "^8.1.0",
     "react": "^18.2.0",
     "react-dom": "^18.2.0",
-    "react-router-dom": "^6.9.0"
+    "react-router-dom": "^6.9.0",
+    "sass": "^1.62.1"
   },
   "devDependencies": {
     "@types/canvas-confetti": "^1.6.0",
2 changes: 1 addition & 1 deletion client/src/App.tsx
@@ -1,4 +1,4 @@
-import './styles/globals.css';
+import './styles/globals.scss';
 import 'github-markdown-css/github-markdown-light.css';
 import { BrowserRouter, Route, Routes } from 'react-router-dom';
 import routes from './routes';
23 changes: 20 additions & 3 deletions client/src/styles/globals.css → client/src/styles/globals.scss
@@ -30,9 +30,26 @@ body {
   overflow: hidden;
 }
 
-.markdown-body .hl-source {
-  background-color: #eff6ff;
-  border-radius: 6px;
-}
+.markdown-body {
+  table {
+    &.hl-source {
+      border-radius: 0;
+
+      tr, td {
+        background-color: #eff6ff;
+        border-radius: 0;
+      }
+    }
+  }
+
+  .hl-source {
+    background-color: #eff6ff;
+    border-radius: 6px;
+
+    pre {
+      background-color: #eff6ff !important;
+    }
+  }
+}
 
 .markdown-body {
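
Two things are worth noting in the new stylesheet. Nested SCSS with `&` flattens at compile time, so `table { &.hl-source { … } }` becomes the plain selector `table.hl-source`: highlighted tables drop their rounded corners while their `tr`/`td` cells pick up the highlight color. The `!important` on the `pre` background is presumably there to override the `pre` background that github-markdown-css applies inside `.markdown-body`.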
18 changes: 16 additions & 2 deletions client/yarn.lock
@@ -1023,7 +1023,7 @@ chalk@^4.0.0:
     ansi-styles "^4.1.0"
     supports-color "^7.1.0"
 
-chokidar@^3.5.3:
+"chokidar@>=3.0.0 <4.0.0", chokidar@^3.5.3:
   version "3.5.3"
   resolved "https://registry.npmjs.org/chokidar/-/chokidar-3.5.3.tgz#1cf37c8707b932bd1af1ae22c0432e2acd1903bd"
   integrity sha512-Dr3sfKRP6oTcjf2JmUmFJfeVMvXBdegxB0iVQ5eb2V10uFJUCAS8OByZdVAyVb8xXNz3GjjTgj9kLWsZTqE6kw==
@@ -1841,6 +1841,11 @@ ignore@^5.2.0:
   resolved "https://registry.npmjs.org/ignore/-/ignore-5.2.4.tgz#a291c0c6178ff1b960befe47fcdec301674a6324"
   integrity sha512-MAb38BcSbH0eHNBxn7ql2NH/kX33OkB3lZ1BNdh7ENeRChHTYsTvWrMubiIAMNS2llXEEgZ1MUOBtXChP3kaFQ==
 
+immutable@^4.0.0:
+  version "4.3.0"
+  resolved "https://registry.npmjs.org/immutable/-/immutable-4.3.0.tgz#eb1738f14ffb39fd068b1dbe1296117484dd34be"
+  integrity sha512-0AOCmOip+xgJwEVTQj1EfiDDOkPmuyllDuTuEX+DDXUgapLAsBIfkg3sxCYyCEA8mQqZrrxPUGjcOQ2JS3WLkg==
+
 import-fresh@^3.0.0, import-fresh@^3.2.1:
   version "3.3.0"
   resolved "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.0.tgz#37162c25fcb9ebaa2e6e53d5b4d88ce17d9e0c2b"
@@ -3032,6 +3037,15 @@ safe-regex-test@^1.0.0:
     get-intrinsic "^1.1.3"
     is-regex "^1.1.4"
 
+sass@^1.62.1:
+  version "1.62.1"
+  resolved "https://registry.npmjs.org/sass/-/sass-1.62.1.tgz#caa8d6bf098935bc92fc73fa169fb3790cacd029"
+  integrity sha512-NHpxIzN29MXvWiuswfc1W3I0N8SXBd8UR26WntmDlRYf0bSADnwnOjsyMZ3lMezSlArD33Vs3YFhp7dWvL770A==
+  dependencies:
+    chokidar ">=3.0.0 <4.0.0"
+    immutable "^4.0.0"
+    source-map-js ">=0.6.2 <2.0.0"
+
 scheduler@^0.23.0:
   version "0.23.0"
   resolved "https://registry.npmjs.org/scheduler/-/scheduler-0.23.0.tgz#ba8041afc3d30eb206a487b6b384002e4e61fdfe"
@@ -3089,7 +3103,7 @@ slash@^4.0.0:
   resolved "https://registry.npmjs.org/slash/-/slash-4.0.0.tgz#2422372176c4c6c5addb5e2ada885af984b396a7"
   integrity sha512-3dOsAHXXUkQTpOYcoAxLIorMTp4gIQr5IW3iVb7A7lFIp0VHhnynm9izx6TssdrIcVIESAlVjtnO2K8bg+Coew==
 
-source-map-js@^1.0.2:
+"source-map-js@>=0.6.2 <2.0.0", source-map-js@^1.0.2:
   version "1.0.2"
   resolved "https://registry.npmjs.org/source-map-js/-/source-map-js-1.0.2.tgz#adbc361d9c62df380125e7f161f71c826f1e490c"
   integrity sha512-R0XvVJ9WusLiqTCEiGCmICCMplcCkIwwR11mOSD9CR5u+IXYdiseeEuXCVAjS54zqwkLcPNnmU4OeJ6tUrWhDw==
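
The quoted range keys (`"chokidar@>=3.0.0 <4.0.0"`, `"source-map-js@>=0.6.2 <2.0.0"`) are the ranges `sass` declares; yarn merged them into the existing resolutions instead of installing duplicates, so `immutable` and `sass` itself are the only genuinely new packages in the lockfile.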
2 changes: 1 addition & 1 deletion server/app.py
@@ -146,7 +146,7 @@ def query_index():
 
     res = index.query(query_text, streaming=True)
     cost = embed_model.last_token_usage + llm_predictor.last_token_usage
-    sources = [{"extraInfo": x.extra_info} for x in res.source_nodes]
+    sources = [{"extraInfo": x.node.extra_info} for x in res.source_nodes]
 
     def response_generator():
         yield json.dumps({"cost": cost, "sources": sources})
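
In the llama_index version pinned here, the entries of `res.source_nodes` appear to be wrapper objects that carry the underlying node alongside scoring data, so `extra_info` has to be read from `x.node` rather than from the wrapper itself.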
194 changes: 66 additions & 128 deletions server/custom_loader.py
@@ -2,7 +2,6 @@
 
 import tiktoken
 from bs4 import BeautifulSoup
-from bs4.element import NavigableString
 from llama_index.readers.base import BaseReader
 from llama_index.readers.schema.base import Document
 
@@ -26,149 +25,88 @@ def num_tokens_from_string(string: str, encoding_name: str = "p50k_base") -> int
     return num_tokens
 
 
+def split_text_to_doc(
+    text: str, current_chunk_id, chunk_size: int = 400
+) -> List[Document]:
+    """Split text into chunks of a given size."""
+    chunks = []
+    token_len = num_tokens_from_string(text)
+
+    for i in range(0, token_len, chunk_size):
+        encode_text = encode_string(text)
+        decode_text = decode_string(encode_text[i : i + chunk_size]).strip()
+        chunks.append(
+            Document(
+                decode_text,
+                extra_info={"chunk_id": f"chunk-{current_chunk_id}"},
+            )
+        )
+
+    return chunks
+
+
 class CustomReader(BaseReader):
     def __init__(self, *args: Any, **kwargs: Any) -> None:
         """Init params."""
         super().__init__(*args, **kwargs)
 
     def load_data(self, html, filename) -> List[Document]:
         # Parse the HTML
         soup = BeautifulSoup(html, "html.parser")
-
-        # Find all heading tags
-        headings = soup.find_all(["h1", "h2", "h3"])
-
-        # Length limit per chunk (in tokens)
-        chunk_size = 400
-        document_list = []
-        index = 1
-
-        for i in range(len(headings) - 1):
-            start = headings[i]
-            end = headings[i + 1]
-            start["data-chunk_id"] = f"chunk-{index}"
-            content = start.next_elements
-            chunk_text = ""
-
-            for elem in content:
-                trim_text = elem.get_text().strip()
-                if not trim_text:
-                    continue
-                # Text node
-                if isinstance(elem, NavigableString):
-                    token_len = num_tokens_from_string(trim_text)
-                    if (
-                        num_tokens_from_string(chunk_text, "p50k_base") + token_len + 1
-                        < chunk_size
-                    ):
-                        chunk_text = f"{chunk_text} {trim_text}"
-                    elif token_len > chunk_size:
-                        # A single node already exceeds chunk_size: flush the current chunk_text and start a new chunk
-                        index = index + 1
-                        document_list.append(
-                            Document(
-                                chunk_text.strip(),
-                                extra_info={"chunk_id": f"chunk-{index}"},
-                            )
-                        )
-                        for i in range(0, token_len, chunk_size):
-                            encode_text = encode_string(trim_text)
-                            decode_text = decode_string(
-                                encode_text[i : i + chunk_size]
-                            ).strip()
-                            document_list.append(
-                                Document(
-                                    decode_text,
-                                    extra_info={"chunk_id": f"chunk-{index}"},
-                                )
-                            )
-                    else:
-                        # The chunk has grown past chunk_size, so start a new chunk
-                        index = index + 1
-                        document_list.append(
-                            Document(
-                                chunk_text.strip(),
-                                extra_info={"chunk_id": f"chunk-{index}"},
-                            )
-                        )
-                        chunk_text = ""
-                # Non-text node
-                else:
-                    elem["data-chunk_id"] = f"chunk-{index}"
-                if elem == end:
-                    document_list.append(
-                        Document(
-                            chunk_text.strip(),
-                            extra_info={"chunk_id": f"chunk-{index}"},
-                        )
-                    )
-                    chunk_text = ""
-                    index = index + 1
-                    elem["data-chunk_id"] = f"chunk-{index}"
-                    break
-
-        # TODO:
-        # if len(headings) == 0:
-        #     UnstructuredReader = download_loader("UnstructuredReader")
-        #     loader = UnstructuredReader()
-        #     documents = loader.load_data(f"{staticPath}/html/{filename}.html")
-        #     return documents
-
-        start = headings[-1]
-        start["data-chunk_id"] = f"chunk-{index}"
-        content = start.next_elements
-        chunk_text = ""
-
-        for elem in content:
-            trim_text = elem.get_text().strip()
-            if not trim_text:
-                continue
-            # Text node
-            if isinstance(elem, NavigableString):
-                token_len = num_tokens_from_string(trim_text)
-                if (
-                    num_tokens_from_string(chunk_text, "p50k_base") + token_len + 1
-                    < chunk_size
-                ):
-                    chunk_text = f"{chunk_text} {trim_text}"
-                elif token_len > chunk_size:
-                    # A single node already exceeds chunk_size: flush the current chunk_text and start a new chunk
-                    index = index + 1
-                    document_list.append(
-                        Document(
-                            chunk_text.strip(),
-                            extra_info={"chunk_id": f"chunk-{index}"},
-                        )
-                    )
-                    chunk_text = ""
-
-                    for i in range(0, token_len, chunk_size):
-                        encode_text = encode_string(trim_text)
-                        decode_text = decode_string(
-                            encode_text[i : i + chunk_size]
-                        ).strip()
-                        document_list.append(
-                            Document(
-                                decode_text, extra_info={"chunk_id": f"chunk-{index}"}
-                            )
-                        )
-                else:
-                    # The chunk has grown past chunk_size, so start a new chunk
-                    index = index + 1
-                    document_list.append(
-                        Document(
-                            chunk_text.strip(),
-                            extra_info={"chunk_id": f"chunk-{index}"},
-                        )
-                    )
-                    chunk_text = ""
-            # Non-text node
-            else:
-                elem["data-chunk_id"] = f"chunk-{index}"
-
-        document_list.append(
-            Document(chunk_text.strip(), extra_info={"chunk_id": f"chunk-{index}"})
-        )
+        current_chunk_text = ""
+        current_chunk_id = 1
+        document_list = []
+        # Measured in tokens; OpenAI caps requests at 4097, which leaves room for roughly 6 turns of continuous conversation
+        current_chunk_length = 0
+        chunk_size = 400
+
+        # Only the first three heading levels are treated as headings; everything else is handled as paragraph content
+        headings = ["h1", "h2", "h3"]
+        headingDoms = soup.find_all(headings)
+
+        if len(headingDoms) == 0:
+            headingDoms = [soup.find()]
+
+        for tag in headingDoms:
+            tag["data-chunk_id"] = f"chunk-{current_chunk_id}"
+            current_chunk_text = tag.text.strip()
+
+            # Walk sibling nodes only; do not recurse into children
+            next_tag = tag.find_next_sibling()
+            while next_tag and next_tag.name not in headings:
+                stripped_text = next_tag.text.strip()
+
+                if (
+                    current_chunk_length + num_tokens_from_string(stripped_text)
+                    > chunk_size
+                ):
+                    document_list.append(
+                        Document(
+                            current_chunk_text.strip(),
+                            extra_info={"chunk_id": f"chunk-{current_chunk_id}"},
+                        )
+                    )
+                    current_chunk_text = ""
+                    current_chunk_length = 0
+                    current_chunk_id += 1
+
+                    document_list += split_text_to_doc(stripped_text, current_chunk_id)
+
+                else:
+                    current_chunk_text = f"{current_chunk_text} {stripped_text}"
+                    current_chunk_length += num_tokens_from_string(stripped_text) + 1
+
+                next_tag["data-chunk_id"] = f"chunk-{current_chunk_id}"
+                next_tag = next_tag.find_next_sibling()
+
+            document_list.append(
+                Document(
+                    current_chunk_text.strip(),
+                    extra_info={"chunk_id": f"chunk-{current_chunk_id}"},
+                )
+            )
+            current_chunk_text = ""
+            current_chunk_length = 0
+            current_chunk_id += 1
 
         # Save the modified HTML file
         with open(f"{staticPath}/html/{filename}.html", "w", encoding="utf-8") as f:
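The rewritten loader is easiest to sanity-check against the failure named in the commit title. Below is a minimal sketch of the crash and the guard; the `html` string is invented for illustration, and only `BeautifulSoup` behavior is assumed:

```python
# Hypothetical heading-less input: the kind of document that used to crash.
from bs4 import BeautifulSoup

html = "<p>no headings at all</p>"
soup = BeautifulSoup(html, "html.parser")
headingDoms = soup.find_all(["h1", "h2", "h3"])  # -> [] for this input

# headingDoms[-1]  # old code path: IndexError: list index out of range
if len(headingDoms) == 0:
    headingDoms = [soup.find()]  # the fix: fall back to the first element

print(headingDoms)  # [<p>no headings at all</p>]
```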
@@ -1,5 +1,7 @@
 # Test Markdown
 
+The Test File chunk size is 10
+
 ## list
 
 - first
@@ -25,4 +27,4 @@
 
 ## Other
 
-**strong**
+long text**strong**long textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong text
28 changes: 28 additions & 0 deletions server/static/documents/no-match-heading-test.md
@@ -0,0 +1,28 @@
+The Test File chunk size is 10
+
+#### list
+
+- first
+- second
+- third
+
+#### Table
+
+| Syntax    | Description |
+| --------- | ----------- |
+| Header    | Title       |
+| Paragraph | Text        |
+
+#### CodeBlock
+
+```
+{
+  "firstName": "John",
+  "lastName": "Smith",
+  "age": 25
+}
+```
+
+#### Other
+
+long text**strong**long textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong text
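
Both fixtures attack the regression from different angles: the updated test document pads a paragraph far past its 10-token chunk size, and no-match-heading-test.md uses only `h4` headings so none of the tracked `h1`-`h3` levels match, the exact shape of document that used to crash. A rough sketch of the token windowing that `split_text_to_doc` applies to the oversized paragraph (the 10-token size comes from the fixture text; the rest is assumed for illustration):

```python
import tiktoken

enc = tiktoken.get_encoding("p50k_base")  # the encoding custom_loader.py uses
text = "long text" * 20
tokens = enc.encode(text)

# Decode fixed-size 10-token windows; each window becomes its own Document.
chunks = [enc.decode(tokens[i : i + 10]).strip() for i in range(0, len(tokens), 10)]
print(len(chunks), chunks[0])
```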