Commit

LIST INDEX OUT OF RANGE
Fixes #8
3Alan committed May 3, 2023
1 parent 917492e commit 4c69c6a
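
Judging from the diff, the crash came from the old `custom_loader.py` indexing `headings[-1]` on documents that contain no `h1`-`h3` headings; the rewritten loader falls back to the document's first element and walks sibling nodes chunk by chunk instead.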
Showing 18 changed files with 1,947 additions and 1,910 deletions.
3 changes: 2 additions & 1 deletion client/package.json
@@ -25,7 +25,8 @@
     "query-string": "^8.1.0",
     "react": "^18.2.0",
     "react-dom": "^18.2.0",
-    "react-router-dom": "^6.9.0"
+    "react-router-dom": "^6.9.0",
+    "sass": "^1.62.1"
   },
   "devDependencies": {
     "@types/canvas-confetti": "^1.6.0",
2 changes: 1 addition & 1 deletion client/src/App.tsx
@@ -1,4 +1,4 @@
-import './styles/globals.css';
+import './styles/globals.scss';
 import 'github-markdown-css/github-markdown-light.css';
 import { BrowserRouter, Route, Routes } from 'react-router-dom';
 import routes from './routes';
23 changes: 20 additions & 3 deletions client/src/styles/globals.css → client/src/styles/globals.scss
@@ -30,9 +30,26 @@ body {
   overflow: hidden;
 }
 
-.markdown-body .hl-source {
-  background-color: #eff6ff;
-  border-radius: 6px;
-}
+.markdown-body {
+  table {
+    &.hl-source {
+      border-radius: 0;
+
+      tr, td {
+        background-color: #eff6ff;
+        border-radius: 0;
+      }
+    }
+  }
+
+  .hl-source {
+    background-color: #eff6ff;
+    border-radius: 6px;
+
+    pre {
+      background-color: #eff6ff !important;
+    }
+  }
+}
 
 .markdown-body {
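
Two things are worth noting in the new stylesheet. Nested SCSS with `&` flattens at compile time, so `table { &.hl-source { … } }` becomes the plain selector `table.hl-source`: highlighted tables drop their rounded corners while their `tr`/`td` cells pick up the highlight color. The `!important` on the `pre` background is presumably there to override the `pre` background that github-markdown-css applies inside `.markdown-body`.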
18 changes: 16 additions & 2 deletions client/yarn.lock
@@ -1023,7 +1023,7 @@ chalk@^4.0.0:
     ansi-styles "^4.1.0"
     supports-color "^7.1.0"
 
-chokidar@^3.5.3:
+"chokidar@>=3.0.0 <4.0.0", chokidar@^3.5.3:
   version "3.5.3"
   resolved "https://registry.npmjs.org/chokidar/-/chokidar-3.5.3.tgz#1cf37c8707b932bd1af1ae22c0432e2acd1903bd"
   integrity sha512-Dr3sfKRP6oTcjf2JmUmFJfeVMvXBdegxB0iVQ5eb2V10uFJUCAS8OByZdVAyVb8xXNz3GjjTgj9kLWsZTqE6kw==
@@ -1841,6 +1841,11 @@ ignore@^5.2.0:
   resolved "https://registry.npmjs.org/ignore/-/ignore-5.2.4.tgz#a291c0c6178ff1b960befe47fcdec301674a6324"
   integrity sha512-MAb38BcSbH0eHNBxn7ql2NH/kX33OkB3lZ1BNdh7ENeRChHTYsTvWrMubiIAMNS2llXEEgZ1MUOBtXChP3kaFQ==
 
+immutable@^4.0.0:
+  version "4.3.0"
+  resolved "https://registry.npmjs.org/immutable/-/immutable-4.3.0.tgz#eb1738f14ffb39fd068b1dbe1296117484dd34be"
+  integrity sha512-0AOCmOip+xgJwEVTQj1EfiDDOkPmuyllDuTuEX+DDXUgapLAsBIfkg3sxCYyCEA8mQqZrrxPUGjcOQ2JS3WLkg==
+
 import-fresh@^3.0.0, import-fresh@^3.2.1:
   version "3.3.0"
   resolved "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.0.tgz#37162c25fcb9ebaa2e6e53d5b4d88ce17d9e0c2b"
@@ -3032,6 +3037,15 @@ safe-regex-test@^1.0.0:
     get-intrinsic "^1.1.3"
     is-regex "^1.1.4"
 
+sass@^1.62.1:
+  version "1.62.1"
+  resolved "https://registry.npmjs.org/sass/-/sass-1.62.1.tgz#caa8d6bf098935bc92fc73fa169fb3790cacd029"
+  integrity sha512-NHpxIzN29MXvWiuswfc1W3I0N8SXBd8UR26WntmDlRYf0bSADnwnOjsyMZ3lMezSlArD33Vs3YFhp7dWvL770A==
+  dependencies:
+    chokidar ">=3.0.0 <4.0.0"
+    immutable "^4.0.0"
+    source-map-js ">=0.6.2 <2.0.0"
+
 scheduler@^0.23.0:
   version "0.23.0"
   resolved "https://registry.npmjs.org/scheduler/-/scheduler-0.23.0.tgz#ba8041afc3d30eb206a487b6b384002e4e61fdfe"
@@ -3089,7 +3103,7 @@ slash@^4.0.0:
   resolved "https://registry.npmjs.org/slash/-/slash-4.0.0.tgz#2422372176c4c6c5addb5e2ada885af984b396a7"
   integrity sha512-3dOsAHXXUkQTpOYcoAxLIorMTp4gIQr5IW3iVb7A7lFIp0VHhnynm9izx6TssdrIcVIESAlVjtnO2K8bg+Coew==
 
-source-map-js@^1.0.2:
+"source-map-js@>=0.6.2 <2.0.0", source-map-js@^1.0.2:
   version "1.0.2"
   resolved "https://registry.npmjs.org/source-map-js/-/source-map-js-1.0.2.tgz#adbc361d9c62df380125e7f161f71c826f1e490c"
   integrity sha512-R0XvVJ9WusLiqTCEiGCmICCMplcCkIwwR11mOSD9CR5u+IXYdiseeEuXCVAjS54zqwkLcPNnmU4OeJ6tUrWhDw==
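
The quoted range keys (`"chokidar@>=3.0.0 <4.0.0"`, `"source-map-js@>=0.6.2 <2.0.0"`) are the ranges `sass` declares; yarn merged them into the existing resolutions instead of installing duplicates, so `immutable` and `sass` itself are the only genuinely new packages in the lockfile.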
2 changes: 1 addition & 1 deletion server/app.py
@@ -146,7 +146,7 @@ def query_index():
 
     res = index.query(query_text, streaming=True)
     cost = embed_model.last_token_usage + llm_predictor.last_token_usage
-    sources = [{"extraInfo": x.extra_info} for x in res.source_nodes]
+    sources = [{"extraInfo": x.node.extra_info} for x in res.source_nodes]
 
     def response_generator():
         yield json.dumps({"cost": cost, "sources": sources})
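
In the llama_index version pinned here, the entries of `res.source_nodes` appear to be wrapper objects that carry the underlying node alongside scoring data, so `extra_info` has to be read from `x.node` rather than from the wrapper itself.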
194 changes: 66 additions & 128 deletions server/custom_loader.py
@@ -2,7 +2,6 @@
 
 import tiktoken
 from bs4 import BeautifulSoup
-from bs4.element import NavigableString
 from llama_index.readers.base import BaseReader
 from llama_index.readers.schema.base import Document
 
@@ -26,149 +25,88 @@ def num_tokens_from_string(string: str, encoding_name: str = "p50k_base") -> int
     return num_tokens
 
 
+def split_text_to_doc(
+    text: str, current_chunk_id, chunk_size: int = 400
+) -> List[Document]:
+    """Split text into chunks of a given size."""
+    chunks = []
+    token_len = num_tokens_from_string(text)
+
+    for i in range(0, token_len, chunk_size):
+        encode_text = encode_string(text)
+        decode_text = decode_string(encode_text[i : i + chunk_size]).strip()
+        chunks.append(
+            Document(
+                decode_text,
+                extra_info={"chunk_id": f"chunk-{current_chunk_id}"},
+            )
+        )
+
+    return chunks
+
+
 class CustomReader(BaseReader):
     def __init__(self, *args: Any, **kwargs: Any) -> None:
         """Init params."""
         super().__init__(*args, **kwargs)
 
     def load_data(self, html, filename) -> List[Document]:
         # Parse the HTML
         soup = BeautifulSoup(html, "html.parser")
-
-        # Find all heading tags
-        headings = soup.find_all(["h1", "h2", "h3"])
-
-        # Length limit per chunk (in tokens)
-        chunk_size = 400
-        document_list = []
-        index = 1
-
-        for i in range(len(headings) - 1):
-            start = headings[i]
-            end = headings[i + 1]
-            start["data-chunk_id"] = f"chunk-{index}"
-            content = start.next_elements
-            chunk_text = ""
-
-            for elem in content:
-                trim_text = elem.get_text().strip()
-                if not trim_text:
-                    continue
-                # Text node
-                if isinstance(elem, NavigableString):
-                    token_len = num_tokens_from_string(trim_text)
-                    if (
-                        num_tokens_from_string(chunk_text, "p50k_base") + token_len + 1
-                        < chunk_size
-                    ):
-                        chunk_text = f"{chunk_text} {trim_text}"
-                    elif token_len > chunk_size:
-                        # A single node already exceeds chunk_size: flush the current chunk_text and start a new chunk
-                        index = index + 1
-                        document_list.append(
-                            Document(
-                                chunk_text.strip(),
-                                extra_info={"chunk_id": f"chunk-{index}"},
-                            )
-                        )
-                        for i in range(0, token_len, chunk_size):
-                            encode_text = encode_string(trim_text)
-                            decode_text = decode_string(
-                                encode_text[i : i + chunk_size]
-                            ).strip()
-                            document_list.append(
-                                Document(
-                                    decode_text,
-                                    extra_info={"chunk_id": f"chunk-{index}"},
-                                )
-                            )
-                    else:
-                        # The chunk has grown past chunk_size, so start a new chunk
-                        index = index + 1
-                        document_list.append(
-                            Document(
-                                chunk_text.strip(),
-                                extra_info={"chunk_id": f"chunk-{index}"},
-                            )
-                        )
-                        chunk_text = ""
-                # Non-text node
-                else:
-                    elem["data-chunk_id"] = f"chunk-{index}"
-                if elem == end:
-                    document_list.append(
-                        Document(
-                            chunk_text.strip(),
-                            extra_info={"chunk_id": f"chunk-{index}"},
-                        )
-                    )
-                    chunk_text = ""
-                    index = index + 1
-                    elem["data-chunk_id"] = f"chunk-{index}"
-                    break
-
-        # TODO:
-        # if len(headings) == 0:
-        #     UnstructuredReader = download_loader("UnstructuredReader")
-        #     loader = UnstructuredReader()
-        #     documents = loader.load_data(f"{staticPath}/html/{filename}.html")
-        #     return documents
-
-        start = headings[-1]
-        start["data-chunk_id"] = f"chunk-{index}"
-        content = start.next_elements
-        chunk_text = ""
-
-        for elem in content:
-            trim_text = elem.get_text().strip()
-            if not trim_text:
-                continue
-            # Text node
-            if isinstance(elem, NavigableString):
-                token_len = num_tokens_from_string(trim_text)
-                if (
-                    num_tokens_from_string(chunk_text, "p50k_base") + token_len + 1
-                    < chunk_size
-                ):
-                    chunk_text = f"{chunk_text} {trim_text}"
-                elif token_len > chunk_size:
-                    # A single node already exceeds chunk_size: flush the current chunk_text and start a new chunk
-                    index = index + 1
-                    document_list.append(
-                        Document(
-                            chunk_text.strip(),
-                            extra_info={"chunk_id": f"chunk-{index}"},
-                        )
-                    )
-                    chunk_text = ""
-
-                    for i in range(0, token_len, chunk_size):
-                        encode_text = encode_string(trim_text)
-                        decode_text = decode_string(
-                            encode_text[i : i + chunk_size]
-                        ).strip()
-                        document_list.append(
-                            Document(
-                                decode_text, extra_info={"chunk_id": f"chunk-{index}"}
-                            )
-                        )
-                else:
-                    # The chunk has grown past chunk_size, so start a new chunk
-                    index = index + 1
-                    document_list.append(
-                        Document(
-                            chunk_text.strip(),
-                            extra_info={"chunk_id": f"chunk-{index}"},
-                        )
-                    )
-                    chunk_text = ""
-            # Non-text node
-            else:
-                elem["data-chunk_id"] = f"chunk-{index}"
-
-        document_list.append(
-            Document(chunk_text.strip(), extra_info={"chunk_id": f"chunk-{index}"})
-        )
+        current_chunk_text = ""
+        current_chunk_id = 1
+        document_list = []
+        # Measured in tokens; OpenAI caps requests at 4097, which leaves room for roughly 6 turns of continuous conversation
+        current_chunk_length = 0
+        chunk_size = 400
+
+        # Only the first three heading levels are treated as headings; everything else is handled as paragraph content
+        headings = ["h1", "h2", "h3"]
+        headingDoms = soup.find_all(headings)
+
+        if len(headingDoms) == 0:
+            headingDoms = [soup.find()]
+
+        for tag in headingDoms:
+            tag["data-chunk_id"] = f"chunk-{current_chunk_id}"
+            current_chunk_text = tag.text.strip()
+
+            # Walk sibling nodes only; do not recurse into children
+            next_tag = tag.find_next_sibling()
+            while next_tag and next_tag.name not in headings:
+                stripped_text = next_tag.text.strip()
+
+                if (
+                    current_chunk_length + num_tokens_from_string(stripped_text)
+                    > chunk_size
+                ):
+                    document_list.append(
+                        Document(
+                            current_chunk_text.strip(),
+                            extra_info={"chunk_id": f"chunk-{current_chunk_id}"},
+                        )
+                    )
+                    current_chunk_text = ""
+                    current_chunk_length = 0
+                    current_chunk_id += 1
+
+                    document_list += split_text_to_doc(stripped_text, current_chunk_id)
+
+                else:
+                    current_chunk_text = f"{current_chunk_text} {stripped_text}"
+                    current_chunk_length += num_tokens_from_string(stripped_text) + 1
+
+                next_tag["data-chunk_id"] = f"chunk-{current_chunk_id}"
+                next_tag = next_tag.find_next_sibling()
+
+            document_list.append(
+                Document(
+                    current_chunk_text.strip(),
+                    extra_info={"chunk_id": f"chunk-{current_chunk_id}"},
+                )
+            )
+            current_chunk_text = ""
+            current_chunk_length = 0
+            current_chunk_id += 1
 
         # Save the modified HTML file
         with open(f"{staticPath}/html/{filename}.html", "w", encoding="utf-8") as f:
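The rewritten loader is easiest to sanity-check against the failure named in the commit title. Below is a minimal sketch of the crash and the guard; the `html` string is invented for illustration, and only `BeautifulSoup` behavior is assumed:

```python
# Hypothetical heading-less input: the kind of document that used to crash.
from bs4 import BeautifulSoup

html = "<p>no headings at all</p>"
soup = BeautifulSoup(html, "html.parser")
headingDoms = soup.find_all(["h1", "h2", "h3"])  # -> [] for this input

# headingDoms[-1]  # old code path: IndexError: list index out of range
if len(headingDoms) == 0:
    headingDoms = [soup.find()]  # the fix: fall back to the first element

print(headingDoms)  # [<p>no headings at all</p>]
```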
@@ -1,5 +1,7 @@
 # Test Markdown
 
+The Test File chunk size is 10
+
 ## list
 
 - first
@@ -25,4 +27,4 @@
 
 ## Other
 
-**strong**
+long text**strong**long textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong text
28 changes: 28 additions & 0 deletions server/static/documents/no-match-heading-test.md
@@ -0,0 +1,28 @@
+The Test File chunk size is 10
+
+#### list
+
+- first
+- second
+- third
+
+#### Table
+
+| Syntax    | Description |
+| --------- | ----------- |
+| Header    | Title       |
+| Paragraph | Text        |
+
+#### CodeBlock
+
+```
+{
+  "firstName": "John",
+  "lastName": "Smith",
+  "age": 25
+}
+```
+
+#### Other
+
+long text**strong**long textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong textlong text
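
Both fixtures attack the regression from different angles: the updated test document pads a paragraph far past its 10-token chunk size, and no-match-heading-test.md uses only `h4` headings so none of the tracked `h1`-`h3` levels match, the exact shape of document that used to crash. A rough sketch of the token windowing that `split_text_to_doc` applies to the oversized paragraph (the 10-token size comes from the fixture text; the rest is assumed for illustration):

```python
import tiktoken

enc = tiktoken.get_encoding("p50k_base")  # the encoding custom_loader.py uses
text = "long text" * 20
tokens = enc.encode(text)

# Decode fixed-size 10-token windows; each window becomes its own Document.
chunks = [enc.decode(tokens[i : i + 10]).strip() for i in range(0, len(tokens), 10)]
print(len(chunks), chunks[0])
```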