In [11]:
!pip install llama-index  langchain

[0m

# 先搜索，后提示

In [12]:
import os
from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)

PERSIST_DIR = './data/mr_storage'
# 'docstore.json' is the library's default docstore file name; its presence is
# used as the marker that a complete, reloadable index was persisted earlier.
DOCSTORE_PATH = os.path.join(PERSIST_DIR, 'docstore.json')

if os.path.exists(DOCSTORE_PATH):
    # Reuse the persisted index so a re-run does not rebuild (and re-embed)
    # the documents.
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)
else:
    documents = SimpleDirectoryReader('./data/mr_fujino').load_data()
    index = VectorStoreIndex.from_documents(documents)
    # Persist with the default file names. Passing a second positional
    # argument would rename the docstore file, and
    # StorageContext.from_defaults(persist_dir=...) would then fail to find it.
    index.storage_context.persist(persist_dir=PERSIST_DIR)

In [13]:
# Default query engine over the index; later cells reuse `query_engine`.
query_engine = index.as_query_engine()

# Question: "Who was Lu Xun's teacher when he studied medicine in Japan?"
response = query_engine.query(
    "鲁迅先生在日本学习医学的老师是谁？"
)
print(response)

藤野先生


In [14]:
# Question: "Where did Lu Xun go to study medicine?"
response = query_engine.query(
    "鲁迅先生去哪里学的医学？"
)
print(response)

鲁迅先生去仙台学习医学。


In [15]:
from llama_index.core import PromptTemplate

# Question to ask ("Where did Lu Xun go to study medicine?").
query_str = "鲁迅先生去哪里学的医学？"

# Custom question-answering prompt with two placeholders, {context_str} and
# {query_str}. It instructs the model to answer from the supplied context
# rather than from prior knowledge.
template = "".join([
    "Context information is below. \n",
    "---------------------\n",
    "{context_str}",
    "\n---------------------\n",
    "Given the context information and not prior knowledge, ",
    "answer the question: {query_str}\n",
])
qa_template = PromptTemplate(template)

# Build a query engine that uses the custom prompt, then run the question.
custom_engine = index.as_query_engine(text_qa_template=qa_template)
response = custom_engine.query(query_str)
print(response)

鲁迅先生去仙台学的医学。


In [16]:

# Chinese-language QA prompt. It states that "我" (I) in the passage refers to
# Lu Xun, and asks the model to reply "不知道" (don't know) when it does not
# know the answer.
template = "".join([
    "下面的“我”指的是鲁迅先生 \n",
    "-----------------------\n",
    "{context_str}",
    "\n---------------------\n",
    "根据这些信息，请回答问题：{query_str}\n",
    "如果您不知道的话，请回答不知道\n",
])
qa_template = PromptTemplate(template)

# Ask about the relationship between Lin Daiyu and Jia Baoyu; the recorded
# run answered "不知道".
fallback_engine = index.as_query_engine(text_qa_template=qa_template)
response = fallback_engine.query("请问林黛玉和贾宝玉是什么关系？")
print(response)


不知道


# 通过 llama_index 对文章进行小结

In [17]:
!pip install -U pip setuptools wheel
!pip install -U 'spacy[cuda12x]'
!python3 -m spacy download zh_core_web_sm
!pip install llama-index-llms-langchain

[0mCollecting zh-core-web-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.7.0/zh_core_web_sm-3.7.0-py3-none-any.whl (48.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('zh_core_web_sm')
[0m

In [18]:
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import SpacyTextSplitter
from llama_index.core import GPTListIndex, ServiceContext
from llama_index.core.node_parser import SimpleNodeParser

# define LLM
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", max_tokens=1024)

# Use the spaCy Chinese pipeline to cut the text into pieces; the node parser
# delegates its chunking tokenization to the splitter's `split_text`.
text_splitter = SpacyTextSplitter(pipeline="zh_core_web_sm", chunk_size = 2048)
parser = SimpleNodeParser(chunking_tokenizer_fn=text_splitter.split_text)

documents = SimpleDirectoryReader('./data/mr_fujino').load_data()
nodes = parser.get_nodes_from_documents(documents)

# ServiceContext is deprecated, so the LLM is handed directly to the query
# engine instead of going through ServiceContext.from_defaults(llm=llm).
# Passing it here keeps the choice of model local to this summarization
# engine rather than changing a global default.
index = GPTListIndex(nodes=nodes)
query_engine = index.as_query_engine(response_mode="tree_summarize", llm=llm)

# Prompt: "The following was written by Lu Xun in the first person ('我');
# please summarize it in Chinese."
response = query_engine.query("下面鲁迅先生以第一人称‘我’写的内容，请你用中文总结一下:")
print(response)

  service_context = ServiceContext.from_defaults(llm=llm)


内容描述了一个叫藤野先生的日本解剖学教授对一位中国学生的教育经历。学生在藤野先生的指导下学习解剖学，经历了学习、实习和考试等过程。藤野先生对学生的教育充满热心和希望，希望学生能够成为医学界的新秀。学生对藤野先生的教诲和帮助充满感激，认为他是一个伟大的人，尽管他的名字并不为众人所知。


# 引入多模态，让 llama-index 能够识别小票

In [1]:
!pip install matplotlib
!pip install torch transformers sentencepiece Pillow
!pip install protobuf

[0m

In [2]:
from llama_index.core import SimpleDirectoryReader, GPTVectorStoreIndex
from llama_index.readers.file import ImageReader
from llama_index.core.response.notebook_utils import display_response, display_image
from llama_index.core.indices.query.query_transform.base import ImageOutputQueryTransform
from llama_index.core.query_engine import TransformQueryEngine

# A single ImageReader instance is shared by every image suffix below.
image_parser = ImageReader(keep_image=True, parse_text=True)

# Start from the mapping returned by SimpleDirectoryReader.supported_suffix_fn()
# (presumably the default suffix -> reader table; confirm against the library)
# and route the image extensions to the shared ImageReader.
file_extractor = SimpleDirectoryReader.supported_suffix_fn()
file_extractor.update(dict.fromkeys((".jpg", ".jpeg", ".png"), image_parser))


def filename_fn(filename):
    """Return the metadata dict for a file: its name under the 'file_name' key."""
    return {'file_name': filename}


# Read every file under ./data/receipts, attaching the file name as metadata.
receipt_reader = SimpleDirectoryReader(
    input_dir='./data/receipts',
    file_extractor=file_extractor,
    file_metadata=filename_fn,
)
receipt_documents = receipt_reader.load_data()

# Index the loaded receipt documents for similarity search.
receipts_index = GPTVectorStoreIndex.from_documents(receipt_documents)

In [3]:
# Query engine over the receipts index that keeps only the single best match,
# wrapped so each query first passes through ImageOutputQueryTransform.
top_match_engine = receipts_index.as_query_engine(similarity_top_k=1)
image_output_transform = ImageOutputQueryTransform(width=400)
query_engine = TransformQueryEngine(
    query_engine=top_match_engine,
    query_transform=image_output_transform,
)

# The run of spaces after "spend." is kept on purpose: it reproduces the
# exact query text that was sent originally.
receipt_question = (
    "When was the last time I went to McDonald's and how much did I spend. "
    "    Also show me the receipt from my visit."
)
receipts_response = query_engine.query(receipt_question)
display_response(receipts_response)

**`Final Response:`** The last time you went to McDonald's was on March 10, 2018, at 07:39:12 PM. You spent a total of $26.15 during that visit.

Here is the receipt from your visit:
<img src="data/receipt.jpg" width="400" />

In [10]:
# Run the image reader directly on one receipt and print the text of the
# first document it returns, to inspect what was parsed from the image.
output_image = image_parser.load_data('./data/receipts/1100-receipt.jpg')
first_document = output_image[0]
print(first_document.text)

<s_menu><s_nm> Story</s_nm><s_num> 16725 Stony Platin Rd</s_nm><s_num> Store#:</s_nm><s_num> 3659</s_num><s_price> 700-418-8362</s_price><sep/><s_nm> Welcome to all day breakfast dormist O Md Donald's</s_nm><s_num> 192</s_num><s_price> 192</s_price><sep/><s_nm> QTY ITEM</s_nm><s_num> OTAL</s_num><s_unitprice> 03/10/2018</s_unitprice><s_cnt> 1</s_cnt><s_price> 07:39:12 PM</s_price><sep/><s_nm> Delivery</s_nm><s_cnt> 1</s_cnt><s_price> 0.00</s_price><sep/><s_nm> 10 McNuggets EVM</s_nm><s_cnt> 1</s_cnt><s_price> 10.29</s_price><sep/><s_nm> Barbeque Sauce</s_nm><s_cnt> 1</s_cnt><s_price> 1</s_price><sep/><s_nm> Barbeque Sauce</s_nm><s_num> 1</s_cnt><s_price> 0.40</s_price><sep/><s_nm> L Coke</s_nm><s_cnt> 1</s_cnt><s_price> 0.40</s_price><sep/><s_nm> M French Fries</s_nm><s_cnt> 1</s_cnt><s_price> 3.99</s_price><sep/><s_nm> HM GrChS S-Fry Yog</s_nm><s_cnt> 1</s_cnt><sep/><s_nm> Smoonya</s_nm><s_cnt> 1</s_cnt><sep/><s_nm> HM Apple Juice</s_nm><s_cnt> 1</s_cnt><s_price> 2.89</s_price><sep/><

In [11]:
!pip list

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Package                                 Version
--------------------------------------- --------------
aiofiles                                22.1.0
aiohttp                                 3.9.5
aiosignal                               1.3.1
aiosqlite                               0.20.0
altair                                  5.3.0
annotated-types                         0.6.0
anyio                                   4.3.0
argon2-cffi                             23.1.0
argon2-cffi-bindings                    21.2.0
arrow                                   1.3.0
astroid                                 3.1.0
asttokens                               2.4.1
async-timeout                           4.0.3
attrs                                   23.2.0
autopep8                                2.0.4
Babel                                   2.14.0
backoff                                 2.2.1
beautifulsoup4                          4.12.3
bleach                                  6.1.0
blis            