# 🚀 Naive RAG with LlamaIndex

首先下载 embedding 模型和语言模型。

```shell
# Point Hugging Face downloads at the hf-mirror.com endpoint.
export HF_ENDPOINT=https://hf-mirror.com  # Mac, Linux
# $env:HF_ENDPOINT = "https://hf-mirror.com" # Windows
# Fetch the embedding model into /data/hf, using /tmp/cache as the download cache.
huggingface-cli download --resume-download shibing624/text2vec-base-chinese --local-dir /data/hf/text2vec-base-chinese --cache-dir /tmp/cache --local-dir-use-symlinks False
# Fetch the Llama 3 8B Instruct checkpoint into /data/hf the same way.
huggingface-cli download --resume-download NousResearch/Meta-Llama-3-8B-Instruct --local-dir /data/hf/Meta-Llama-3-8B-Instruct --cache-dir /tmp/cache --local-dir-use-symlinks False
# Remove the temporary download cache once both downloads are done.
rm -rf /tmp/cache
```

安装 LlamaIndex 相关组件。

```shell
# Core LlamaIndex package plus the integrations imported by the notebook.
pip install llama-index
pip install llama-index-vector-stores-chroma  # Chroma vector store integration
pip install llama-index-embeddings-huggingface # loads the embedding model
pip install llama-index-llms-vllm # loads the language model
```

In [None]:
import base64
import os
from langsmith.run_helpers import traceable
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document
from llama_index.core.vector_stores import MetadataFilter, MetadataFilters
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.text_splitter import SentenceSplitter, TokenTextSplitter
from llama_index.llms.vllm import Vllm

In [None]:
# Build the LlamaIndex vLLM wrapper around the locally downloaded
# Meta-Llama-3-8B-Instruct checkpoint. All settings are passed by keyword,
# so their order is irrelevant to the constructor.
llm = Vllm(
    dtype="bfloat16",
    max_new_tokens=300,
    tensor_parallel_size=2,
    model="/data/hf/Meta-Llama-3-8B-Instruct",
)

In [None]:
from vllm import LLM, SamplingParams

# Instantiate vLLM's own engine class directly on the same local checkpoint.
# NOTE: this rebinds `llm`, replacing the LlamaIndex `Vllm` wrapper created
# in the previous cell.
llm = LLM(
    tensor_parallel_size=2,
    trust_remote_code=True,
    model="/data/hf/Meta-Llama-3-8B-Instruct",
)

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and the causal-LM weights from the same local checkpoint.
# The original cell bound the *model* to the name `tokenizer` and never used
# AutoTokenizer; each object now lives under an accurate name.
tokenizer = AutoTokenizer.from_pretrained("/data/hf/Meta-Llama-3-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained("/data/hf/Meta-Llama-3-8B-Instruct")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.90it/s]


In [1]:
import torch
import transformers

# Local directory holding the Meta-Llama-3-8B-Instruct checkpoint.
model_id = "/data/hf/Meta-Llama-3-8B-Instruct/"

# Text-generation pipeline on CUDA with the weights requested in bfloat16.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs=dict(torch_dtype=torch.bfloat16),
    device="cuda",
)

# A two-turn conversation: a system instruction followed by the user's question.
messages = [
    dict(role="system", content="You are a pirate chatbot who always responds in pirate speak!"),
    dict(role="user", content="Who are you?"),
]

# Render the conversation to a plain prompt string (tokenize=False) with the
# generation prompt appended.
prompt = pipeline.tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Token ids passed as `eos_token_id` below: the tokenizer's EOS id and the id
# of the "<|eot_id|>" token.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

# Sampling configuration for the generation call.
outputs = pipeline(
    prompt,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)

# Slice off the first len(prompt) characters so only the text after the
# prompt is printed.
generated_text = outputs[0]["generated_text"]
print(generated_text[len(prompt):])

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.92it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Arrrr, me hearty! Me name be Captain Chatbot, the scurviest scallywag to ever sail the Seven Seas! I be a pirate chatbot, here to swab the decks of yer conversations and keep ye entertained with me witty banter and sea-faring savvy! So hoist the colors, me hearty, and let's set sail fer a swashbucklin' good time!
