In [1]:
import os

import torch
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline

In [2]:
# Location of the Chinese-Alpaca-2 7B checkpoint on disk.
# The CHINESE_ALPACA_MODEL_PATH environment variable overrides the location so
# the notebook can run on machines where the share is mounted elsewhere; when
# the variable is unset the original path is used unchanged.
model_path = os.environ.get(
    "CHINESE_ALPACA_MODEL_PATH",
    "/mnt/nfs/zsd_server/models/huggingface/chinese-alpaca-2-7b/",
)

In [3]:
# Load the tokenizer files from the checkpoint directory. The bare name on the
# last line makes the notebook display the tokenizer's configuration.
tokenizer = LlamaTokenizer.from_pretrained(pretrained_model_name_or_path=model_path)
tokenizer

LlamaTokenizer(name_or_path='/mnt/nfs/zsd_server/models/huggingface/chinese-alpaca-2-7b/', vocab_size=55296, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False)

In [4]:
# Load the causal-LM weights from the same checkpoint directory.
# device_map="auto" leaves device placement to the library rather than
# pinning a device here.
# Optional: add load_in_8bit=True to the call to load 8-bit weights
# (left disabled).
# NOTE(review): no torch_dtype is passed, so the library's default precision
# is used — confirm that is intended for a model of this size.
base_model = LlamaForCausalLM.from_pretrained(model_path, device_map="auto")
base_model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(55296, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNo

In [10]:
# Text-generation pipeline around the already-loaded model and tokenizer.
#
# max_new_tokens limits only the generated continuation. The previous
# max_length=256 also counted the prompt tokens, so the room left for the
# answer shrank as the templated prompt grew and a long instruction could
# exceed the limit outright.
#
# Decoding is stochastic (do_sample=True); temperature and top_p shape the
# sampling distribution and repetition_penalty discourages repeated text.
pipe = pipeline(
    "text-generation",
    model=base_model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    temperature=0.6,
    top_p=0.95,
    repetition_penalty=1.2,
    do_sample=True,
)
pipe

<transformers.pipelines.text_generation.TextGenerationPipeline at 0x7b67305b7850>

In [11]:
# Wrap the transformers pipeline in LangChain's LLM adapter so it can be used
# as the `llm` of a chain.
local_llm = HuggingFacePipeline(
    pipeline=pipe,
)
local_llm

HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7b67305b7850>)

In [12]:
# Prompt template with a single placeholder, {instruction}, which the chain
# fills with the user's question. PromptTemplate is already imported in the
# notebook's first cell, so it is not re-imported here.
# NOTE(review): the header follows the Alpaca layout but ends with "Answer:"
# rather than "### Response:", and the Chinese-Alpaca-2 project documents a
# Llama-2-chat style "[INST] ... [/INST]" prompt — confirm which format this
# checkpoint expects before relying on answer quality.
template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

Answer:"""

prompt = PromptTemplate(template=template, input_variables=["instruction"])
prompt

PromptTemplate(input_variables=['instruction'], template='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\nAnswer:')

In [13]:
# Combine the prompt template and the local LLM into one chain: the input is
# formatted with `prompt` and the resulting text is sent to `local_llm`.
llm_chain = LLMChain(llm=local_llm, prompt=prompt)
llm_chain

LLMChain(prompt=PromptTemplate(input_variables=['instruction'], template='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\nAnswer:'), llm=HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7b67305b7850>))

In [17]:
# The pipeline samples tokens (do_sample=True), so seed torch's RNG right
# before generating; otherwise every run of this cell gives a different answer
# and the recorded output cannot be reproduced.
torch.manual_seed(42)

question = "Alpacas是什么？它和llamas有什么区别？"

# print() is used so the newlines in the generated text are rendered.
print(llm_chain.run(question))

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Alpacas是什么？它和llamas有什么区别？

Answer: 阿尔帕卡是一种来自南美洲的动物。与山羊相比，它们有更长的毛发、更大而较圆的眼睛以及较小的耳朵。
