In [None]:
#llm = LLM(model="EleutherAI/gpt-j-6b")
%load_ext autoreload
%autoreload 2

In [34]:
from vllm import LLM, SamplingParams

prompts = [
    "Make a sentence using the verb 'retire'.  Sentence: ",
    "Make a sentence using the verb 'retire'.  Sentence: ",
    "Make a sentence using the verb 'retire'.  Sentence: ",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens = 40)

In [None]:
outputs = llm.generate(prompts, sampling_params)

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

### Try out Llama-8b

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
PAD_TOKEN = "<|pad|>"
tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})
tokenizer.padding_side = "right"
# takes 2-3 minutes
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    #     attn_implementation="flash_attention_2",
    #     attn_implementation="sdpa",
    device_map="auto",
)
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)

In [8]:
from transformers import pipeline

pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    return_full_text=False,
)

In [13]:
# template
def create_test_prompt(word: str):
    prompt_format = "make three sentences in Japanese using the verb {}."
    prompt = prompt_format.format(word)
    messages = [
        {"role": "user", "content": prompt},
    ]
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

In [None]:
# let's run some inference
# 1. generation task

# system warmup
SYSTEM_START = "You can speak both japanese and english fluently. Use Hiragana and kanji."
USER_START = "make three sentences in Japanese using the verb '慣れる'"
warmup_prompt = tokenizer.apply_chat_template(
        [{
            "role": "system",
            "content": SYSTEM_START,
        }, {
            "role": "user",
            "content": USER_START,
        }
        ], tokenize=False, add_generation_prompt=True )
outputs = pipe(warmup_prompt)

# start
WORDS_LIST = ["𠮟る","変える","行なう", "冷める"]
prompts = [create_test_prompt(word) for word in WORDS_LIST]
for prompt in prompts:
    print(prompt)
    print("\nPrediction:")
    outputs = pipe(prompt)
    print(outputs[0]["generated_text"])
    print("\n\n")

That was bullshit. Let's try a model that can speak Japanese.
sadly, there's only 70b model for llama 3 in japanese.
### Try out Llama2-Japanese-7b

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = "あなたは誠実で優秀な日本人のアシスタントです。"
text = "クマが海辺に行ってアザラシと友達になり、最終的には家に帰るというプロットの短編小説を書いてください。"
# will take 4-6 minutes
model_name = "elyza/ELYZA-japanese-Llama-2-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")


In [3]:
from transformers import pipeline

pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1200,
    return_full_text=False,
)


In [None]:
from prompt.Llama2JP_prompter import Llama2JPPrompter

WORDS_LIST = ["起きる","変える","行なう", "冷める", "謝る", "抱く", "決める", "高める"]

prompter = Llama2JPPrompter(tokenizer)
prompts = [prompter.prompt_generate_sentences_fewshot(word) for word in WORDS_LIST]

for prompt in prompts:
    print(prompt)
    print("\n\n\n")

In [None]:
output = pipe(prompts, batch_size = len(WORDS_LIST))

In [None]:
for prompt in prompts:
    print(prompt)
    print("\nPrediction:")
    outputs = pipe(prompt)
    print(outputs[0]["generated_text"])
    print("\n\n")

In [None]:

DEFAULT_SYSTEM_PROMPT = "あなたは誠実で優秀な日本人のアシスタントです。"
inst = "make three different sentences in Japanese using the word '{}'."
llama_2_template = \
            "{% for message in messages %}" \
                "{% if message['role'] == 'user' %}"  \
                    "{{ bos_token + '[INST] ' + message['content'].strip() + ' [/INST] ' }}"\
                "{% elif message['role'] == 'system' %}"\
                    "{{ '<<SYS>>\\n' + message['content'].strip() + '\\n<</SYS>>\\n\\n' }}"\
                "{% elif message['role'] == 'assistant' %}"\
                    "{{ ' '  + message['content'].strip() + ' ' + eos_token }}"\
                "{% endif %}"\
            "{% endfor %}"
tokenizer.chat_template = llama_2_template


prompts = []

for word in WORDS_LIST:
    messages = [
        {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
        {"role": "user", "content": inst.format(word)},
    ]
    prompts.append(tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    ))

for prompt in prompts:
    print(prompt)

In [None]:

system_output = pipe(warmup_prompt)
for prompt in prompts:
    print(prompt)
    print("\nPrediction:")
    outputs = pipe(prompt)
    print(outputs[0]["generated_text"])
    print("\n\n")


In [None]:
# measure inference time
import time

system_output = pipe(warmup_prompt)
duration = 0.0
NUM_WARMUP = 0
NUM_REPEATS = 1
for i in range(NUM_REPEATS+NUM_WARMUP):
    start_time = time.time()
    for prompt in prompts:
        outputs = pipe(prompt)
    finish_time = time.time()
    if i >= NUM_WARMUP:
        duration += finish_time - start_time
print(len(outputs))
print(outputs[0])
print(duration/NUM_REPEATS)

In [None]:
system_output = pipe(warmup_prompt)
duration = 0.0
for i in range(NUM_REPEATS+NUM_WARMUP):
    start_time = time.time()
    outputs = pipe(prompts, batch_size = len(prompts))
    finish_time = time.time()
    if i >= NUM_WARMUP:
        duration += finish_time - start_time
print(len(outputs))
print(outputs[0])
print(duration/NUM_REPEATS)

In [None]:
for prompt in prompts:
    token_ids = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
    with torch.no_grad():
        generation = model.generate(
            token_ids.to(model.device),
            max_new_tokens=256,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            do_sample=True,
            top_p=0.9,
            temperature=0.6,
        )

    output = tokenizer.decode(generation.tolist()[0][token_ids.size(1) :], skip_special_tokens=True)
    print(prompt)
    print("\nPrediction:")
    print(output)
    print("\n\n")

In [None]:
# let's look at the tokens more closely.
import time
prompt = prompts[4]
duration = 0.0
NUM_WARMUP = 0
NUM_REPEATS = 1
for i in range(NUM_REPEATS+NUM_WARMUP):
    start_time = time.time()
    token_ids = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
    with torch.no_grad():
        generation = model.generate(
            token_ids.to(model.device),
            max_new_tokens=1200,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            do_sample=True,
            top_p=0.9,
            temperature=0.6,
        )

    output = tokenizer.decode(generation.tolist()[0][token_ids.size(1) :], skip_special_tokens=True)
    finish_time = time.time()
    if i >= NUM_WARMUP:
        duration += finish_time - start_time

print(prompt)
print("Result:")
print(output)

print("Inference Time (s)")
print(duration/NUM_REPEATS)

### Let's Try out Llama3-8B

This is a more recent version of llama2 model

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

DEFAULT_SYSTEM_PROMPT = "あなたは誠実で優秀な日本人のアシスタントです。特に指示が無い場合は、常に日本語で回答してください。"
text = "仕事の熱意を取り戻すためのアイデアを5つ挙げてください。"

model_name = "elyza/Llama-3-ELYZA-JP-8B"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
)
model.eval()

messages = [
    {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
    {"role": "user", "content": text},
]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
token_ids = tokenizer.encode(
    prompt, add_special_tokens=False, return_tensors="pt"
)

with torch.no_grad():
    output_ids = model.generate(
        token_ids.to(model.device),
        max_new_tokens=1200,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
output = tokenizer.decode(
    output_ids.tolist()[0][token_ids.size(1):], skip_special_tokens=True
)
print(output)


  from .autonotebook import tqdm as notebook_tqdm
Downloading shards: 100%|██████████| 4/4 [01:21<00:00, 20.25s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.68it/s]
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


仕事の熱意を取り戻すためのアイデアを5つ提案します。

1. 目標の再設定: 現状の目標が曖昧や遠いものになっている可能性があります。具体的で挑戦的な目標を再設定し、達成するために必要な行動を明確にします。目標を達成するイメージを強く持つことで、仕事に対する熱意が再燃します。

2. 小さな成功体験の積み重ね: 大きな目標を達成するためには、日々の小さな成功体験が必要です。小さな目標を設定し、達成することで自信を取り戻し、仕事に対する熱意が再燃します。

3. 新しいスキルや知識の習得: 現状の仕事に新しいスキルや知識を加えることで、仕事に対する新鮮さや面白さを取り戻すことができます。新しいスキルや知識を習得するためには、学習や研修、セミナーなどに参加する必要があります。

4. 上司や同僚とのコミュニケーション改善: 上司や同僚とのコミュニケーションがうまくいっていない場合、仕事に対する熱意が低下することがあります。コミュニケーションを改善するためには、積極的に会話をし、理解し合うことが大切です。

5. 自分のやりたいことの再確認: 自分がなぜこの仕事を始めたのか、自分がやりたいことは何なのかを再確認することが大切です。仕事に対する情熱ややりがいを再確認することで、仕事に対する熱意を取り戻すことができます。


In [3]:
# running this cell will make sentence task
WORDS_LIST = ["起きる","変える","行なう", "冷める", "謝る", "抱く", "決める", "高める", "泊まる", "伺う", "捨てる", "響く", "積もる", "読む", "閉じる", "渡す"]
prompts = []
# inst = "make three different sentences in Japanese using the word '{}'."
inst = "『{}』を使って例文を3つ作ってください。"
for word in WORDS_LIST:
    messages = [
        {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
        {"role": "user", "content": inst.format(word)},
    ]
    prompts.append(tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    ))


In [5]:
# running this cell will make paragraph task prompt
inst = "{}について短い文を書いてください。"
topic = "マイクロプラスチック"
prompts = []
messages = [
    {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
    {"role": "user", "content": inst.format(topic)},
]
prompts.append(tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
))


In [None]:
# running this cell will make paragraph task prompt
inst = "#命令文\n"
"以下の#入力条件と#制約条件をもとに、4択式のクイズを作成してください。出力は#出力形式に従って行ってください。#入力条件\n"
"{}"
"#制約条件\n"
"- 問題数: 2問\n"
"- 問題1から問題2までのすべてについて、問題を作成する\n"
"- 回答選択肢は4択式とする\n"
"- 選択肢には数字の番号を振る\n"
"\n"
"#出力形式\n"
"- 問題1:\n"
"問題:\n"
"選択肢:\n"
"問題2:\n"
"問題:\n"
"選択肢:\n"

topic = "マイクロプラスチック"
prompts = []
messages = [
    {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
    {"role": "user", "content": inst.format(topic)},
]
prompts.append(tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
))


In [7]:
for prompt in prompts:
    token_ids = tokenizer.encode(
        prompt, add_special_tokens=False, return_tensors="pt"
    )

    with torch.no_grad():
        output_ids = model.generate(
            token_ids.to(model.device),
            max_new_tokens=600,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
        )
    output = tokenizer.decode(
        output_ids.tolist()[0][token_ids.size(1):], skip_special_tokens=True
    )
    print(prompt)
    print("\nPrediction:")
    print(output)
    print("\n\n")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

あなたは誠実で優秀な日本人のアシスタントです。特に指示が無い場合は、常に日本語で回答してください。<|eot_id|><|start_header_id|>user<|end_header_id|>

マイクロプラスチックについて短い文を書いてください。<|eot_id|><|start_header_id|>assistant<|end_header_id|>



Prediction:
マイクロプラスチックは、5ミリメートル以下の小さなプラスチック粒子のことです。日常生活で使われるプラスチック製品が、使用や廃棄の過程で小さく砕かれて海や川に流れ込み、環境中で分解されずに残ります。小さな魚や貝などが誤って摂取し、生態系に影響を与えることが懸念されています。





In [3]:

# measure inference time
import time

batch_prompts = prompts[:16]
duration = 0.0
NUM_WARMUP = 10
NUM_REPEATS = 20
outputs = []
tokenizer.pad_token_id = tokenizer.eos_token_id
for i in range(NUM_REPEATS+NUM_WARMUP):
    start_time = time.time()
    token_ids = tokenizer(
        batch_prompts, add_special_tokens=False, return_tensors="pt", padding=True
    ).input_ids
    with torch.no_grad():
        output_ids = model.generate(
            token_ids.to(model.device),
            max_new_tokens=600,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
        )
    batch_outputs = []
    for output in output_ids.tolist():
        batch_outputs.append(tokenizer.decode(
            output[token_ids.size(1):], skip_special_tokens=True
        ))
    finish_time = time.time()
    if i >= NUM_WARMUP:
        duration += finish_time - start_time
    if i == 0:
        outputs = batch_outputs

for (prompt, output) in zip(batch_prompts, outputs):
    print(prompt)
    print("\nPrediction:")
    print(output)
    print("\n\n")

print(f"n_batch={len(batch_prompts)} ")
print(duration/len(batch_prompts)/NUM_REPEATS)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

あなたは誠実で優秀な日本人のアシスタントです。特に指示が無い場合は、常に日本語で回答してください。<|eot_id|><|start_header_id|>user<|end_header_id|>

『起きる』を使って例文を3つ作ってください。<|eot_id|><|start_header_id|>assistant<|end_header_id|>



Prediction:
以下は「起きる」を使った例文です。

1. 明日は早く起きるつもりなので、早く寝よう。
2.地震で目が覚めて起きるが、外は大変なことになっていた。
3.長い間寝ていたが、疲れが取れないので起きることにした。



<|begin_of_text|><|start_header_id|>system<|end_header_id|>

あなたは誠実で優秀な日本人のアシスタントです。特に指示が無い場合は、常に日本語で回答してください。<|eot_id|><|start_header_id|>user<|end_header_id|>

『変える』を使って例文を3つ作ってください。<|eot_id|><|start_header_id|>assistant<|end_header_id|>



Prediction:
assistant

以下は、『変える』を使った例文です。

1. 彼は生活習慣を変えるために、早寝早起きを心がけるようになった。

2. 新しいプロジェクトが始まるにあたり、部長は社内の体制を変える決断を下した。

3. この機会に、彼女は服装を変えることにした。



<|begin_of_text|><|start_header_id|>system<|end_header_id|>

あなたは誠実で優秀な日本人のアシスタントです。特に指示が無い場合は、常に日本語で回答してください。<|eot_id|><|start_header_id|>user<|end_header_id|>

『行なう』を使って例文を3つ作ってください。<|eot_id|><|start_header_id|>assistant<|end_header_id|>



Pre

### Quantized VLLM version!
this is an even faster version

In [1]:
from vllm import LLM, SamplingParams

llm = LLM(model="elyza/Llama-3-ELYZA-JP-8B-AWQ", quantization="awq")
tokenizer = llm.get_tokenizer()

DEFAULT_SYSTEM_PROMPT = "あなたは誠実で優秀な日本人のアシスタントです。特に指示が無い場合は、常に日本語で回答してください。"
sampling_params = SamplingParams(temperature=0.6, top_p=0.9, max_tokens=600)
messages_batch = [
    [
        {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
        {"role": "user", "content": "古代ギリシャを学ぶ上で知っておくべきポイントは？"}
    ],
    [
        {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
        {"role": "user", "content": "クマが海辺に行ってアザラシと友達になり、最終的には家に帰るというプロットの短編小説を書いてください。"}
    ]
]

prompts = [
    tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    for messages in messages_batch
]

outputs = llm.generate(prompts, sampling_params)

  from .autonotebook import tqdm as notebook_tqdm
2024-10-16 14:23:58,475	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
No module named 'vllm._version'
  from vllm.version import __version__ as VLLM_VERSION


INFO 10-16 14:24:02 awq_marlin.py:101] Detected that the model can run with awq_marlin, however you specified quantization=awq explicitly, so forcing awq. Use quantization=awq_marlin for faster inference
INFO 10-16 14:24:02 llm_engine.py:237] Initializing an LLM engine (vdev) with config: model='elyza/Llama-3-ELYZA-JP-8B-AWQ', speculative_config=None, tokenizer='elyza/Llama-3-ELYZA-JP-8B-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_m

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  8.12it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:00<00:00,  2.48it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:00<00:00,  2.77it/s]



INFO 10-16 14:26:14 model_runner.py:1071] Loading model weights took 5.3440 GB
INFO 10-16 14:26:16 gpu_executor.py:122] # GPU blocks: 7162, # CPU blocks: 2048
INFO 10-16 14:26:16 gpu_executor.py:126] Maximum concurrency for 8192 tokens per request: 13.99x
INFO 10-16 14:26:17 model_runner.py:1402] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 10-16 14:26:17 model_runner.py:1406] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 10-16 14:26:28 model_runner.py:1530] Graph capturing finished in 12 secs.


Processed prompts: 100%|██████████| 2/2 [00:04<00:00,  2.38s/it, est. speed input: 32.79 toks/s, output: 237.93 toks/s]


In [12]:
WORDS_LIST = ["起きる","変える","行なう", "冷める", "謝る", "抱く", "決める", "高める", "泊まる", "伺う", "捨てる", "響く", "積もる", "読む", "閉じる", "渡す"]
prompts = []
# inst = "make three different sentences in Japanese using the word '{}'."
inst = "『{}』を使って例文を3つ作ってください。"
for word in WORDS_LIST:
    messages = [
        {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
        {"role": "user", "content": inst.format(word)},
    ]
    prompts.append(tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    ))

In [14]:
# measure time
import time
batch_prompts = prompts[:16]
sampling_params = SamplingParams(temperature=0.6, top_p=0.9, max_tokens=600)
duration = 0.0
NUM_WARMUP = 10
NUM_REPEATS = 20
outputs = []
tokenizer.pad_token_id = tokenizer.eos_token_id
for i in range(NUM_REPEATS+NUM_WARMUP):
    start_time = time.time()
    vllmresults = llm.generate(batch_prompts, sampling_params)
    finish_time = time.time()
    if i >= NUM_WARMUP:
        duration += finish_time - start_time
    if i == 0:
        outputs = [res.outputs[0].text for res in vllmresults]

for (prompt, output) in zip(batch_prompts, outputs):
    print(prompt)
    print("\nPrediction:")
    print(output)
    print("\n\n")


print(f"n_batch={len(batch_prompts)} ")
print(duration/len(batch_prompts)/NUM_REPEATS)

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.44it/s, est. speed input: 96.48 toks/s, output: 125.28 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.36it/s, est. speed input: 90.90 toks/s, output: 128.88 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.53it/s, est. speed input: 102.45 toks/s, output: 128.44 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.07it/s, est. speed input: 71.62 toks/s, output: 129.35 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.64it/s, est. speed input: 110.00 toks/s, output: 128.06 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.23s/it, est. speed input: 54.46 toks/s, output: 130.06 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.07s/it, est. speed input: 62.52 toks/s, output: 129.70 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  3.61it/s, est. speed input: 242.03 toks/s, output: 122.81 toks/s]
Processed prompts: 100%|██████████| 1

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

あなたは誠実で優秀な日本人のアシスタントです。特に指示が無い場合は、常に日本語で回答してください。<|eot_id|><|start_header_id|>user<|end_header_id|>

『起きる』を使って例文を3つ作ってください。<|eot_id|><|start_header_id|>assistant<|end_header_id|>



Prediction:
以下は、『起きる』を使用した例文です。

1.明日は早起きして、早朝の散歩をしてみよう。
2.この間、突然の地震で目が覚め、起きてみると家具が倒れていた。
3.明後日は、早起きして、遠くから見える初日の出を拝みたい。



n_batch=1 
0.6402472615242004





In [None]:
import MeCab
from collections import namedtuple
tagger = MeCab.Tagger("--node-format=%f%m")
tagged = tagger.parse("彼女は毎日のように濯物洗を行なう").split('\n')

TaggedWord = namedtuple('TaggedWord', 'word, yomi, pos')

parts = []
for line in tagged:
    tags = line.split('\t')
    if len(tags) >= 5:
        parts.append(TaggedWord(word=tags[0], yomi=tags[1], pos=tags[4]))
    
print(parts)


In [None]:
import re

sents = re.split("\n[0-9][. ]*", "works out as follows\n1.  fast.\n2 slow 3. without newline!\n\n4. with new\n\n")
sents = [sent.strip() for sent in sents[1:]]
print(sents)