### Leaderboard
##### https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import pipeline
import transformers
import torch
import os 

In [None]:
# Hugging Face access token, taken from the HF_TOKEN environment variable so the
# secret is never stored in the notebook. If the variable is unset this is None.
access_tok = os.environ.get("HF_TOKEN")

In [None]:
# Directory the kernel is currently running in.
default_path = os.getcwd()
# Config directory two levels above the working directory; the path is joined
# as-is, so it still contains the literal "../.." segments.
config_path = os.path.join(default_path, '../../config')

### Installed Model List (docker: fingerai/llm) 

In [3]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB)
Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.42.0
[0m

In [6]:
# Case: Load model directly
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, AutoConfig
from peft import PeftModel

# Sampling settings that the inference cell unpacks into model.generate().
generation_config = {
    "temperature": 0.3,
    "top_k": 40,
    "top_p": 0.9,
    "do_sample": True,
    "num_beams": 1,
    "repetition_penalty": 1.1,
    "max_new_tokens": 400,
}

# 4-bit NF4 quantization settings (double quantization, bfloat16 compute) that
# are passed as quantization_config when the Llama model is loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Fetch the base model's configuration. `access_tok` must already exist: the
# NameError recorded for this cell shows it was run before the token cell.
config = AutoConfig.from_pretrained('meta-llama/Llama-2-7b-chat-hf', token=access_tok)

NameError: name 'access_tok' is not defined

In [None]:
# Load the Llama-2 chat base model with the 4-bit quantization settings in bnb_config.
model = LlamaForCausalLM.from_pretrained(
    'meta-llama/Llama-2-7b-chat-hf',
    # low_cpu_mem_usage=True,
    quantization_config=bnb_config,
    token=access_tok
)

# The tokenizer comes from the Korean adapter repository, not from the base model.
tokenizer = LlamaTokenizer.from_pretrained('Chang-Su/llama-2-7b-chat-ko')
# Resize the embeddings to the tokenizer's length before attaching the adapter
# (presumably the Korean tokenizer's vocabulary differs from the base model's — TODO confirm).
model.resize_token_embeddings(len(tokenizer))
# Wrap the base model with the PEFT adapter weights from the same repository.
model = PeftModel.from_pretrained(model, 'Chang-Su/llama-2-7b-chat-ko')
model.eval()

In [None]:
# Run one prompt through the notebook-level `model`/`tokenizer` and print the reply.
input_text = '신용등급 알려줘'
with torch.no_grad():
    print("Start inference.")
    results = []
    # Tokenize and move the tensors to the device the model's weights are on,
    # instead of assuming a fixed 'cuda:0'.
    # TODO: decide whether add_special_tokens=False is wanted here.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    generation_output = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        **generation_config
    )
    s = generation_output[0]
    output = tokenizer.decode(s, skip_special_tokens=True)

    # Keep the text AFTER the last "### Response:" marker. Index [0] would keep
    # the prompt part instead; with no marker present the whole text is kept.
    response = output.split("### Response:")[-1].strip()
    print("====================")
    print(f"Input: '{input_text}'\n")
    print(f"Output: {response}\n")

    results.append({"Input":input_text,"Output":response})

#### Ko-LLM 2 
##### https://huggingface.co/kfkas/Llama-2-ko-7b-Chat

In [None]:
def gen(x, model, tokenizer, device):
    """Generate the model's answer to the instruction ``x``.

    The instruction is wrapped in the fixed Korean instruction/response
    template, tokenized, moved to ``device`` and passed to ``model.generate``.

    Args:
        x: Instruction text inserted into the prompt template.
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Callable tokenizer exposing ``decode`` and ``eos_token_id``.
        device: Device the tokenized inputs are moved to.

    Returns:
        The decoded text of the newly generated tokens only, with special
        tokens removed.
    """
    prompt = (
        f"아래는 작업을 설명하는 명령어입니다. 요청을 적절히 완료하는 응답을 작성하세요.\n\n### 명령어:\n{x}\n\n### 응답:"
    )
    inputs = tokenizer(prompt, return_tensors="pt", return_token_type_ids=False).to(
        device
    )
    # Number of prompt tokens; used to cut the prompt off the generated sequence.
    prompt_len = len(inputs["input_ids"][0])
    gened = model.generate(
        **inputs,
        max_new_tokens=1024,
        early_stopping=True,
        do_sample=True,
        top_k=20,
        top_p=0.92,
        no_repeat_ngram_size=3,
        # Take the EOS id from the tokenizer rather than hard-coding it.
        eos_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.2,
        num_beams=3
    )
    # Slice by token count, not by character count: the decoded full sequence
    # also contains special tokens, so a character offset equal to
    # len(prompt) does not line up with the end of the prompt.
    return tokenizer.decode(gened[0][prompt_len:], skip_special_tokens=True)

In [None]:
def LLM_infer(input):
    """Load kfkas/Llama-2-ko-7b-Chat and return its answer to ``input``.

    The model and tokenizer are loaded from scratch on every call, then the
    text is passed to ``gen``.

    Args:
        input: Instruction text for the model. The name shadows the built-in
            ``input`` inside this function; it is kept for existing callers.

    Returns:
        Whatever ``gen`` returns for this instruction.
    """
    on_gpu = torch.cuda.is_available()
    device = torch.device("cuda:0") if on_gpu else torch.device("cpu")
    model_id = "kfkas/Llama-2-ko-7b-Chat"
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        # Place the weights on the same device the inputs are sent to, so the
        # CPU fallback above is actually usable.
        device_map={"": device},
        # Half precision only on GPU; full precision on CPU.
        torch_dtype=torch.float16 if on_gpu else torch.float32,
        low_cpu_mem_usage=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model.eval()
    model.config.use_cache = True
    # Reuse the EOS token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    output = gen(input, model=model, tokenizer=tokenizer, device=device)
    return output

In [None]:
# Ask the model one question and print the reply. LLM_infer loads the model
# again on every call, so each of these cells pays the full load cost.
if __name__ == "__main__":
    text = LLM_infer("너는 누구야 ? ")
    print(text)

In [None]:
# Same check with a finance-related prompt; the model is reloaded inside LLM_infer.
if __name__ == "__main__":
    text = LLM_infer("신용등급 알려줘")
    print(text)

In [None]:
# Same check with a general-knowledge prompt; the model is reloaded inside LLM_infer.
if __name__ == "__main__":
    text = LLM_infer("DNA는 무엇의 약자인가요")
    print(text)

#### Ko-LLM 3 

In [None]:
# Show the name/path stored on the notebook-level `config` (whichever config cell ran last).
config._name_or_path

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import FalconModel, FalconConfig
from peft import PeftModel, PeftConfig

# Fetch the base Falcon config; below it is only used for its stored name/path.
config = AutoConfig.from_pretrained('ybelkada/falcon-7b-sharded-bf16', token=access_tok)
peft_model_id = 'dev7halo/falcon-7b-sharded-bf16-KoAlpaca'
# 4-bit NF4 quantization with float16 compute; this rebinds the notebook-level bnb_config.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run locally — confirm it is still required for this repository.
model = AutoModelForCausalLM.from_pretrained(
    config._name_or_path,
    quantization_config=bnb_config,
    trust_remote_code=True
)
# Attach the KoAlpaca adapter on top of the quantized base model.
model = PeftModel.from_pretrained(model, peft_model_id)
tokenizer = AutoTokenizer.from_pretrained(config._name_or_path)

# NOTE(review): the model was loaded with a bitsandbytes quantization config;
# moving such models with .to() may be redundant or unsupported — confirm.
model = model.to('cuda')
model.eval()

In [None]:
# Tokenize one question; only the input ids are passed on to generate().
inputs = tokenizer("광해군은 폭군이었나요 ?", return_tensors="pt")

with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"].to("cuda"),
        max_new_tokens=756,
    )
    # Decode the first (and only) generated sequence, dropping special tokens.
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

#### Ko-LLM 4 

In [None]:
# komt-mistral checkpoint: fetch its config, then load the weights in float16 on device 0.
model_id = "davidkim205/komt-mistral-7b-v1"
config = AutoConfig.from_pretrained(model_id, token=access_tok)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map={"": 0},
)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TextStreamer, GenerationConfig

# NOTE(review): this loads the same checkpoint as the previous cell again, this
# time without dtype or device arguments, and rebinds `model` — confirm the
# second load is intended.
model_name='davidkim205/komt-mistral-7b-v1'
model = AutoModelForCausalLM.from_pretrained(model_name) # , device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Streamer handed to model.generate() by gen() below.
streamer = TextStreamer(tokenizer)

In [None]:
# Always a torch.device, on CPU-only machines too, so later code can rely on
# attributes such as g_device.type.
g_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the selected device; as the cell's last expression, the returned value is also displayed.
model.to(g_device)

In [None]:
# Show which device was selected.
g_device

In [None]:
def gen(x):
    """Generate a reply to ``x`` with the notebook-level model.

    Uses the globals ``model``, ``tokenizer`` and ``streamer``. The question
    is wrapped as ``[INST]{x} [/INST]`` and sampled with the settings below.

    NOTE: this definition replaces the earlier
    ``gen(x, model, tokenizer, device)``. ``LLM_infer`` calls ``gen`` with
    those four arguments, so it fails if called after this cell has run.

    Args:
        x: Question or instruction text.

    Returns:
        The decoded text of the newly generated tokens (prompt excluded,
        special tokens removed), stripped of surrounding whitespace. If the
        text contains a "### Response: " tag, only what follows it is returned.
    """
    generation_config = GenerationConfig(
        temperature=0.8,
        top_p=0.8,
        top_k=100,
        max_new_tokens=1024,
        early_stopping=True,
        do_sample=True,
    )
    q = f"[INST]{x} [/INST]"
    # Send the inputs to the device the model is on, not a fixed 'cuda'.
    inputs = tokenizer(
        q,
        return_tensors='pt',
        return_token_type_ids=False
    ).to(model.device)
    # Number of prompt tokens; used to cut the prompt off the generated sequence.
    prompt_len = len(inputs["input_ids"][0])
    gened = model.generate(
        **inputs,
        generation_config=generation_config,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )
    # Decode only the new tokens: the prompt built above has no
    # "### Response: " tag, so tag-based trimming alone would return the
    # prompt together with the answer.
    result_str = tokenizer.decode(gened[0][prompt_len:], skip_special_tokens=True)

    # Still honour the tag if the generated text happens to contain it.
    start_tag = "\n\n### Response: "
    start_index = result_str.find(start_tag)

    if start_index != -1:
        result_str = result_str[start_index + len(start_tag):]
    return result_str.strip()

In [None]:
import numpy as np
# Example question for the single-argument gen(); its return value is displayed as the cell output.
txt = '금리가 물가에 미치는 영향을 설명해주세요'
gen(txt)

#### Ko-LLM 5 

In [None]:
from transformers import AutoModel, AutoTokenizer

# Load the kakaobank/kf-deberta-base model and tokenizer; this rebinds the
# notebook-level `model` and `tokenizer` used by earlier sections.
model = AutoModel.from_pretrained("kakaobank/kf-deberta-base")
tokenizer = AutoTokenizer.from_pretrained("kakaobank/kf-deberta-base")

In [None]:
# Show the vocabulary size reported by the tokenizer.
tokenizer.vocab_size

In [None]:
tokenizer.vocab  # NOTE: "핑개" appears to map to the unknown token — TODO confirm

#### Ko-LLM 6

In [None]:
# Ko-LLM 6 checkpoint: fetch its config with the access token, then load the
# weights in float16 with the whole model mapped to device 0.
model_id = "amphora/olaf-v.42.0.2"
config = AutoConfig.from_pretrained(model_id, token=access_tok)
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map={"": 0}, torch_dtype=torch.float16, low_cpu_mem_usage=True
)

In [None]:
# Display the notebook-level `model` object.
model

In [None]:
from transformers import XLMRobertaForCausalLM, AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and (again) the model for `model_id`, without device or dtype arguments.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

input_str = "장 전체가 폭락한 가운데 삼성전자만 상승세를 이어갔다. </s> 삼성전자"
# Not named `input`, so the built-in input() is not shadowed for the rest of the session.
encoded_input = tokenizer(input_str, return_tensors='pt')
output = model.generate(**encoded_input, max_length=20)

In [None]:
# Show the raw value returned by model.generate().
output

In [None]:
# Decode the first generated sequence back to text.
tokenizer.decode(output[0])

In [None]:
# Show the vocabulary size reported by the current tokenizer.
tokenizer.vocab_size

In [None]:
# Display the tokenizer's full vocabulary mapping (the output can be very large).
tokenizer.vocab

In [None]:
# Invert the tokenizer's vocabulary so each value of tokenizer.vocab becomes a
# key and each original key becomes its value, then display the result.
vocabs = {token_id: token for token, token_id in tokenizer.vocab.items()}
vocabs

In [None]:
# Look up the entry stored under key 135644 in the inverted vocabulary.
vocabs[135644]

In [None]:
from transformers import XLMRobertaForCausalLM, AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and model for amphora/olaf-v.42.0.2, without device or dtype arguments.
tokenizer = AutoTokenizer.from_pretrained("amphora/olaf-v.42.0.2")
model = AutoModelForCausalLM.from_pretrained("amphora/olaf-v.42.0.2")

input_str = "금리가 물가에 미치는 영향을 설명해주세요"
# Not named `input`, so the built-in input() is not shadowed for the rest of the session.
encoded_input = tokenizer(input_str, return_tensors='pt')
output = model.generate(**encoded_input, max_length=200)

In [None]:
# Decode the first generated sequence back to text.
tokenizer.decode(output[0])