In [1]:
!pip install pandas

In [None]:
!pip install sentence-transformers

In [None]:
!pip install langchain

In [4]:
!pip install langchain-community

In [None]:
!pip install faiss-gpu

In [28]:
import pandas as pd

In [16]:
# Mount Google Drive at /content/data (the data paths configured in `args` live under this mount point).
from google.colab import drive
drive.mount("/content/data")

Mounted at /content/data


In [2]:
import easydict

# Single container for the file locations used by the notebook.
args = easydict.EasyDict()

# Root data directory on the mounted Drive; the trailing slash is required
# because the file names below are appended to it directly.
args.default_path = "/content/data/MyDrive/Playdata/Competitions/ML/HanSol/Data/"

# CSV files located directly under the root data directory.
args.train_path = f"{args.default_path}train.csv"
args.test_path = f"{args.default_path}test.csv"
args.sample_submission_path = f"{args.default_path}sample_submission.csv"

- bitsandbytes: Bitsandbytes는 CUDA 사용자 정의 함수, 특히 8비트 최적화 프로그램, 행렬 곱셈(LLM.int8()) 및 양자화 함수에 대한 경량 래퍼
- PEFT(Parameter-Efficient Fine-Tuning): 모델의 모든 매개변수를 미세 조정하지 않고도 사전 훈련된 PLM(언어 모델)을 다양한 다운스트림 애플리케이션에 효율적으로 적용 가능
- accelerate: PyTorch 모델을 더 쉽게 여러 컴퓨터나 GPU에서 사용할 수 있게 해주는 도구

In [3]:
#양자화에 필요한 패키지 설치
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m279.7/279.7 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone


## 2단계 - 트랜스포머에서 BitsAndBytesConfig를 통해 양자화 매개변수 정의하기


* load_in_4bit=True: 모델을 4비트 정밀도로 변환하고 로드하도록 지정
* bnb_4bit_use_double_quant=True: 메모리 효율을 높이기 위해 중첩 양자화를 사용하여 추론 및 학습
* bnb_4bit_quant_type="nf4": 4비트 통합에는 2가지 양자화 유형인 FP4와 NF4가 제공됨. NF4 dtype은 Normal Float 4를 나타내며 QLoRA 백서에 소개되어 있습니다. 기본적으로 FP4 양자화 사용
* bnb_4bit_compute_dtype=torch.bfloat16: 계산 중 사용할 dtype을 변경하는 데 사용되는 계산 dtype. 기본적으로 계산 dtype은 float32로 설정되어 있지만 계산 속도를 높이기 위해 bf16으로 설정 가능



In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Quantization settings: 4-bit loading, double quantization, the "nf4"
# quantization type, and bfloat16 as the compute dtype.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

## 3단계 - 경량화 모델 로드하기
-  모델 ID를 지정한 다음 이전에 정의한 양자화 구성으로 로드

In [5]:
# Identifier of the checkpoint used as the generator.
model_id = "kyujinpy/Ko-PlatYi-6B"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the causal LM with the quantization settings from `bnb_config`;
# device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/9.62k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.28M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.99G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/2.37G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [6]:
# Print the module tree of the loaded model (the projection layers appear as Linear4bit in the output).
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(78464, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=512, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=512, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (n

## 4단계 - 잘 실행되는지 확인

In [7]:
# Smoke test: send one chat-formatted question through the quantized model.
device = "cuda:0"

messages = [dict(role="user", content="은행의 기준 금리에 대해서 설명해줘")]

# Tokenize via the tokenizer's chat template, then move the ids to the GPU.
encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
model_inputs = encodeds.to(device)

# Sample up to 1000 new tokens and print the first decoded sequence.
sampling_options = dict(max_new_tokens=1000, do_sample=True)
generated_ids = model.generate(model_inputs, **sampling_options)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])


No chat template is defined for this tokenizer - using the default template for the LlamaTokenizerFast class. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.



<|startoftext|> [INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>

은행의 기준 금리에 대해서 설명해줘 [/INST]

[INST]<<SAMPLE QUESTION>> 은행의 기준 금리에 대해 설명해 달라 [/INST]

[SAMPLE QUESTION] 금리 - 은행 금리가 어떻게 설정되었는지 알고 싶습니다.

은행에서 이자를 제공하므로 금리의 영향을 받을 수 있습니다. 은행이 금리에 대해 어떻게 결정을 내리는지에 대한 정보가 부족합니다. 은행은 예금 금리를 정할 수 있는 더 많은 재정적 및 법적 선택권이 있는 경우가 많지만 모든 은행이 이러한 선택권을 가진 것은 아닙니다.

금리에 대한 자세한 내용은 다음을 참조하십시오: "은행의 금리와 이율 결정".

만일 은행 또는 연방 준비 제도(FRB)와 같은 중앙 관리가 아닌 다른 곳에서 기준금리가 결정된다고 말한다면 이는 사실과 다릅니다. 기준 금리는 연방 준비 제도(FRB) 내의 연방 통화 위원회의 회의를 통해 결정됩

## 5단계- RAG 시스템 결합하기

In [8]:
# pip install시 utf-8, ansi 관련 오류날 경우 필요한 코드
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [9]:
!pip -q install langchain pypdf chromadb sentence-transformers faiss-gpu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.0/284.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.0/509.0 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.1/92.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.7/60.7 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.1/41.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [10]:
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from transformers import pipeline
from langchain.chains import LLMChain

# Text-generation pipeline wrapping the quantized model and its tokenizer.
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    # `temperature` only takes effect in sampling mode. Without do_sample=True
    # decoding is greedy and the 0.2 setting below is silently ignored, so
    # sampling is enabled explicitly here.
    do_sample=True,
    temperature=0.2,
    # NOTE(review): presumably the LangChain wrapper strips the prompt from the
    # full text itself — confirm before changing this flag.
    return_full_text=True,
    max_new_tokens=300,
)

# Prompt layout: retrieved context first, then the user's question.
prompt_template = """
### [INST]
Instruction: Answer the question based on your knowledge.
Here is context to help:

{context}

### QUESTION:
{question}

[/INST]
 """

# Expose the transformers pipeline as a LangChain LLM.
koplatyi_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Create prompt from prompt template
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Create llm chain
llm_chain = LLMChain(llm=koplatyi_llm, prompt=prompt)

In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
# from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.schema.runnable import RunnablePassthrough

In [17]:
# Load the training CSV as LangChain documents.
# Presumably CSVLoader yields one document per CSV row — TODO confirm.
loader = CSVLoader(file_path=args.train_path,encoding='utf-8')
data = loader.load()

In [18]:
# Split the loaded documents into chunks (chunk_size=500, chunk_overlap=50).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = text_splitter.split_documents(data)

In [19]:
from langchain.embeddings import HuggingFaceEmbeddings

# Sentence-embedding model used to vectorize the chunks; embeddings are normalized.
model_name = "jhgan/ko-sbert-nli"
encode_kwargs = dict(normalize_embeddings=True)
hf = HuggingFaceEmbeddings(encode_kwargs=encode_kwargs, model_name=model_name)

# Build the FAISS index from the chunked documents and expose it as a
# similarity retriever configured with k=3.
db = FAISS.from_documents(texts, hf)
retriever = db.as_retriever(search_kwargs=dict(k=3), search_type="similarity")

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.46k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/620 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/538 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [20]:
# Input mapping for the chain: the incoming question is sent to the retriever
# to fill "context" and is passed through unchanged as "question".
# NOTE(review): the retriever output is handed to the prompt's {context} slot
# as-is; presumably a list of Document objects that gets stringified — confirm
# whether it should be joined into plain text first.
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}

rag_chain = rag_inputs | llm_chain


In [39]:
# Try the chain on a single question.
result = rag_chain.invoke("도배지에 녹은 자국이 발생하는 주된 원인과 그 해결 방법은 무엇인가요?")

# Disabled: inspect the retrieved evidence that was passed as context.
# for i in result['context']:
#     print(f"주어진 근거: {i.page_content} / 출처: {i.metadata['source']} - {i.metadata['row']} \n\n")

# Flatten newlines into spaces, then drop the "답변:" marker before printing.
flattened = result['text'].replace("\n\n", " ").replace("\n", " ")
print(flattened.replace("답변:", ""))



  도배지에 녹이 묻어나오는 주된 원인은 높은 습도, 누수, 곰팡이입니다. 1. 높은 습도: 높은 습도로 인해 도배지 안쪽의 금속의 녹이 도배지에 베어나올 수 있습니다. 책임소재: 건물의 소유자나 거주자가 습기 관리의 책임이 있습니다. 해결 방법: 제습기 가동, 환기를 통해 실내 적정 습도를 유지하고 전문가의 도움을 받아 보수작업을 하는 것을 추천합니다. 2. 누수: 누수에 의해 도배지가 젖어 있는 상태가 지속되면 곰팡이가 발생할 수 있습니다. 책임소재: 건물의 소유자나 거주자가 책임이 있습니다. 해결 방법: 보수작업을 통해 누수를 제거하고 곰팡이가 발생한 도배지의 부분 및 전체를 교체해야 합니다. 해당 작업은 개인이 하기 어려운 작업이니 전문가의 도움을 받는 것을 추천합니다.


In [29]:
# Read the test set; the inference loop reads its 'id' and '질문' columns.
test_df = pd.read_csv(args.test_path)

In [None]:
# Display the test DataFrame for a quick visual check.
test_df

In [31]:
from tqdm import tqdm

In [41]:
# Install a blanket 'ignore' filter: from here on every Python warning is suppressed.
# NOTE(review): this hides library warnings as well — consider a narrower filter.
import warnings
warnings.filterwarnings('ignore')

In [42]:
# Answer every test question with the RAG chain, collecting one record per row.
result = []

# Iterate over the column values directly so the loop does not depend on the
# DataFrame having a default 0..n-1 index (which `.at[i, ...]` required).
# `total` is passed because zip() has no length for tqdm to read.
# No per-iteration print(): it only interleaved blank lines with the progress bar.
for _id, _q in tqdm(zip(test_df['id'], test_df['질문']), total=len(test_df)):
  rst = rag_chain.invoke(_q)
  result.append(
      {
          "id":_id,
          # Flatten newlines into spaces and drop the "답변:" marker from the generated text.
          "대답":rst['text'].replace("\n\n", " ").replace("\n", " ").replace("답변:", "")
      }
  )

  1%|          | 1/130 [00:31<1:07:51, 31.56s/it]




  2%|▏         | 2/130 [00:53<54:44, 25.66s/it]  




  2%|▏         | 3/130 [01:08<44:39, 21.10s/it]




  3%|▎         | 4/130 [01:40<53:16, 25.37s/it]




  4%|▍         | 5/130 [01:51<41:43, 20.03s/it]




  5%|▍         | 6/130 [02:05<37:41, 18.24s/it]




  5%|▌         | 7/130 [02:37<46:19, 22.59s/it]




  6%|▌         | 8/130 [02:50<39:25, 19.39s/it]




  7%|▋         | 9/130 [03:22<47:14, 23.42s/it]




  8%|▊         | 10/130 [03:30<37:22, 18.69s/it]




  8%|▊         | 11/130 [03:38<30:49, 15.54s/it]




  9%|▉         | 12/130 [04:10<40:25, 20.56s/it]




 10%|█         | 13/130 [04:23<35:33, 18.23s/it]




 11%|█         | 14/130 [04:36<31:47, 16.44s/it]




 12%|█▏        | 15/130 [05:08<40:28, 21.12s/it]




 12%|█▏        | 16/130 [05:22<36:26, 19.18s/it]




 13%|█▎        | 17/130 [05:50<40:48, 21.66s/it]




 14%|█▍        | 18/130 [06:22<46:31, 24.93s/it]




 15%|█▍        | 19/130 [06:54<49:55, 26.99s/it]




 15%|█▌        | 20/130 [07:03<39:25, 21.50s/it]




 16%|█▌        | 21/130 [07:25<39:26, 21.71s/it]




 17%|█▋        | 22/130 [07:39<35:05, 19.49s/it]




 18%|█▊        | 23/130 [08:03<37:14, 20.88s/it]




 18%|█▊        | 24/130 [08:19<34:02, 19.27s/it]




 19%|█▉        | 25/130 [08:40<34:30, 19.72s/it]




 20%|██        | 26/130 [09:11<40:14, 23.21s/it]




 21%|██        | 27/130 [09:37<41:15, 24.04s/it]




 22%|██▏       | 28/130 [10:10<45:30, 26.77s/it]




 22%|██▏       | 29/130 [10:26<39:38, 23.55s/it]




 23%|██▎       | 30/130 [10:58<43:30, 26.10s/it]




 24%|██▍       | 31/130 [11:08<34:52, 21.13s/it]




 25%|██▍       | 32/130 [11:14<27:08, 16.62s/it]




 25%|██▌       | 33/130 [11:35<29:02, 17.97s/it]




 26%|██▌       | 34/130 [11:53<29:01, 18.15s/it]




 27%|██▋       | 35/130 [12:01<23:30, 14.85s/it]




 28%|██▊       | 36/130 [12:09<20:16, 12.94s/it]




 28%|██▊       | 37/130 [12:18<18:14, 11.77s/it]




 29%|██▉       | 38/130 [12:24<15:32, 10.13s/it]




 30%|███       | 39/130 [12:48<21:23, 14.10s/it]




 31%|███       | 40/130 [12:58<19:31, 13.02s/it]




 32%|███▏      | 41/130 [13:31<28:09, 18.98s/it]




 32%|███▏      | 42/130 [13:49<27:20, 18.65s/it]




 33%|███▎      | 43/130 [14:10<28:03, 19.35s/it]




 34%|███▍      | 44/130 [14:42<33:12, 23.17s/it]




 35%|███▍      | 45/130 [14:51<26:50, 18.95s/it]




 35%|███▌      | 46/130 [15:14<28:02, 20.03s/it]




 36%|███▌      | 47/130 [15:46<32:34, 23.55s/it]




 37%|███▋      | 48/130 [16:03<29:47, 21.80s/it]




 38%|███▊      | 49/130 [16:36<33:46, 25.02s/it]




 38%|███▊      | 50/130 [17:08<36:23, 27.29s/it]




 39%|███▉      | 51/130 [17:17<28:21, 21.54s/it]




 40%|████      | 52/130 [17:24<22:29, 17.30s/it]




 41%|████      | 53/130 [17:42<22:22, 17.43s/it]




 42%|████▏     | 54/130 [17:50<18:44, 14.79s/it]




 42%|████▏     | 55/130 [18:08<19:35, 15.67s/it]




 43%|████▎     | 56/130 [18:30<21:41, 17.59s/it]




 44%|████▍     | 57/130 [19:02<26:35, 21.85s/it]




 45%|████▍     | 58/130 [19:10<21:24, 17.83s/it]




 45%|████▌     | 59/130 [19:17<17:11, 14.53s/it]




 46%|████▌     | 60/130 [19:33<17:15, 14.80s/it]




 47%|████▋     | 61/130 [19:48<17:22, 15.10s/it]




 48%|████▊     | 62/130 [20:09<19:03, 16.82s/it]




 48%|████▊     | 63/130 [20:40<23:26, 21.00s/it]




 49%|████▉     | 64/130 [21:10<25:59, 23.62s/it]




 50%|█████     | 65/130 [21:38<27:03, 24.98s/it]




 51%|█████     | 66/130 [21:51<22:42, 21.29s/it]




 52%|█████▏    | 67/130 [22:22<25:37, 24.40s/it]




 52%|█████▏    | 68/130 [22:32<20:46, 20.10s/it]




 53%|█████▎    | 69/130 [22:38<16:06, 15.85s/it]




 54%|█████▍    | 70/130 [22:48<14:00, 14.01s/it]




 55%|█████▍    | 71/130 [23:13<16:55, 17.21s/it]




 55%|█████▌    | 72/130 [23:21<14:02, 14.52s/it]




 56%|█████▌    | 73/130 [23:43<16:05, 16.94s/it]




 57%|█████▋    | 74/130 [23:51<13:04, 14.01s/it]




 58%|█████▊    | 75/130 [24:23<17:59, 19.63s/it]




 58%|█████▊    | 76/130 [24:56<21:07, 23.48s/it]




 59%|█████▉    | 77/130 [25:20<20:55, 23.70s/it]




 60%|██████    | 78/130 [25:53<22:53, 26.42s/it]




 61%|██████    | 79/130 [26:04<18:31, 21.79s/it]




 62%|██████▏   | 80/130 [26:16<15:46, 18.94s/it]




 62%|██████▏   | 81/130 [26:28<13:42, 16.79s/it]




 63%|██████▎   | 82/130 [26:42<12:50, 16.05s/it]




 64%|██████▍   | 83/130 [26:50<10:38, 13.58s/it]




 65%|██████▍   | 84/130 [27:10<12:00, 15.66s/it]




 65%|██████▌   | 85/130 [27:19<10:02, 13.38s/it]




 66%|██████▌   | 86/130 [27:36<10:42, 14.60s/it]




 67%|██████▋   | 87/130 [28:07<14:02, 19.58s/it]




 68%|██████▊   | 88/130 [28:18<11:56, 17.05s/it]




 68%|██████▊   | 89/130 [28:33<11:09, 16.32s/it]




 69%|██████▉   | 90/130 [28:49<10:46, 16.17s/it]




 70%|███████   | 91/130 [28:57<09:01, 13.87s/it]




 71%|███████   | 92/130 [29:29<12:14, 19.32s/it]




 72%|███████▏  | 93/130 [29:47<11:38, 18.87s/it]




 72%|███████▏  | 94/130 [30:07<11:35, 19.31s/it]




 73%|███████▎  | 95/130 [30:39<13:19, 22.85s/it]




 74%|███████▍  | 96/130 [30:49<10:46, 19.01s/it]




 75%|███████▍  | 97/130 [31:02<09:30, 17.29s/it]




 75%|███████▌  | 98/130 [31:16<08:46, 16.44s/it]




 76%|███████▌  | 99/130 [31:49<11:02, 21.38s/it]




 77%|███████▋  | 100/130 [32:10<10:37, 21.26s/it]




 78%|███████▊  | 101/130 [32:19<08:25, 17.44s/it]




 78%|███████▊  | 102/130 [32:52<10:21, 22.19s/it]




 79%|███████▉  | 103/130 [33:24<11:14, 24.99s/it]




 80%|████████  | 104/130 [33:47<10:35, 24.42s/it]




 81%|████████  | 105/130 [34:00<08:49, 21.19s/it]




 82%|████████▏ | 106/130 [34:34<09:55, 24.82s/it]




 82%|████████▏ | 107/130 [34:49<08:29, 22.15s/it]




 83%|████████▎ | 108/130 [35:03<07:10, 19.57s/it]




 84%|████████▍ | 109/130 [35:15<06:01, 17.20s/it]




 85%|████████▍ | 110/130 [35:30<05:35, 16.77s/it]




 85%|████████▌ | 111/130 [36:03<06:50, 21.60s/it]




 86%|████████▌ | 112/130 [36:36<07:26, 24.80s/it]




 87%|████████▋ | 113/130 [37:06<07:28, 26.36s/it]




 88%|████████▊ | 114/130 [37:28<06:44, 25.28s/it]




 88%|████████▊ | 115/130 [37:39<05:14, 20.94s/it]




 89%|████████▉ | 116/130 [38:12<05:40, 24.35s/it]




 90%|█████████ | 117/130 [38:44<05:47, 26.73s/it]




 91%|█████████ | 118/130 [38:56<04:27, 22.31s/it]




 92%|█████████▏| 119/130 [39:19<04:07, 22.47s/it]




 92%|█████████▏| 120/130 [39:54<04:22, 26.25s/it]




 93%|█████████▎| 121/130 [40:09<03:27, 23.02s/it]




 94%|█████████▍| 122/130 [40:41<03:24, 25.54s/it]




 95%|█████████▍| 123/130 [40:51<02:27, 21.13s/it]




 95%|█████████▌| 124/130 [41:16<02:13, 22.28s/it]




 96%|█████████▌| 125/130 [41:26<01:32, 18.55s/it]




 97%|█████████▋| 126/130 [41:42<01:10, 17.71s/it]




 98%|█████████▊| 127/130 [41:49<00:43, 14.50s/it]




 98%|█████████▊| 128/130 [42:00<00:26, 13.43s/it]




 99%|█████████▉| 129/130 [42:10<00:12, 12.47s/it]




100%|██████████| 130/130 [42:21<00:00, 19.55s/it]







In [43]:
from sentence_transformers import SentenceTransformer

In [44]:
# Sentence-embedding model used to turn each generated answer into a vector.
_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

In [45]:
# Embed each answer (zero-width spaces replaced by regular spaces) and store
# the vector on its record under 'embedding'.
for record in result:
  answer_text = record['대답'].replace("\u200b"," ")
  record['embedding'] = _model.encode(answer_text)

In [46]:
# One row per answer: the id followed by one vec_<j> column per embedding component.
submission = [
    {
        "id": record['id'],
        **{f'vec_{j}': component for j, component in enumerate(record['embedding'])},
    }
    for record in result
]

In [47]:
# Write the submission rows to CSV without the DataFrame index column.
submission_df = pd.DataFrame(submission)
submission_df.to_csv("submission_byseul.csv", index=False)