In [None]:
pip install torch transformers

In [3]:
import json

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, GenerationConfig, pipeline
import pandas as pd
from tqdm import tqdm

#### json to csv (시험지 생성)

In [None]:
def convert_json_to_csv(json_file_path, csv_file_path):
    """Convert the conversation JSON dataset into a prompt CSV ("exam sheet").

    Every item of the JSON array becomes one CSV row with two columns:
    ``instruction`` (the rendered conversation followed by a summarization
    request that names the item's subject keywords) and ``output`` (left
    empty, to be filled in by the inference cells).

    Args:
        json_file_path: Path to a UTF-8 JSON file containing a list of items.
            Each item must provide ``item["input"]["conversation"]`` (a list of
            dicts with ``speaker`` and ``utterance`` keys) and
            ``item["input"]["subject_keyword"]`` (a list of strings).
        csv_file_path: Destination CSV path. Written as UTF-8 with BOM
            (``utf-8-sig``) and without the index column.

    Raises:
        KeyError: If an item lacks one of the expected keys.
    """
    with open(json_file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    rows = []

    for item in data:
        conversation = item["input"]["conversation"]
        category = item["input"]["subject_keyword"]
        output = ""

        # The question depends only on the item, not on individual utterances.
        # Building it here (instead of inside the utterance loop) keeps it
        # defined for items with an empty conversation and avoids leaking the
        # previous item's question into the current row.
        category_text = ", ".join(category)
        question_text = (
            f"\n[Question]\n위 {category_text} 주제에 대한 대화를 요약해주세요."
        )

        # One "화자<speaker>: <utterance>" line per turn.
        conversation_text = "[Conversation]\n"
        for utterance in conversation:
            speaker = f"화자{utterance['speaker']}"
            text = utterance["utterance"]
            conversation_text += f"{speaker}: {text}\n"

        instruction_text = conversation_text + question_text

        rows.append({"instruction": instruction_text, "output": output})

    # Fix the column set explicitly so an empty dataset still yields a header row.
    df = pd.DataFrame(rows, columns=["instruction", "output"])
    df.to_csv(csv_file_path, index=False, encoding="utf-8-sig")

In [None]:
# Path to the JSON data file
json_file_path = "./data/일상대화요약_test.json"
# Path of the CSV file where the result is saved
csv_file_path = "./data/일상대화요약_시험지.csv"

# Build the prompt CSV ("exam sheet") that the inference cells read.
convert_json_to_csv(json_file_path, csv_file_path)

### 1. 일반 추론

In [None]:
# Hugging Face model id used for both the model weights and the tokenizer.
BASE_MODEL = "cpm-ai/Gemma2-Malpyung-DailyConversationSummary-NA"

# Load the causal LM in float16; device_map={"": 0} maps the root module
# (i.e. the whole model) to device index 0.
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL,
                                             torch_dtype=torch.float16,
                                             device_map={"":0},
                                            )
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Streamer passed to model.generate in the inference cell.
streamer = TextStreamer(tokenizer)

In [None]:
# Load the exam sheet: one prompt per row in the 'instruction' column.
df = pd.read_csv('./data/일상대화요약_시험지.csv')
output = []
# Series.items() is a generator without a length, so tqdm needs an explicit
# total to display a percentage and ETA.
for index, instruction in tqdm(df['instruction'].items(), total=len(df)):

    print(f"처리 중인 행: {index}")

    # Wrap the instruction as a single user turn and render it with the
    # tokenizer's chat template, ending with the generation prompt.
    chat = [
        {"role": "user", "content": f"{instruction}"}
    ]
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

    # The prompt is already fully rendered text, so it is encoded without
    # asking the tokenizer to add special tokens again.
    inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
    # do_sample=False -> deterministic (greedy) decoding, at most 1024 new tokens.
    outputs = model.generate(input_ids=inputs.to(model.device),
                             max_new_tokens=1024,
                             do_sample=False,
                             streamer=streamer
                            )

    # generate() returns prompt + continuation; keep only the continuation.
    input_length = inputs.shape[1]
    model_output = outputs[0][input_length:]
    decoded_output = tokenizer.decode(model_output, skip_special_tokens=True)
    response = decoded_output.rstrip('\n')

    print('output')
    print(response)

    output.append(response)

df['output'] = output

# Save the answer sheet
df.to_csv('./data/일상대화요약_답지.csv', index=False, encoding='utf-8-sig')

### 2. vLLM 추론 (더 빠른 추론 속도)

In [None]:
pip install vllm

In [6]:
from vllm import LLM, SamplingParams
import torch
import pandas as pd
from tqdm import tqdm
import os
# Set in the process environment before the vLLM engine is constructed below.
# NOTE(review): presumably allows a max_model_len above the model config's
# own limit — confirm against the vLLM environment-variable documentation.
os.environ['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = '1'

In [None]:
# Build the vLLM engine for the same model id used in the plain-inference section.
llm = LLM(
    model="cpm-ai/Gemma2-Malpyung-DailyConversationSummary-NA", 
    max_model_len=4096,
    # gpu_memory_utilization=0.3, # GPU memory usage
    enforce_eager=True,
)

In [None]:
# Load the exam sheet: one prompt per row in the 'instruction' column.
df = pd.read_csv('./data/일상대화요약_시험지.csv')
# temperature=0 -> greedy decoding; each answer is capped at 1024 new tokens.
sampling_params = SamplingParams(temperature=0, max_tokens=1024)

# Render every prompt up front in the Gemma chat format. The template must end
# right after "<start_of_turn>model\n"; the original had a stray "'" there,
# which was fed to the model as part of the prompt.
# NOTE(review): the literal <bos> is kept from the original; vLLM may also
# prepend a BOS token when tokenizing text prompts — confirm it is not doubled.
prompts = [
    f"<bos><start_of_turn>user\n{text}<end_of_turn>\n<start_of_turn>model\n"
    for text in df['instruction']
]

# Generate answers for all prompts in one call so vLLM can batch them;
# results come back in the same order as the prompts.
outputs = llm.generate(prompts, sampling_params)

output = []
for index, request_output in zip(df.index, outputs):
    print(f"처리 중인 행: {index}")

    generated_text = request_output.outputs[0].text
    print('outputs확인')
    print(generated_text)

    # Extract the generated text, trimmed of surrounding whitespace
    output.append(generated_text.strip())

df['output'] = output

# Save the answer sheet
df.to_csv('./data/일상대화요약_답지.csv', index=False, encoding='utf-8-sig')

### 제출용 답지 생성 (csv to json)

In [None]:
import pandas as pd
import json

In [None]:
# Path to the answer sheet
csv_file = './data/일상대화요약_답지.csv'
# Read answers as plain text and keep empty cells as "" instead of NaN:
# a NaN would be serialized by json.dump as the bare token NaN, which is not
# valid JSON.
df = pd.read_csv(csv_file, dtype={'output': str}, keep_default_na=False)

# Read the original test JSON
json_file = './data/일상대화요약_test.json'
with open(json_file, 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# Still best-effort on a length mismatch, but no longer silent about it.
if len(df) != len(json_data):
    print(f"경고: CSV 행 수({len(df)})와 JSON 항목 수({len(json_data)})가 다릅니다.")

# Insert the CSV 'output' column into the JSON items, matched by position;
# zip stops at the shorter of the two sequences.
for item, answer in zip(json_data, df['output']):
    item['output'] = answer

# Write the updated JSON data to the submission file
output_json_file = './data/result.json'  # path of the output JSON file
with open(output_json_file, 'w', encoding='utf-8') as f:
    json.dump(json_data, f, ensure_ascii=False, indent=4)

print("작업이 완료되었습니다.")