# -*- coding: utf-8 -*-
"""Optimized Korean Review Decoding Script
   Adjusted for Training Time Reduction on Colab with T4 GPU.
"""

In [1]:
# Install dependencies (if needed in Colab).
# These are notebook shell magics (`!`), so they only run inside Jupyter/Colab.
# NOTE(review): `-U` is the short form of `--upgrade`; passing both is redundant.
!pip install msal bitsandbytes msal_extensions
!pip install -U --upgrade accelerate transformers



In [2]:
import gc
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from io import StringIO
import pandas as pd
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from msal import PublicClientApplication
from msal_extensions import FilePersistence, PersistedTokenCache
import requests
from accelerate import infer_auto_device_map
from transformers import logging
import os
from accelerate import infer_auto_device_map
from transformers import logging

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Dataset Loader Class
class ReviewDataset(Dataset):
    """Map-style dataset exposing ``(input, output)`` pairs from a DataFrame.

    The wrapped frame must provide ``input`` and ``output`` columns. Rows are
    addressed by position (``iloc``), not by index label.
    """

    def __init__(self, dataframe):
        """Keep a reference to ``dataframe``; no copy is made."""
        self.data = dataframe

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(input, output)`` values of the row at position ``idx``."""
        record = self.data.iloc[idx]
        return record['input'], record['output']

In [4]:
# Optimize text processing
def remove_repeated_phrases(text: str) -> str:
    """Strip the ``model\\n`` role marker and collapse back-to-back repeated phrases.

    A "phrase" is a run of one or more whitespace-delimited tokens. When the
    same phrase occurs two or more times in a row (separated only by
    whitespace), only its first occurrence is kept. Repetition inside a single
    token (e.g. doubled characters) and non-adjacent repetition are left
    untouched, so ordinary text keeps its characters and spacing.

    Args:
        text: Text to clean up.

    Returns:
        The text without ``"model\\n"`` markers and without consecutive
        duplicate phrases.
    """
    # Local import: ``re`` is not among the names imported at the top of the file.
    import re

    # Remove the role marker first, while it is still a contiguous substring.
    cleaned = text.replace("model\n", "")

    # (?<!\S) / (?!\S) anchor the match on token boundaries so a phrase is
    # never matched against a mere prefix or suffix of a longer token.
    pattern = r"(?<!\S)(\S+(?:\s+\S+)*?)(?:\s+\1)+(?!\S)"

    # One pass can expose a new repetition (e.g. "a a b a a b" -> "a b a b"),
    # so repeat until the text stops changing. Every substitution shortens the
    # string, which guarantees termination.
    while True:
        collapsed = re.sub(pattern, r"\1", cleaned)
        if collapsed == cleaned:
            return collapsed
        cleaned = collapsed

In [5]:
def get_ms_token(client_id, authority, scopes):
    """Acquire a Microsoft identity access token and return it as request headers.

    A token cached on disk is tried first. If there is no cached account, or
    the silent acquisition does not yield a token, the interactive device-code
    flow is started instead.

    Args:
        client_id: Client (application) ID passed to ``PublicClientApplication``.
        authority: Authority URL passed to ``PublicClientApplication``.
        scopes: List of permission scopes to request.

    Returns:
        ``{'Authorization': 'Bearer <token>'}`` on success, ``None`` when no
        access token could be obtained.

    Raises:
        ValueError: If the device flow could not be initiated.
    """
    # Location of the on-disk token cache.
    cache_file_path = os.path.expanduser('~/.msal_cache.json')

    # File-backed token cache provided by MSAL Extensions.
    persistence = FilePersistence(cache_file_path)
    token_cache = PersistedTokenCache(persistence)

    # MSAL public client application bound to that cache.
    app = PublicClientApplication(client_id, authority=authority, token_cache=token_cache)

    result = None

    # Look for an account that is already present in the cache.
    accounts = app.get_accounts()
    if accounts:
        # Use the first account (pick more carefully if several may exist).
        result = app.acquire_token_silent(scopes, account=accounts[0])
        # acquire_token_silent may return None, so guard before the key test.
        if result and 'access_token' in result:
            print('캐시된 토큰을 사용합니다.')
        else:
            print('캐시에서 유효한 토큰을 찾을 수 없습니다. 인증을 진행합니다...')
            result = None  # force the interactive fallback below

    if result is None:
        # New interactive authentication through the device-code flow.
        flow = app.initiate_device_flow(scopes=scopes)
        if 'user_code' not in flow:
            raise ValueError('디바이스 플로우 생성에 실패했습니다. 설정을 확인하세요.')
        print(f"다음 URL로 이동하여 코드를 입력하세요: {flow['verification_uri']}")
        print(f"인증 코드: {flow['user_code']}")
        result = app.acquire_token_by_device_flow(flow)

    if result and 'access_token' in result:
        print('인증에 성공했습니다!')
        headers = {'Authorization': f"Bearer {result['access_token']}"}
        return headers
    else:
        print('인증에 실패했습니다.')
        return None

In [6]:
def get_csv_data(graph_api_url, headers,
                 dataset_folder_id='01UUMNEVON2CIOT46PFZHKLFI3QAGBYRUR'):
    """Download the competition CSV files from a OneDrive folder via Microsoft Graph.

    Args:
        graph_api_url: Base URL of the Graph API (e.g. ``.../v1.0``).
        headers: HTTP headers carrying the bearer token.
        dataset_folder_id: Drive item ID of the folder holding the CSV files.
            Defaults to the dataset folder this notebook was written for.

    Returns:
        A tuple ``(train, test, sample_submission)`` of ``requests.Response``
        objects. An entry is ``None`` when the file is missing from the folder
        or its download did not return HTTP 200; a message is printed in
        either case.
    """
    # List the children of the dataset folder.
    dataset_folder_url = f"{graph_api_url}/me/drive/items/{dataset_folder_id}/children"
    listing = requests.get(dataset_folder_url, headers=headers)
    if listing.status_code == 200:
        files = listing.json().get('value', [])
    else:
        # Report the real cause instead of letting an error payload look like
        # an empty folder ("... not found" for every file).
        print(f"Failed to list dataset folder: {listing.status_code}")
        files = []
    # NOTE(review): only the first page of the listing is read — confirm the
    # folder never holds enough items to be paginated.
    file_ids = {entry['name']: entry['id'] for entry in files}

    # Names of the files to download.
    target_files = ['train.csv', 'test.csv', 'sample_submission.csv']
    responses = {}

    for file_name in target_files:
        file_id = file_ids.get(file_name)
        if not file_id:
            print(f"{file_name} not found in the specified folder.")
            continue
        file_url = f"{graph_api_url}/me/drive/items/{file_id}/content"
        file_response = requests.get(file_url, headers=headers)
        if file_response.status_code == 200:
            responses[file_name] = file_response
        else:
            print(f"Failed to download {file_name}: {file_response.status_code}")

    return (responses.get('train.csv'),
            responses.get('test.csv'),
            responses.get('sample_submission.csv'))

In [7]:
# Load and preprocess data
def load_data(train_response, test_response, submission_response):
    """Parse three downloaded CSV responses into DataFrames.

    Each argument must expose the CSV body as a ``.text`` string.

    Returns:
        A tuple ``(train, test, submission)`` of ``pandas.DataFrame`` objects.
    """
    # Wrap every body in an in-memory text buffer before any parsing starts.
    buffers = [
        StringIO(response.text)
        for response in (train_response, test_response, submission_response)
    ]
    train, test, submission = (pd.read_csv(buffer) for buffer in buffers)
    return train, test, submission

In [8]:
# Model and tokenizer initialization
def initialize_model(model_name='mindw96/Gemma-2-2B-it-DACON-LLM', device="cuda"):
    """Load the causal language model and its tokenizer.

    The model is first loaded in float16 with ``device_map="auto"``. If that
    raises ``ValueError``, loading is retried with the whole model mapped to
    ``device``.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        device: Device used as ``device_map`` for the retry.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer's pad token is set to
        its EOS token and its padding side to ``'right'``.
    """
    logging.set_verbosity_info()  # Enables detailed logging

    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",  # let accelerate decide the placement
        )
        print("Model loaded successfully on GPU")
    except ValueError as e:
        print(f"Error: {e}")
        # Honour the caller's ``device`` instead of a hard-coded "cuda".
        print(f"Retrying with device_map='{device}'...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map=device,
        )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'right'
    return model, tokenizer


In [9]:
# Function to decode reviews
def decode_reviews(model, tokenizer, test, samples, device="cuda"):
    """Restore every obfuscated review in ``test`` with few-shot prompting.

    Args:
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer exposing ``apply_chat_template`` and ``decode``.
        test: DataFrame with an ``input`` column holding the obfuscated reviews.
        samples: DataFrame with ``input`` and ``output`` columns; its first
            five rows are embedded in the prompt as worked examples.
        device: Device the tokenized prompt is moved to before generation.

    Returns:
        A list with one restored review string per row of ``test``, in order.

    Raises:
        IndexError: If ``samples`` has fewer than five rows.
    """
    restored_reviews = []
    # Build the few-shot examples from the ``samples`` argument rather than
    # from a module-level ``train`` variable, so the function is self-contained.
    examples = [
        {"input": samples['input'].iloc[i], "output": samples['output'].iloc[i]}
        for i in range(5)
    ]
    # The prompt text (including the whitespace produced by the line
    # continuations) is kept exactly as it was.
    system_prompt = f"You are a helpful assistant specializing in restoring obfuscated Korean reviews. \
					Your task is to transform the given obfuscated Korean review into a clear, correct,\
					and natural-sounding Korean review that reflects its original meaning.\
					Below are examples of obfuscated Korean reviews and their restored forms:\n\n \
					Example, {examples[0]} \n {examples[1]} \n {examples[2]} \n {examples[3]} \n {examples[4]} \
					Spacing and word length in the output must be restored to the same as in the input.\
					Do not provide any description. Print only in Korean."
    for _, row in tqdm(test.iterrows(), total=len(test)):
        query = row['input']
        messages = [
            {"role": "user", "content": '{}\ninput: {}, output:'.format(system_prompt, query)}
        ]
        # add_generation_prompt=True appends the template's assistant-turn
        # header so generation starts directly with the answer.
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to(device)
        prompt_length = inputs['input_ids'].shape[-1]

        outputs = model.generate(
            **inputs,
            # Budget derived from the query length; at least one token so an
            # empty query does not request zero new tokens.
            max_new_tokens=max(1, len(query)),
            do_sample=False
        )

        # Decode only the newly generated token ids. Slicing the decoded text
        # by the prompt's character length misaligns because the chat template
        # adds role markers around the message content.
        generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        result = generated_text.strip()
        restored_reviews.append(remove_repeated_phrases(result))

    return restored_reviews

In [10]:
# Save results to CSV
def save_results(restored_reviews, submission_df, output_path='/result/submission1.csv'):
    """Write the restored reviews into a copy of the submission frame and save it.

    Args:
        restored_reviews: One restored review per row of ``submission_df``.
        submission_df: Submission template; it is copied, never modified.
        output_path: Destination CSV path. The parent directory is created
            when it does not exist yet.
    """
    submission = submission_df.copy()  # leave the caller's DataFrame untouched
    submission['output'] = restored_reviews

    # to_csv does not create missing directories, so do it up front.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # utf-8-sig writes a BOM at the start of the file.
    submission.to_csv(output_path, index=False, encoding='utf-8-sig')

In [11]:
  # Azure app registration settings used for Microsoft Graph access.
  GRAPH_API_URL = 'https://graph.microsoft.com/v1.0'
  CLIENT_ID = 'ef053b61-d7f1-4942-97d4-bb79fa475a01'  # client ID taken from the app registration
  AUTHORITY = 'https://login.microsoftonline.com/f09a4ef3-978d-434e-89da-a29b9f9f3c32'  # tenant ID, or 'common'
  SCOPES = ['Files.ReadWrite.All']  # permissions to request

In [12]:
# Authenticate first so a failed login is reported here, instead of sending
# requests without credentials and failing later while parsing the CSVs.
headers = get_ms_token(CLIENT_ID, AUTHORITY, SCOPES)
if headers is None:
    raise RuntimeError("Microsoft Graph authentication failed; cannot download the dataset.")
train, test, submission = load_data(*get_csv_data(GRAPH_API_URL, headers))

캐시된 토큰을 사용합니다.
인증에 성공했습니다!


In [13]:
# Load the model and tokenizer with the default model name and device.
model, tokenizer = initialize_model()

loading configuration file config.json from cache at C:\Users\didie\.cache\huggingface\hub\models--mindw96--Gemma-2-2B-it-DACON-LLM\snapshots\1f67bdc6954d5ebe105e1dbaada905fc230c6690\config.json
Model config Gemma2Config {
  "_name_or_path": "mindw96/Gemma-2-2B-it-DACON-LLM",
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": [
    1,
    107
  ],
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 2304,
  "initializer_range": 0.02,
  "intermediate_size": 9216,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 8,
  "num_hidden_layers": 26,
  "num_key_value_heads": 4,
  "pad_token_id": 0,
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 4096

Model loaded successfully on GPU


loading file tokenizer.model from cache at None
loading file tokenizer.json from cache at C:\Users\didie\.cache\huggingface\hub\models--mindw96--Gemma-2-2B-it-DACON-LLM\snapshots\1f67bdc6954d5ebe105e1dbaada905fc230c6690\tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at C:\Users\didie\.cache\huggingface\hub\models--mindw96--Gemma-2-2B-it-DACON-LLM\snapshots\1f67bdc6954d5ebe105e1dbaada905fc230c6690\special_tokens_map.json
loading file tokenizer_config.json from cache at C:\Users\didie\.cache\huggingface\hub\models--mindw96--Gemma-2-2B-it-DACON-LLM\snapshots\1f67bdc6954d5ebe105e1dbaada905fc230c6690\tokenizer_config.json
loading file chat_template.jinja from cache at None


In [None]:
# Restore every review in ``test``; ``train`` is passed as the ``samples`` argument.
restored_reviews = decode_reviews(model, tokenizer, test, train)

  0%|          | 0/1689 [00:00<?, ?it/s]

In [29]:
# Write the restored reviews into the submission template and save it as CSV.
save_results(restored_reviews, submission)