In [1]:
# !pip install torch==2.1.0
# !pip install transformers==4.34.1
# !pip install bitsandbytes
# !pip install peft==0.6.0
# !pip install datasets
# !pip install evaluate
# !pip install accelerate
# !pip install sentencepiece
# !pip install einops
# !pip install scikit-learn
# !pip install ipdb

In [2]:
from collections import defaultdict
import copy
import json
import os
from os.path import exists, join, isdir
from dataclasses import dataclass, field
import sys
from typing import Optional, Dict, Sequence
import numpy as np
from tqdm import tqdm
import logging
import bitsandbytes as bnb
import pandas as pd
import importlib

import torch
import transformers
from torch.nn.utils.rnn import pad_sequence
import argparse
from transformers import (
    set_seed,
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    Seq2SeqTrainer,
    BitsAndBytesConfig,
    LlamaTokenizer

)
from datasets import load_dataset, Dataset
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    PeftModel
)
from peft.tuners.lora import LoraLayer
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

from accelerate import notebook_launcher
from accelerate import Accelerator
from torch.utils.data import DataLoader
from data_process import get_prompt, prepare_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Global variables
# FROM_COLAB: when True, a later cell mounts Google Drive and repoints ROOT_PATH.
FROM_COLAB = False
# DEBUG: when True, a later cell imports ipdb.
DEBUG = False
# ROOT_PATH: base directory; replaced by a Drive folder when FROM_COLAB is set.
ROOT_PATH = './'
# str_args: CLI-style argument list handed to the argument parsers.
# Left as None here; a later cell overrides it for notebook runs.
str_args = None

In [4]:
# Optional environment setup, driven by the flags in the global-variables cell.
if FROM_COLAB:
    # google.colab only exists inside Colab, hence the conditional import.
    from google.colab import drive
    drive.mount('/content/drive')
    # NOTE(review): this path is relative while the mount point above is
    # absolute; presumably it relies on the working directory being /content
    # -- confirm.
    ROOT_PATH = 'drive/MyDrive/Colab Notebooks/ADL/HW3/'
if DEBUG:
    # ipdb is only needed for interactive debugging sessions.
    import ipdb

In [5]:
# Comment out when using .py file
# Notebook stand-in for the command line: flat list of flags and values that
# is passed to the argument parsers further down.
str_args = [
    "--test_file", "eval_data/-5lKAOBHZgs.json",
    # NOTE(review): machine-specific absolute path; adjust before running elsewhere.
    "--model_name_or_path", "/home/vr/disk/YuSean/ADL/HW3/Taiwan-LLM-7B-v2.0-chat",
    "--peft_model", "output/checkpoint-100",
    "--test_size", "50",
    "--batch_size", "4",
    "--output_path", "./prediction.json",
    # The remaining flags are generation options (handled by parse_generation_args).
    "--do_sample",
    "--num_beams", "1",
    "--top_k", "50",
]

In [6]:
@dataclass
class ModelArguments:
    """Paths and run-level options for the inference script.

    The first three fields have no default and must be supplied.
    """
    # Required inputs.
    test_file: str
    model_name_or_path: str
    peft_model: str
    # Optional settings.
    seed: int = 42
    num_video_per_channel: int = 1
    test_size: Optional[int] = None
    output_path: str = './prediction.json'
@dataclass
class TrainingArguments:
    """Batching and sequence-length options; every field has a default."""
    batch_size: int = 2
    source_max_len: int = 1024
    target_max_len: int = 256
    
@dataclass
class GenerationArguments:
    """Decoding options with the same names and defaults as the argparse
    flags registered in ``parse_generation_args``.

    The field definitions must not end with a comma: a trailing comma turns
    the default into a one-element tuple wrapping the ``Field`` object, so the
    attribute no longer holds the intended value.
    """
    max_new_tokens: int = field(default=256)
    # Optional because the default is None (no minimum length).
    min_new_tokens: Optional[int] = field(default=None)
    do_sample: bool = field(default=False)
    num_beams: Optional[int] = field(default=1)
    num_beam_groups: Optional[int] = field(default=1)
    temperature: Optional[float] = field(default=None)
    top_k: Optional[int] = field(default=None)
    top_p: Optional[float] = field(default=None)
# Parser
def parse_generation_args(str_args = None):
    '''
    Parse the generation flags with a plain ``argparse`` parser.

    The original author noted that routing these options through a dataclass
    failed with "TypeError: cannot pickle 'mappingproxy' object", which is why
    a standard parser is used here.

    Args:
        str_args: list of CLI-style strings; passed straight to
            ``ArgumentParser.parse_args``.

    Returns:
        ``argparse.Namespace`` with max_new_tokens, min_new_tokens, do_sample,
        num_beams, num_beam_groups, temperature, top_k and top_p.
    '''
    # One (flag, add_argument keyword arguments) entry per generation option,
    # listed in registration order.
    option_specs = (
        ("--max_new_tokens", dict(type=int, default=256)),
        ("--min_new_tokens", dict(type=int, default=None)),
        ("--do_sample", dict(action='store_true')),
        ("--num_beams", dict(type=int, default=1)),
        ("--num_beam_groups", dict(type=int, default=1)),
        ("--temperature", dict(type=float, default=None)),
        ("--top_k", dict(type=int, default=None)),
        ("--top_p", dict(type=float, default=None)),
    )

    generation_parser = argparse.ArgumentParser()
    for flag, add_argument_kwargs in option_specs:
        generation_parser.add_argument(flag, **add_argument_kwargs)

    return generation_parser.parse_args(str_args)

In [7]:
@dataclass
class DataCollatorForCausalLM(object):
    """Turn prompt-only examples into a padded batch for ``generate``.

    Each example must provide an ``'input'`` string. The tokenizer's BOS token
    is prepended by hand (``add_special_tokens=False`` is used when
    tokenizing), prompts are truncated to ``source_max_len`` tokens and then
    padded to the longest prompt in the batch.

    Attributes:
        tokenizer: tokenizer providing ``bos_token``, ``pad_token_id`` and
            ``__call__``.
        source_max_len: maximum number of prompt tokens kept per example.
        padding_side: ``"left"`` (default) or ``"right"``. Left padding keeps
            the last prompt token adjacent to the first generated token, which
            is what batched generation with a decoder-only model needs.
    """
    # Quoted so the annotation is documentation only and is not evaluated
    # while the class body runs.
    tokenizer: "transformers.PreTrainedTokenizer"
    source_max_len: int
    padding_side: str = "left"

    def __post_init__(self):
        if self.padding_side not in ("left", "right"):
            raise ValueError(
                f"padding_side must be 'left' or 'right', got {self.padding_side!r}"
            )

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Collate ``instances`` into ``input_ids`` and ``attention_mask``.

        Returns:
            dict with ``'input_ids'`` (int64, ``batch x longest_prompt``) and
            ``'attention_mask'`` (bool, same shape; True on prompt tokens).

        Raises:
            ValueError: if the tokenizer has no ``pad_token_id``.
        """
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            raise ValueError("tokenizer.pad_token_id must be set to pad a batch")

        # Extract elements
        sources = [f"{self.tokenizer.bos_token}{example['input']}" for example in instances]
        # Tokenize
        tokenized_sources_with_prompt = self.tokenizer(
            sources,
            max_length=self.source_max_len,
            truncation=True,
            add_special_tokens=False,
        )
        sequences = tokenized_sources_with_prompt['input_ids']

        batch_size = len(sequences)
        width = max((len(ids) for ids in sequences), default=0)
        input_ids = torch.full((batch_size, width), pad_id, dtype=torch.long)
        # The mask is built from sequence lengths rather than from
        # `input_ids != pad_id`, so a real token that happens to share the pad
        # id (e.g. when the pad token aliases another token) stays visible.
        attention_mask = torch.zeros((batch_size, width), dtype=torch.bool)
        for row, ids in enumerate(sequences):
            length = len(ids)
            if length == 0:
                continue
            start = width - length if self.padding_side == "left" else 0
            input_ids[row, start:start + length] = torch.tensor(ids, dtype=torch.long)
            attention_mask[row, start:start + length] = True

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
        }

In [8]:
def get_bnb_config() -> BitsAndBytesConfig:
    '''Build the quantization config used when loading the base model.

    Requests 4-bit loading with the "nf4" quant type, float16 compute dtype
    and double quantization switched off. The two ``llm_int8_*`` values are
    passed through unchanged.
    '''
    quantization_options = dict(
        load_in_4bit=True,
        llm_int8_threshold=6.0,
        llm_int8_has_fp16_weight=False,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False,
        bnb_4bit_quant_type="nf4",
    )
    return BitsAndBytesConfig(**quantization_options)


In [9]:
def main(str_args = None):
    """Placeholder entry point; the body is empty and the notebook runs its
    steps at cell level instead."""
    pass

# Stage 1: HfArgumentParser fills ModelArguments and TrainingArguments from
# str_args. With return_remaining_strings=True the strings it does not
# recognise are returned as the last element instead of raising.
hfparser = transformers.HfArgumentParser((
    ModelArguments, TrainingArguments
))
model_args, training_args, extra_args = \
    hfparser.parse_args_into_dataclasses(str_args,return_remaining_strings=True)
# Stage 2: the leftover strings are parsed as generation options.
generation_args = parse_generation_args(extra_args)
# Merge the three groups into one flat namespace so later cells read
# everything from `args`. A field name shared by two groups would raise a
# duplicate-keyword TypeError here.
args = argparse.Namespace(
    **vars(model_args), **vars(training_args), **vars(generation_args)
)

In [10]:
# Prepare
logger = logging.getLogger(__name__)
compute_dtype = torch.float16
if args.seed is not None:
    set_seed(args.seed)
if args.output_path is not None:
    # os.path.dirname keeps the leading "/" of absolute paths and returns ''
    # for a bare filename; splitting on "/" and re-joining lost the root and
    # raised TypeError when there was no directory component. Fall back to
    # '.' so a bare filename writes into the current directory.
    output_dir = os.path.dirname(args.output_path) or '.'
    os.makedirs(output_dir, exist_ok=True)


In [11]:
# Load dataset
print('Load Dataset')
def format_dataset(dataset):
    """Add the prompt column used for generation.

    Every example gets an ``'input'`` prompt built by ``get_prompt`` from its
    video title, video description, star count and mood. The reference comment
    is mapped to ``'output'`` and then dropped again, so the returned dataset
    carries the original columns plus ``'input'``.

    Args:
        dataset: dataset whose examples provide ``video_title``,
            ``video_description``, ``star_num``, ``mood`` and ``comment_text``.

    Returns:
        The mapped dataset without an ``'output'`` column.
    """
    def processing(example):
        return {'input': get_prompt(example['video_title'], example['video_description'], example['star_num'], example['mood']),
                'output': example['comment_text']}
    formatted_dataset = dataset.map(processing)
    # Remove unused columns. The names are looked up on the mapped dataset:
    # 'output' is created by map(), so filtering the pre-map column names
    # would leave it in place.
    formatted_dataset = formatted_dataset.remove_columns(
        [col for col in formatted_dataset.column_names if col in ['output']]
    )
    return formatted_dataset

raw_dataset = prepare_dataset(args.test_file, num_video_per_channel=None, select=False, seed=args.seed)
# Optionally cap the evaluation set at the first `test_size` examples.
if args.test_size is not None and raw_dataset.shape[0] > args.test_size:
    raw_dataset = raw_dataset.select(range(args.test_size))
test_dataset = format_dataset(raw_dataset)

Load Dataset


Map: 100%|██████████| 50/50 [00:00<00:00, 5661.09 examples/s]


In [12]:
# Load Model
print('Load Model')
bnb_config = get_bnb_config()
# 4-bit loading is requested through quantization_config alone. Passing
# load_in_4bit=True alongside it is redundant, and newer transformers releases
# reject the combination.
base_model = AutoModelForCausalLM.from_pretrained(
    args.model_name_or_path,
    quantization_config=bnb_config,
    torch_dtype=compute_dtype,
    device_map='cuda:0'
)
base_model.config.torch_dtype = compute_dtype
# Load PeftModel
print("Loading adapters.")
# Wrap the quantized base model with the adapter weights stored at args.peft_model.
model = PeftModel.from_pretrained(base_model, args.peft_model)

Load Model


Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.68s/it]


Loading adapters.


In [13]:
# Load Tokenizer
print('Load Tokenizer')
# Slow (non-fast) tokenizer loaded from the same path as the base model.
# padding_side only matters for padding done by the tokenizer itself;
# DataCollatorForCausalLM pads its batches on its own.
tokenizer = AutoTokenizer.from_pretrained(
    args.model_name_or_path,
    padding_side="right",
    use_fast=False,
    tokenizer_type='llama'
)
# Align the tokenizer's special tokens with the ids stored in the model config.
# unk_token is set to whatever token the current pad_token_id maps to.
# NOTE(review): assumes the tokenizer already defines a pad token -- confirm.
# The call is the cell's last expression, so its return value is displayed as
# the cell output.
tokenizer.add_special_tokens({
    "eos_token": tokenizer.convert_ids_to_tokens(base_model.config.eos_token_id),
    "bos_token": tokenizer.convert_ids_to_tokens(base_model.config.bos_token_id),
    "unk_token": tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id),
})

Load Tokenizer


0

In [14]:
# Data Collator
# Builds padded prompt batches; prompts are truncated to args.source_max_len tokens.
data_collator = DataCollatorForCausalLM(
    tokenizer=tokenizer,
    source_max_len=args.source_max_len
)
# Generation Config
# Every decoding option comes from the parsed generation arguments; options
# that were not given on the command line are forwarded with their parser
# defaults (None for min_new_tokens, top_k, top_p and temperature).
gen_config = transformers.GenerationConfig(
    max_new_tokens = args.max_new_tokens,
    min_new_tokens = args.min_new_tokens,
    do_sample = args.do_sample,
    num_beams = args.num_beams,
    num_beam_groups = args.num_beam_groups,
    top_k = args.top_k,
    top_p = args.top_p,
    temperature = args.temperature,
    )


In [15]:
# Batched generation over the test set; all_predictions ends up with one
# decoded string per example, in dataset order.
test_dataloader = DataLoader(test_dataset, collate_fn=data_collator, batch_size=args.batch_size)
model.eval()
all_predictions = []
# Iterating through tqdm advances and closes the bar automatically.
progress = tqdm(test_dataloader, total=len(test_dataloader))
for batch in progress:
    # Put the inputs on the model's device explicitly instead of relying on
    # implicit device handling inside the loaded model.
    input_ids = batch["input_ids"].to(model.device)
    attention_mask = batch["attention_mask"].to(model.device)
    with torch.no_grad():
        predictions = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            generation_config=gen_config,
        )
    # generate() returns the prompt followed by the new tokens. Keep only the
    # continuation so the decoded text does not have to be cleaned up later by
    # string-matching the prompt (which breaks when the prompt was truncated
    # or does not round-trip exactly through decoding).
    predictions = predictions[:, input_ids.shape[1]:].cpu().numpy()
    # Map any -100 placeholder to the pad id so decoding never sees an
    # invalid token id.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    all_predictions += tokenizer.batch_decode(
        predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )


100%|██████████| 13/13 [02:01<00:00, 10.17s/it]

In [16]:
# Pair every test example with its prediction and dump the result as JSON.
if len(all_predictions) != len(test_dataset):
    raise ValueError(
        f"Got {len(all_predictions)} predictions for {len(test_dataset)} examples"
    )

# Build the records before opening the file so a failure here does not leave
# a truncated output file behind.
outputs = []
for example, prediction in zip(test_dataset, all_predictions):
    outputs.append({
        'cid': example['cid'],
        'mood': example['mood'],
        # Drop an echoed prompt, if present, and surrounding whitespace.
        'output': prediction.replace(example['input'], '').strip(),
    })

# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# fixed to UTF-8 instead of depending on the platform default.
with open(args.output_path, 'w', encoding='utf-8') as fout:
    json.dump(outputs, fout, indent=4, ensure_ascii=False)