In [1]:
from collections import defaultdict
import copy
import json
import os
import gc
import warnings
import threading
import time
import datetime
from os.path import exists, join, isdir
from dataclasses import dataclass, field
import sys
from typing import Optional, Dict, Sequence, Tuple, Union
import numpy as np
from tqdm import tqdm
import logging
import pandas as pd
import importlib
from packaging import version
from packaging.version import parse
import argparse

import torch
import transformers
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.nn.functional as F

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    set_seed,
    Seq2SeqTrainer,
    BitsAndBytesConfig,
    LlamaTokenizer
)
from transformers.activations import ACT2FN

import bitsandbytes as bnb
from datasets import load_dataset, load_from_disk, Dataset
import evaluate

from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    PeftModel
)
from peft.tuners.lora import LoraLayer
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

import deepspeed

# Limit CUDA device visibility for this process to GPUs 0-3.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

# --- param ---

# Alternative checkpoints (disabled):
# llama2chat = "/hpc2hdd/home/lzhang330/ssd_workspace/models/llama-2-7b-chat-hf"
# llama2 = "/hpc2hdd/home/lzhang330/ssd_workspace/models/Llama-2-7b-hf"
llama = "/mnt/sdb/zhanglongteng/data2/share/llama-1/llama-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(
    llama,
    padding_side="right",
    use_fast=False,  # Fast tokenizer giving issues.
    tokenizer_type='llama',  # Needed for HF name change
)

# Token id of each answer letter. The tokenizer prepends one special token, so
# the letter itself is the second entry (index 1) of `input_ids`.
abcd_idx = [tokenizer(letter).input_ids[1] for letter in ("A", "B", "C", "D")]

print(abcd_idx)


  from .autonotebook import tqdm as notebook_tqdm


[2024-04-12 15:26:40,149] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


[319, 350, 315, 360]


In [None]:

# Label value that marks positions excluded from training: the collator below
# writes it over source tokens (when `train_on_source` is off) and uses it as
# the padding value for the `labels` tensor.
IGNORE_INDEX = -100
# Pad-token string. Not referenced in the visible part of this file;
# presumably added to tokenizers that have no pad token -- TODO confirm.
DEFAULT_PAD_TOKEN = "[PAD]"

@dataclass
class DataCollatorForCausalLM(object):
    """Turn a list of ``{'input', 'output'}`` examples into a padded causal-LM batch.

    Attributes:
        tokenizer: Tokenizer used to encode the text; its ``bos_token``,
            ``eos_token`` and ``pad_token_id`` are read.
        source_max_len: ``max_length`` used when truncating the source text.
        target_max_len: ``max_length`` used when truncating the target text.
        train_on_source: When False, the source positions of ``labels`` are
            set to ``IGNORE_INDEX``; when True they carry the source token ids.
        predict_with_generate: When True, only the source tokens are returned
            and no ``labels`` entry is produced.
        hard_padding: When True, source and target are each padded to their
            own ``max_length`` by the tokenizer before being concatenated.
    """
    tokenizer: transformers.PreTrainedTokenizer
    source_max_len: int
    target_max_len: int
    train_on_source: bool
    predict_with_generate: bool
    hard_padding: bool

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Build ``input_ids``, ``attention_mask`` and (unless predicting) ``labels``.

        Args:
            instances: Examples, each providing an ``'input'`` and an
                ``'output'`` entry.

        Returns:
            A dict with ``input_ids`` (padded with the tokenizer's pad id),
            ``attention_mask`` (True wherever ``input_ids`` differs from the pad
            id) and, when ``predict_with_generate`` is False, ``labels``
            (padded with ``IGNORE_INDEX``).
        """
        # BOS/EOS are spliced into the text, so the tokenizer is told not to
        # add special tokens on its own.
        bos, eos = self.tokenizer.bos_token, self.tokenizer.eos_token
        sources = [f"{bos}{example['input']}" for example in instances]
        targets = [f"{example['output']}{eos}" for example in instances]

        shared_kwargs = dict(
            padding='max_length' if self.hard_padding else False,
            truncation=True,
            add_special_tokens=False,
        )
        source_ids = self.tokenizer(sources, max_length=self.source_max_len, **shared_kwargs)['input_ids']
        target_ids = self.tokenizer(targets, max_length=self.target_max_len, **shared_kwargs)['input_ids']

        # Assemble per-example token rows (and label rows when training).
        # NOTE(review): with hard_padding, any pad ids the tokenizer added to a
        # target are carried into the label row unchanged -- confirm intended.
        input_rows = []
        label_rows = []
        for src, tgt in zip(source_ids, target_ids):
            if self.predict_with_generate:
                input_rows.append(torch.tensor(src))
                continue
            input_rows.append(torch.tensor(src + tgt))
            masked_or_plain_source = src if self.train_on_source else [IGNORE_INDEX] * len(src)
            label_rows.append(torch.tensor(masked_or_plain_source + tgt))

        pad_id = self.tokenizer.pad_token_id
        input_ids = pad_sequence(input_rows, batch_first=True, padding_value=pad_id)
        if self.predict_with_generate:
            labels = None
        else:
            labels = pad_sequence(label_rows, batch_first=True, padding_value=IGNORE_INDEX)

        batch = {
            'input_ids': input_ids,
            'attention_mask': input_ids.ne(pad_id),
        }
        if labels is not None:
            batch['labels'] = labels
        return batch

def extract_super_natural_instructions_data(examples, extract_reformulations=False):
    """Flatten ``examples`` into parallel ``'input'`` / ``'output'`` lists.

    Args:
        examples: Iterable whose items expose ``'input'`` and ``'output'``.
            When ``extract_reformulations`` is set it is also indexed with the
            key ``'reformulations'``.
            NOTE(review): the first loop iterates ``examples`` directly while
            the reformulation branch indexes it by key -- confirm which
            container type callers actually pass.
        extract_reformulations: Also append every reformulation, taking its
            ``'instruction_with_input'`` as the input text.

    Returns:
        ``{'input': [...], 'output': [...]}``.
    """
    inputs, outputs = [], []
    out = {'input': inputs, 'output': outputs}
    print_rank_0(examples)

    for instance in examples:
        inputs.append(instance['input'])
        outputs.append(instance['output'])

    if extract_reformulations:
        for group in examples['reformulations']:
            if group is None:
                # Examples without reformulations contribute nothing extra.
                continue
            for variant in group:
                inputs.append(variant['instruction_with_input'])
                outputs.append(variant['output'])

    print_rank_0(out)
    return out

# Alpaca prompt templates: one for records that carry an extra "input" field,
# one for records that only have an instruction.
ALPACA_PROMPT_DICT = dict(
    prompt_input=(
        "Below is an instruction that describes a task, "
        "paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Input:\n{input}\n\n"
        "### Response: "
    ),
    prompt_no_input=(
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Response: "
    ),
)


def extract_alpaca_dataset(example):
    """Render one Alpaca record into its full prompt.

    Args:
        example: Mapping with an ``instruction`` entry and optionally an
            ``input`` entry; a missing or empty-string ``input`` selects the
            no-input template.

    Returns:
        ``{'input': <formatted prompt>}``.
    """
    has_input = example.get("input", "") != ""
    template = ALPACA_PROMPT_DICT["prompt_input" if has_input else "prompt_no_input"]
    return {'input': template.format(**example)}

def local_dataset(dataset_name):
    """Load a local data file and split it 90/10 into train and test.

    Args:
        dataset_name: Path to the file. The extension selects the loader:
            ``.json`` / ``.jsonl`` are read as JSON, ``.csv`` as
            comma-separated and ``.tsv`` as tab-separated text.

    Returns:
        The result of ``train_test_split(test_size=0.1)`` on the loaded data.

    Raises:
        ValueError: If the extension is none of the supported ones.
    """
    if dataset_name.endswith(('.json', '.jsonl')):
        records = Dataset.from_json(path_or_paths=dataset_name)
    else:
        # Tabular formats go through pandas first.
        if dataset_name.endswith('.csv'):
            frame = pd.read_csv(dataset_name)
        elif dataset_name.endswith('.tsv'):
            frame = pd.read_csv(dataset_name, delimiter='\t')
        else:
            raise ValueError(f"Unsupported dataset format: {dataset_name}")
        records = Dataset.from_pandas(frame)

    return records.train_test_split(test_size=0.1)

def make_data_module(tokenizer: transformers.PreTrainedTokenizer, args) -> Dict:
    """
    Make dataset and collator for supervised fine-tuning.
    Datasets are expected to have the following columns: { `input`, `output` }

    Available datasets to be selected with `dataset` argument:
        - alpaca, 52002 examples
        - alpaca cleaned, 51942 examples
        - chip2 (OIG), 210289 examples
        - self-instruct, 82612 examples
        - hh-rlhf (Anthropic), 160800 examples
        - longform, 23.7k examples
        - oasst1 (OpenAssistant) primary message tree only, 9,846 examples

    Coming soon:
        - unnatural instructions core, 66010 examples
        - unnatural instructions full, 240670 examples
        - alpaca-gpt4, 52002 examples
        - unnatural-instructions-gpt4, 9000 examples
        - supernatural-instructions, 69624 examples (same as paper with 100 ex/task more can be used)
        - vicuna

    Any other `dataset` value is treated as a path to a local file and loaded
    with `local_dataset`.

    Args:
        tokenizer: Tokenizer handed to the `DataCollatorForCausalLM`.
        args: Namespace-like object. The attributes read here are `dataset`,
            `data_path`, `dataset_format`, `do_train`, `do_eval`, `do_predict`,
            `eval_dataset_size`, `max_eval_samples`, `max_train_samples`,
            `group_by_length`, `source_max_len`, `target_max_len`,
            `train_on_source`, `predict_with_generate` and `hard_padding`.
            `args.dataset_format` is set to "input-output" when it is empty and
            a local file is loaded.

    Returns:
        Dict with the keys `train_dataset`, `eval_dataset`, `predict_dataset`
        (each `None` when the matching `do_*` flag is off) and `data_collator`.

    Raises:
        NotImplementedError: For 'vicuna', or for a name that is neither a
            known dataset nor an existing path.
        ValueError: If loading a local file fails; the underlying error is
            attached as the cause.
    """
    def load_local(dataset_name):
        # Fallback shared by both branches of `load_data`: interpret the name
        # as a path to a local data file.
        if not os.path.exists(dataset_name):
            raise NotImplementedError(f"Dataset {dataset_name} not implemented yet.")
        try:
            args.dataset_format = args.dataset_format if args.dataset_format else "input-output"
            return local_dataset(dataset_name)
        except Exception as err:
            # `Exception` rather than a bare except, so Ctrl-C / SystemExit
            # are not turned into a ValueError; the root cause stays chained.
            raise ValueError(f"Error loading dataset from {dataset_name}") from err

    def load_data(dataset_name, data_path=None):
        if data_path is None:
            # No local mirror given: fetch by hub identifier.
            if dataset_name == 'alpaca':
                return load_dataset("tatsu-lab/alpaca")
            elif dataset_name == 'alpaca-dummy':
                return load_dataset("Lohse/alpaca-dummy")
            elif dataset_name == 'alpaca-clean':
                return load_dataset("yahma/alpaca-cleaned")
            elif dataset_name == 'flanv2':
                return load_dataset("conceptofmind/FLAN_2022")
            elif dataset_name == 'chip2':
                return load_dataset("laion/OIG", data_files='unified_chip2.jsonl')
            elif dataset_name == 'self-instruct':
                return load_dataset("yizhongw/self_instruct", name='self_instruct')
            elif dataset_name == 'hh-rlhf':
                return load_dataset("Anthropic/hh-rlhf")
            elif dataset_name == 'longform':
                return load_dataset("akoksal/LongForm")
            elif dataset_name == 'oasst1':
                return load_dataset("timdettmers/openassistant-guanaco")
            elif dataset_name == 'vicuna':
                raise NotImplementedError("Vicuna data was not released.")
            else:
                return load_local(dataset_name)
        else:
            # Local mirror: JSON files are expected under <data_path>/<dataset_name>.
            if dataset_name in ['alpaca','alpaca-dummy','alpaca-clean','flanv2','hh-rlhf','longform','oasst1']:
                return load_dataset('json', data_dir=os.path.join(data_path,dataset_name))
            elif dataset_name == 'chip2':
                return load_dataset('json', data_dir=os.path.join(data_path,dataset_name), data_files='unified_chip2.jsonl')
            elif dataset_name == 'self-instruct':
                return load_dataset('json', data_dir=os.path.join(data_path,dataset_name), name='self_instruct')
            elif dataset_name == 'super-natural':
                return load_dataset('json', data_dir=os.path.join(data_path,dataset_name))
            elif dataset_name == 'vicuna':
                raise NotImplementedError("Vicuna data was not released.")
            else:
                return load_local(dataset_name)

    def format_dataset(dataset, dataset_format):
        # Bring every supported dataset to the common {'input', 'output'}
        # schema. An explicit `dataset_format` wins; when it is None the
        # dataset name decides.
        if (
            dataset_format == 'alpaca' or dataset_format == 'alpaca-clean' or dataset_format == 'alpaca-dummy' or
            (dataset_format is None and args.dataset in ['alpaca', 'alpaca-clean', 'alpaca-dummy'])
        ):
            dataset = dataset.map(extract_alpaca_dataset)
        elif dataset_format == 'chip2' or (dataset_format is None and args.dataset == 'chip2'):
            dataset = dataset.map(lambda x: {
                'input': x['text'].split('\n<bot>: ')[0].replace('<human>: ', ''),
                'output': x['text'].split('\n<bot>: ')[1],
            })
        elif dataset_format == 'self-instruct' or (dataset_format is None and args.dataset == 'self-instruct'):
            for old, new in [["prompt", "input"], ["completion", "output"]]:
                dataset = dataset.rename_column(old, new)
        elif dataset_format == 'hh-rlhf' or (dataset_format is None and args.dataset == 'hh-rlhf'):
            dataset = dataset.map(lambda x: {
                'input': '',
                'output': x['chosen']
            })
        elif dataset_format == 'oasst1' or (dataset_format is None and args.dataset == 'oasst1'):
            dataset = dataset.map(lambda x: {
                'input': '',
                'output': x['text'],
            })
        elif dataset_format == 'flanv2' or (dataset_format is None and args.dataset == 'flanv2'):
            dataset = dataset.map(lambda x: {'input': x['inputs'],'output': x['targets'],})
        elif dataset_format =='super-natural' or (dataset_format is None and args.dataset == 'super-natural'):
            dataset = dataset.map(remove_columns=['id'])
            # dataset = extract_super_natural_instructions_data(dataset)
            # dataset = Dataset.from_dict(dataset)
        elif dataset_format == 'input-output':
            # leave as is
            pass
        # Remove unused columns (the column list is taken from the 'train' split).
        dataset = dataset.remove_columns(
            [col for col in dataset.column_names['train'] if col not in ['input', 'output']]
        )
        return dataset

    dataset = load_data(args.dataset, args.data_path)
    dataset = format_dataset(dataset, args.dataset_format)

    if args.do_eval or args.do_predict:
        if 'eval' in dataset:
            eval_dataset = dataset['eval']
        else:
            print_rank_0('Splitting train dataset in train and validation according to `eval_dataset_size`')
            # Fixed seed so the held-out split is the same on every run.
            dataset = dataset["train"].train_test_split(
                test_size=args.eval_dataset_size, shuffle=True, seed=42
            )
            eval_dataset = dataset['test']
        if args.max_eval_samples is not None and len(eval_dataset) > args.max_eval_samples:
            eval_dataset = eval_dataset.select(range(args.max_eval_samples))
        if args.group_by_length:
            # Character count of input + output, stored as a 'length' column.
            eval_dataset = eval_dataset.map(lambda x: {'length': len(x['input']) + len(x['output'])})
    if args.do_train:
        train_dataset = dataset['train']
        if args.max_train_samples is not None and len(train_dataset) > args.max_train_samples:
            train_dataset = train_dataset.select(range(args.max_train_samples))
        if args.group_by_length:
            train_dataset = train_dataset.map(lambda x: {'length': len(x['input']) + len(x['output'])})

    data_collator = DataCollatorForCausalLM(
        tokenizer=tokenizer,
        source_max_len=args.source_max_len,
        target_max_len=args.target_max_len,
        train_on_source=args.train_on_source,
        predict_with_generate=args.predict_with_generate,
        hard_padding=args.hard_padding,
    )
    return dict(
        train_dataset=train_dataset if args.do_train else None,
        eval_dataset=eval_dataset if args.do_eval else None,
        predict_dataset=eval_dataset if args.do_predict else None,
        data_collator=data_collator
    )

In [3]:
# Base directory under which the local dataset copies live.
main_path="/mnt/sdb/zhanglongteng/llm-toolkit"

# Path of a local Llama-2-7B checkpoint (not used further in this cell).
llama2="/mnt/sdb/zhanglongteng/data2/share/zhanglongteng_A6000/Llama-2-7b-hf"

# Load the Alpaca data from the JSON files under <main_path>/datasets/alpaca.
alpaca = load_dataset('json', data_dir=os.path.join(main_path,"datasets/alpaca"))

In [6]:
# Alpaca prompt templates, selected by whether a record has a non-empty "input".
ALPACA_PROMPT_DICT = dict(
    prompt_input=(
        "Below is an instruction that describes a task, "
        "paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Input:\n{input}\n\n"
        "### Response: "
    ),
    prompt_no_input=(
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Response: "
    ),
)


def extract_alpaca_dataset(example):
    """Format one Alpaca record as a prompt.

    Args:
        example: Mapping holding ``instruction`` and, optionally, ``input``.
            An absent or empty-string ``input`` picks the no-input template.

    Returns:
        A one-key dict ``{'input': prompt}`` with the filled-in template.
    """
    if example.get("input", "") == "":
        template_key = "prompt_no_input"
    else:
        template_key = "prompt_input"
    return {'input': ALPACA_PROMPT_DICT[template_key].format(**example)}



Map: 100%|██████████| 52002/52002 [00:03<00:00, 15549.40 examples/s]


In [29]:
# Apply the prompt formatting twice for comparison: `dataset_rm` additionally
# drops the raw 'instruction' column, `dataset` keeps it next to the rewritten
# 'input' column.
dataset_rm = alpaca.map(extract_alpaca_dataset, remove_columns=['instruction'])
dataset = alpaca.map(extract_alpaca_dataset)

Map: 100%|██████████| 52002/52002 [00:03<00:00, 14769.20 examples/s]


In [33]:
# Show the splits and columns left after 'instruction' was removed.
dataset_rm

DatasetDict({
    train: Dataset({
        features: ['output', 'input'],
        num_rows: 52002
    })
})

In [27]:
# Peek at raw records 4 and 5: the first has an empty 'input', the second a
# non-empty one, so together they cover both prompt templates.
print(alpaca["train"]["instruction"][4])
print(alpaca["train"]["input"][4])
print(alpaca["train"]["instruction"][5])
print(alpaca["train"]["input"][5])
# alpaca["train"]["output"]

Describe a time when you had to make a difficult decision.

Identify the odd one out.
Twitter, Instagram, Telegram


DatasetDict({
    train: Dataset({
        features: ['output', 'instruction', 'input'],
        num_rows: 52002
    })
})

In [30]:
# Same two records after formatting: 'instruction' is unchanged, while 'input'
# now holds the fully rendered prompt.
print(dataset["train"]["instruction"][4])
print(dataset["train"]["input"][4])
print(dataset["train"]["instruction"][5])
print(dataset["train"]["input"][5])

Describe a time when you had to make a difficult decision.
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Describe a time when you had to make a difficult decision.

### Response: 
Identify the odd one out.
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Identify the odd one out.

### Input:
Twitter, Instagram, Telegram

### Response: 


In [34]:

# Rendered prompts of records 4 and 5 in the variant without the 'instruction' column.
print(dataset_rm["train"]["input"][4])

print(dataset_rm["train"]["input"][5])

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Describe a time when you had to make a difficult decision.

### Response: 
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Identify the odd one out.

### Input:
Twitter, Instagram, Telegram

### Response: 


In [2]:
# One lock shared by every call. A lock created inside the function would be
# private to that call and therefore could never block another writer.
_DICT2FILE_LOCK = threading.Lock()


def safe_dict2file(dictionary:Dict, filename):
    """Append ``dictionary`` to ``filename`` as indented JSON plus a newline.

    Calls from different threads of this process are serialized through a
    module-level lock, so their records are not interleaved in the file.

    Args:
        dictionary: JSON-serializable mapping to record.
        filename: Path of the file to append to; created if it does not exist.

    Raises:
        TypeError: If ``dictionary`` is not JSON-serializable. The file is
            left untouched in that case.
        OSError: If the file cannot be opened or written.
    """
    # Serialize before touching the file (and before taking the lock): a
    # failure then cannot leave a truncated record behind, and the lock is
    # held only for the actual write.
    payload = json.dumps(dictionary, indent=4) + "\n"
    with _DICT2FILE_LOCK:
        with open(filename, 'a', encoding="utf-8") as json_file:
            json_file.write(payload)

In [3]:
# Inspect the raw encoding of a single letter: the result has two ids, a
# prepended special token at index 0 and the letter's own token at index 1.
tokenizer("A")

{'input_ids': [0, 319], 'attention_mask': [1, 1]}

In [4]:
import lm_eval
# Path of the local Llama-2-7B checkpoint; the evaluator logs below show it
# being passed as both `pretrained` and `tokenizer`.
llama2 = "/mnt/sdb/zhanglongteng/data2/share/zhanglongteng_A6000/Llama-2-7b-hf"

device:str ='cuda'  # 'cuda' or 'cpu'
# Task manager object from lm_eval (not used further in the visible lines).
task_manager = lm_eval.tasks.TaskManager()

2024-04-08:16:12:54,635 INFO     [evaluator.py:131] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234
2024-04-08:16:12:54,636 INFO     [evaluator.py:177] Initializing hf model, with arguments: {'pretrained': '/mnt/sdb/zhanglongteng/data2/share/zhanglongteng_A6000/Llama-2-7b-hf', 'tokenizer': '/mnt/sdb/zhanglongteng/data2/share/zhanglongteng_A6000/Llama-2-7b-hf'}
2024-04-08:16:12:54,638 INFO     [huggingface.py:163] Using device 'cuda'
Loading checkpoint shards: 100%|██████████| 2/2 [00:45<00:00, 22.95s/it]
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
2024-04-08:16:18:25,304 INFO     [task.py:395] Building contexts for mmlu_world_religions on rank 0...
100%|██████████| 171/171 [00:02<00:00, 63.42it/s]
2024-04-08:16:18:28,016 INFO     [task.py:395] Building contexts for mmlu_moral_disputes on r

2024-04-10:12:23:54,377 INFO     [evaluator.py:131] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234
2024-04-10:12:23:54,379 INFO     [evaluator.py:177] Initializing hf model, with arguments: {'pretrained': '/mnt/sdb/zhanglongteng/data2/share/zhanglongteng_A6000/Llama-2-7b-hf', 'tokenizer': '/mnt/sdb/zhanglongteng/data2/share/zhanglongteng_A6000/Llama-2-7b-hf'}
2024-04-10:12:23:54,386 INFO     [huggingface.py:163] Using device 'cuda'
Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.30s/it]
2024-04-10:12:24:12,098 INFO     [task.py:395] Building contexts for gsm8k on rank 0...
100%|██████████| 1319/1319 [00:08<00:00, 156.46it/s]
2024-04-10:12:24:20,561 INFO     [evaluator.py:379] Running generate_until requests
Running generate_until requests:   1%|▏         | 17/1319 [02:05<2:11:03,  6.04s/it]