In [1]:
import os
import json
import time
from typing import Dict

import numpy as np
from tqdm import tqdm
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
DEFAULT_PAD_TOKEN = "<pad>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_UNK_TOKEN = "<unk>"

In [None]:
def smart_tokenizer_and_embedding_resize(
        special_tokens_dict: Dict,
        tokenizer: transformers.PreTrainedTokenizer,
        model: transformers.PreTrainedModel,
):
    """Resize tokenizer and embedding.
    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) 
    model.resize_token_embeddings(len(tokenizer))

    if num_new_tokens > 0:
        input_embeddings = model.get_input_embeddings().weight.data
        output_embeddings = model.get_output_embeddings().weight.data
        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)

        input_embeddings[-num_new_tokens:] = input_embeddings_avg
        output_embeddings[-num_new_tokens:] = output_embeddings_avg

In [2]:
def load_model_and_tokenizer(path='/mntcephfs/data/med/zhihong/workspace/LLMZoo/llama_hf_7b'):
    model = AutoModelForCausalLM.from_pretrained(path)

    tokenizer = AutoTokenizer.from_pretrained(
        path, 
        model_max_length=2048, 
        padding_side="right", 
        use_fast=True
    )
    if tokenizer.pad_token is None:
        smart_tokenizer_and_embedding_resize(
            special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
            tokenizer=tokenizer,
            model=model,
        )
    tokenizer.add_special_tokens({
        "eos_token": DEFAULT_EOS_TOKEN,
        "bos_token": DEFAULT_BOS_TOKEN,
        "unk_token": DEFAULT_UNK_TOKEN,
    })
    return tokenizer

tokenizer = load_model_and_tokenizer()

Loading checkpoint shards: 100%|██████████| 2/2 [00:32<00:00, 16.42s/it]
Using pad_token, but it is not set yet.


In [5]:
def read_json_file(file_path):
    with open(file_path, 'r', encoding="utf-8") as file:
        json_data = json.load(file)
    return json_data

def extract_utters(path):
    all_utter = []
    session_list = read_json_file(path)
    for session_dict in session_list:
        for utter_dict in session_dict['conversations']:
            all_utter.append(utter_dict['value'])
    return all_utter

def compute_avg_utters(path):
    '''所有utter的token的length / 所有utter的数量'''
    utter_len = []
    all_utter = extract_utters(path)
    for utter in tqdm(all_utter):
        idx_tensor = tokenizer(utter, return_tensors="pt", padding="longest")['input_ids'][0]
        utter_len.append(len(idx_tensor)-1)
    return np.mean(utter_len)

def extract_conv(path):
    all_conv = []
    session_list = read_json_file(path)
    for session_dict in session_list:
        one_conv = ''
        for utter_dict in session_dict['conversations']:
            one_conv += (utter_dict['value'])
        all_conv.append(one_conv) 
    return all_conv

def compute_avg_convs(path):
    '''1session内所有utter的token的length / 多少个session'''
    all_conv = extract_conv(path)
    one_conv_len = []
    for one_conv_str in tqdm(all_conv):
        idx_tensor = tokenizer(one_conv_str, return_tensors="pt", padding="longest")['input_ids'][0]
        one_conv_len.append(len(idx_tensor)-1)
    # assert len(one_conv_len) == 10000
    return np.mean(one_conv_len)

In [6]:
directory = r"/mntcephfs/lab_data/kongchuyi/s2/fastchat/data/hei"

files_name = []
for root, dirs, files in os.walk(directory):
    for file in files:
        files_name.append(os.path.join(root, file))

for file in tqdm(files_name):
    start = time.time()
    
    avg_utter_len = compute_avg_utters(file)
    avg_conv_len = compute_avg_convs(file)
    
    end = time.time()
    
    print(f'\nfor{file}:')
    print('avg_utter_len_by_token: ', avg_utter_len)
    print('avg_conv_len_by_token: ', avg_conv_len)
    print(f'Elapsed {end-start} seconds.\n')
    print()

100%|██████████| 75746/75746 [01:27<00:00, 861.66it/s]
100%|██████████| 10000/10000 [01:59<00:00, 83.91it/s]
 25%|██▌       | 1/4 [03:30<10:31, 210.43s/it]


for/mntcephfs/lab_data/kongchuyi/s2/fastchat/data/hei/ww_10k.json:
avg_utter_len_by_token:  278.812135294273
avg_conv_len_by_token:  2112.2579
Elapsed 210.4267475605011 seconds.




100%|██████████| 234356/234356 [04:15<00:00, 915.88it/s]
100%|██████████| 20039/20039 [13:53<00:00, 24.04it/s]
 50%|█████     | 2/4 [21:48<24:24, 732.30s/it]


for/mntcephfs/lab_data/kongchuyi/s2/fastchat/data/hei/sg_20039_not_splitted_for_eval.json:
avg_utter_len_by_token:  207.1990902729181
avg_conv_len_by_token:  2424.737062727681
Elapsed 1097.6068377494812 seconds.




100%|██████████| 215378/215378 [03:44<00:00, 959.33it/s]
100%|██████████| 20039/20039 [03:58<00:00, 84.18it/s]
 75%|███████▌  | 3/4 [29:38<10:12, 612.61s/it]


for/mntcephfs/lab_data/kongchuyi/s2/fastchat/data/hei/sd_20039_for_eval.json:
avg_utter_len_by_token:  203.13285479482585
avg_conv_len_by_token:  2183.662907330705
Elapsed 470.1716182231903 seconds.




100%|██████████| 166078/166078 [01:45<00:00, 1567.66it/s]
100%|██████████| 10000/10000 [01:37<00:00, 102.87it/s]
100%|██████████| 4/4 [33:05<00:00, 496.30s/it]


for/mntcephfs/lab_data/kongchuyi/s2/fastchat/data/hei/dw_10k.json:
avg_utter_len_by_token:  120.52562651284336
avg_conv_len_by_token:  2003.1139
Elapsed 206.98335146903992 seconds.







In [4]:
directory = r"/mntcephfs/lab_data/kongchuyi/s2/fastchat/data"  
files_name = []

for root, dirs, files in os.walk(directory):
    for file in files:
        if "10k" in file:
            files_name.append(os.path.join(root, file))
            
for file in tqdm(files_name):
    start = time.time()
    
    avg_utter_len = compute_avg_utters(file)
    avg_conv_len = compute_avg_convs(file)
    
    end = time.time()
    
    print(f'\nfor{file}:')
    print('avg_utter_len_by_token: ', avg_utter_len)
    print('avg_conv_len_by_token: ', avg_conv_len)
    print(f'Elapsed {end-start} seconds.\n')
    print()

  0%|          | 0/6 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2169 > 2048). Running this sequence through the model will result in indexing errors
100%|██████████| 61354/61354 [01:32<00:00, 666.78it/s]
100%|██████████| 10000/10000 [01:39<00:00, 100.13it/s]
 17%|█▋        | 1/6 [03:15<16:16, 195.35s/it]

for/mntcephfs/lab_data/kongchuyi/s2/fastchat/data/vicuna_10k_for_train.json:
avg_utter_len_by_token:  185.15452945203248
avg_conv_len_by_token:  1136.7103
Elapsed 195.3479278087616 seconds.


100%|██████████| 112024/112024 [02:48<00:00, 665.64it/s]
100%|██████████| 10000/10000 [02:53<00:00, 57.61it/s]
 33%|███▎      | 2/6 [09:03<19:00, 285.24s/it]

for/mntcephfs/lab_data/kongchuyi/s2/fastchat/data/cwb_raw_10k_duplicate_4096.json:
avg_utter_len_by_token:  177.83426765693065
avg_conv_len_by_token:  1992.7417
Elapsed 348.16182708740234 seconds.


100%|██████████| 75790/75790 [00:38<00:00, 1987.47it/s]
100%|██████████| 10000/10000 [00:20<00:00, 486.94it/s]
 50%|█████     | 3/6 [10:03<09:07, 182.51s/it]

for/mntcephfs/lab_data/kongchuyi/s2/fastchat/data/baize_10k_for_train.json:
avg_utter_len_by_token:  34.56260720411664
avg_conv_len_by_token:  263.122
Elapsed 60.259963274002075 seconds.


100%|██████████| 87654/87654 [02:52<00:00, 508.83it/s]
100%|██████████| 10000/10000 [03:20<00:00, 49.77it/s]
 67%|██████▋   | 4/6 [16:24<08:41, 260.61s/it]

for/mntcephfs/lab_data/kongchuyi/s2/fastchat/data/s_10k.json:
avg_utter_len_by_token:  252.8585575102106
avg_conv_len_by_token:  2216.8345
Elapsed 380.32848501205444 seconds.


100%|██████████| 107754/107754 [02:55<00:00, 612.86it/s]
100%|██████████| 10000/10000 [03:25<00:00, 48.74it/s]
 83%|████████▎ | 5/6 [22:51<05:06, 306.32s/it]

for/mntcephfs/lab_data/kongchuyi/s2/fastchat/data/s_raw_10k_duplicate_3146.json:
avg_utter_len_by_token:  202.54971509178313
avg_conv_len_by_token:  2182.9382
Elapsed 387.37149453163147 seconds.


100%|██████████| 76958/76958 [02:04<00:00, 620.61it/s]
100%|██████████| 10000/10000 [02:11<00:00, 75.87it/s]
100%|██████████| 6/6 [27:10<00:00, 271.82s/it]

for/mntcephfs/lab_data/kongchuyi/s2/fastchat/data/ultra_10k_for_train.json:
avg_utter_len_by_token:  187.24171626081758
avg_conv_len_by_token:  1441.9932
Elapsed 259.42885088920593 seconds.



