In [1]:
import logging
import re
from typing import List, Union

import numpy as np
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from tqdm import tqdm

In [2]:
def _read_data(
    source_data, target_sequence_length, is_return_list
) -> Union[List[List], List[str]]:
    def read_line(text_line):
        return text_line.strip().split("\t")

    token_docs = []
    tag_docs = []
    line_index = 0

    token_doc = []
    tag_doc = []
    if isinstance(source_data, List):
        pbar = tqdm(source_data)
    else:
        with open(source_data, "r") as data_file:
            pbar = tqdm(data_file.readlines())
    for index, line in enumerate(pbar):
        if line == "\n":
        #     token_docs.append(token_doc)
        #     tag_docs.append(tag_doc)
        #     pbar.update(len(token_doc))
        #     token_doc = []
        #     tag_doc = []
            continue
        processed_line = read_line(line)
        try:
            assert len(processed_line) == 2, "bad line"
            token, tag = processed_line
            token_doc.append(token)
            tag_doc.append(processed_line[1])
        except AssertionError:
            print(f"ignore the bad line: {line}, index: {index}")
            continue
        line_index += 1
        if len(token_doc) >= target_sequence_length:
            try:
                _verify_senquence(token_doc, target_sequence_length)
                _verify_senquence(tag_doc, target_sequence_length)
                if is_return_list:
                    token_docs.append(token_doc)
                else:
                    token_docs.append("".join(token_doc))
                tag_docs.append(tag_doc)
                token_doc = []
                tag_doc = []
            except AssertionError:
                print(f"error generating sequence: {token_doc}")
                token_doc = []
                tag_doc = []
                continue
            pbar.update(len(token_doc))
    try:
        assert (len(token_doc)==len(tag_doc)), "Not equal length"
        if is_return_list:
            token_docs.append(token_doc)
        else:
            token_docs.append("".join(token_doc))
        tag_docs.append(tag_doc)
        pbar.update(len(token_doc))
    except AssertionError:
        print(f"error generating sequence: {token_doc}")

    pbar.close()

    return token_docs, tag_docs


def _verify_senquence(sequence, target_sequence_length):
    assert (
        target_sequence_length <= len(sequence)
    ), "wrong sequence length"


def process_data(
    source_data, target_sequence_length, is_return_list=True
):
    """
    Build the tokenized corpus together with its aligned tag sequences.

    Args:
        source_data(str or List): path of the input data, or the input lines themselves
        target_sequence_length(int): target sequence length of one sample
        is_return_list(bool): forwarded to ``_read_data``; when true each sample
            is a list of tokens, otherwise the tokens are joined into a string

    Returns:
        the ``(texts, tags)`` pair produced by ``_read_data``
    """
    print("load data")
    return _read_data(source_data, target_sequence_length, is_return_list)

In [3]:
# Tab-separated token/tag training corpus.
TRAIN_DATA_PATH = "/root/autodl-tmp/datasets/mml-zh/token_tag_files/train_token_tag_data.txt"
# Number of tokens grouped into one sample.
TARGET_SEQUENCE_LENGTH = 256

# Decode explicitly as UTF-8: the corpus contains Chinese text and the
# platform default encoding is not guaranteed to be UTF-8.
with open(TRAIN_DATA_PATH, "r", encoding="utf-8") as fb:
    train_source_data = fb.readlines()

train_texts, train_tags = process_data(train_source_data, TARGET_SEQUENCE_LENGTH)

load data


100%|██████████| 1220989/1220989 [00:01<00:00, 1054794.90it/s]


In [4]:
# Number of samples built from the training corpus.
len(train_texts)

4394

In [6]:
# Inspect the first sample: a list of tokens, since process_data was called
# with its default is_return_list=True.
print(train_texts[0])

['okay', '好', '第', '三', '环', '节', 'er', '角', '色', '扮', '演', 'okay', 'hello', '你', '好', 'oh', '我', '刚', '来', '到', '新', '加', '坡', '我', '是', '来', '自', 'malaysia', '的', '交', '换', '生', '啊', '所', '以', '我', '对', '新', '加', '坡', '就', '不', '是', '很', '熟', '悉', '啦', 'so', '我', '们', '目', '前', '是', '在', 'er', 'n', 't', 'u', 'so', '我', '想', '问', '一', '下', '要', '怎', '么', '如', '何', '从', 'n', 't', 'u', '去', 'er', '榜', '鹅', '呢', '对', '因', '为', '榜', '鹅', '有', '那', '个', 'uh', '海', '鲜', '嘛', '对', '对', '对', 'oh', 'okay', '那', '可', '以', '跟', '我', '讲', '是', '在', '啊', '对', '对', '对', '几', '时', '回', '来', '啊', '一', '个', '星', '期', '后', '所', '以', '在', '新', '加', '坡', '待', '一', '段', '时', '间', '啦', 'oh', '没', '有', '啦', '因', '为', '一', '个', '星', '期', '嘛', '就', '要', '问', '你', '很', '多', '问', '题', 'uh', '是', '住', '在', 'hotel', 'uh', '也', '是', '在', '市', '中', '心', '啊', 'bugis', 'but', '你', '可', '以', '跟', '我', '讲', '一', '下', '那', '个', 'er', '海', '鲜', '吃', '海', '鲜', '地', '方', '在', '哪', '里', '吗', 'okay', '哦', '可', '以', '跟', '我',

## Prediction with Llama 2

In [5]:
from llm_client.pipeline import Pipeline
from typing import List
import asyncio
import time

# YAML configuration handed to the llm_client Pipeline (the file name suggests
# a llama2 Hugging Face setup — TODO confirm its contents).
config_yaml = "/root/llm_client/config_yamls/llama2-hf.yaml"

# NOTE(review): verbose=1 presumably enables the INFO log line printed on
# construction — confirm against llm_client.
pipeline = Pipeline(config_yaml, verbose=1)

  from .autonotebook import tqdm as notebook_tqdm
2023-10-22 22:38:21,125 - [32mINFO[0m - pipeline.py:20 - pipeline.__init__ - 10707 - parameters for every request: {'do_sample': False, 'max_new_tokens': 256, 'repetition_penalty': None, 'return_full_text': False, 'seed': None, 'temperature': None, 'top_k': None, 'top_p': None, 'truncate': None, 'typical_p': None, 'best_of': None, 'watermark': False, 'decoder_input_details': False, 'stop_sequences': ['</s>', '[/INST]', '[/SYS>>', 'Question']}


In [9]:
# Build one punctuation-restoration prompt per training sample.
# NOTE(review): tokens are joined with no separator, so neighbouring English
# tokens fuse together (e.g. "okay" + "hello" -> "okayhello") — confirm that
# this is the intended model input.
joined_texts = ("".join(token_list) for token_list in train_texts)
input_list = [
    f"Restore punctuations to the following sentence: '{joined_text}'"
    for joined_text in joined_texts
]

In [10]:
# Sanity-check the first prompt before sending anything to the model.
print(input_list[0])

Restore punctuations to the following sentence: 'okay好第三环节er角色扮演okayhello你好oh我刚来到新加坡我是来自malaysia的交换生啊所以我对新加坡就不是很熟悉啦so我们目前是在erntuso我想问一下要怎么如何从ntu去er榜鹅呢对因为榜鹅有那个uh海鲜嘛对对对ohokay那可以跟我讲是在啊对对对几时回来啊一个星期后所以在新加坡待一段时间啦oh没有啦因为一个星期嘛就要问你很多问题uh是住在hoteluh也是在市中心啊bugisbut你可以跟我讲一下那个er海鲜吃海鲜地方在哪里吗okay哦可以跟我讲一下那边也有怎么样的美食呢uh你就由你来介绍吧okayohjumbo啊嗯okay那除了还有不是它还有好几间分店吗哦okayokay嗯嗯嗯okay那我想问一下那还有其他的分'


In [11]:
async def main(input_list: List, pipeline: Pipeline):
    """Run ``pipeline.model_predict`` on every prompt concurrently.

    Args:
        input_list: prompts to send; one prediction call is made per element
        pipeline: object whose ``model_predict(prompt)`` call is awaited
            (assumed to return an awaitable — TODO confirm against llm_client)

    Returns:
        list with one result per prompt, in the same order as ``input_list``
    """
    return await asyncio.gather(
        *(pipeline.model_predict(prompt) for prompt in input_list)
    )

In [12]:
train_result_list = []
chunk_size = 50

question_list = input_list.copy()
pbar = tqdm(total = len(question_list))
while len(question_list) > chunk_size:
    current_chunk = question_list[:chunk_size]
    question_list = question_list[chunk_size:]
    
    train_result_list.extend(await main(current_chunk, pipeline))
    
    pbar.update(chunk)
    time.sleep(2)

train_result_list.extend(await main(question_list, pipeline))
pbar.update(len(question_list))

CancelledError: 

In [None]:
# Persist the predictions, one per line, in the order they were produced.
# The encoding is explicit because the predictions contain Chinese text and
# the platform default encoding is not guaranteed to be UTF-8.
# NOTE(review): a prediction that itself contains a newline will span several
# lines and break the one-result-per-line layout — confirm how downstream
# code reads this file.
with open(
    "/root/autodl-tmp/datasets/mml-zh/llm_results/llama2_13b/train_results.txt",
    "w",
    encoding="utf-8",
) as output_file:
    for result_text in train_result_list:
        output_file.write(f"{result_text}\n")