In [1]:
import logging
import re
from typing import List, Union

import numpy as np
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from tqdm import tqdm

In [7]:
def _read_data(
    source_data, target_sequence_length, is_return_list
) -> Union[List[List], List[str]]:
    def read_line(text_line):
        return text_line.strip().split("\t")

    token_docs = []
    tag_docs = []
    line_index = 0

    token_doc = []
    tag_doc = []
    if isinstance(source_data, List):
        pbar = tqdm(source_data)
    else:
        with open(source_data, "r") as data_file:
            pbar = tqdm(data_file.readlines())
    for index, line in enumerate(pbar):
        if line == "\n":
        #     token_docs.append(token_doc)
        #     tag_docs.append(tag_doc)
        #     pbar.update(len(token_doc))
        #     token_doc = []
        #     tag_doc = []
            continue
        processed_line = read_line(line)
        try:
            assert len(processed_line) == 2, "bad line"
            token, tag = processed_line
            token_doc.append(token)
            tag_doc.append(processed_line[1])
        except AssertionError:
            print(f"ignore the bad line: {line}, index: {index}")
            continue
        line_index += 1
        if len(token_doc) >= target_sequence_length:
            try:
                _verify_senquence(token_doc, target_sequence_length)
                _verify_senquence(tag_doc, target_sequence_length)
                if is_return_list:
                    token_docs.append(token_doc)
                else:
                    token_docs.append("".join(token_doc))
                tag_docs.append(tag_doc)
                token_doc = []
                tag_doc = []
            except AssertionError:
                print(f"error generating sequence: {token_doc}")
                token_doc = []
                tag_doc = []
                continue
            pbar.update(len(token_doc))
    try:
        assert (len(token_doc)==len(tag_doc)), "Not equal length"
        if is_return_list:
            token_docs.append(token_doc)
        else:
            token_docs.append("".join(token_doc))
        tag_docs.append(tag_doc)
        pbar.update(len(token_doc))
    except AssertionError:
        print(f"error generating sequence: {token_doc}")

    pbar.close()

    return token_docs, tag_docs


def _verify_senquence(sequence, target_sequence_length):
    assert (
        target_sequence_length <= len(sequence)
    ), "wrong sequence length"


def process_data(
    source_data, target_sequence_length, is_return_list=True
):
    """
    Function for generation of tokenized corpus and relevant tags

    Args:
        source_data(str or List): path of input data or input data
        target_sequence_length(int): target sequence length of one sample
    """
    print("load data")
    texts, tags = _read_data(
        source_data,
        target_sequence_length,
        is_return_list=is_return_list,
    )
    return texts, tags

In [8]:
with open("/root/autodl-tmp/datasets/mml-zh/token_tag_files/train_token_tag_data.txt", "r") as fb:
    train_source_data = fb.readlines()

train_texts, train_tags = process_data(train_source_data, 256)

load data


100%|██████████| 1220989/1220989 [00:01<00:00, 1105926.89it/s]


In [9]:
len(train_texts)

4394

## Prediction with llama2

In [10]:
from llm_client.pipeline import Pipeline
from typing import List
import asyncio
import time

config_yaml = "/root/llm_client/config_yamls/llama2-hf.yaml"

pipeline = Pipeline(config_yaml, verbose=1)

  from .autonotebook import tqdm as notebook_tqdm


TypeError: __init__() got an unexpected keyword argument 'verbose'