In [1]:
!pip install transformers
!pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [None]:
#讀取文本轉成分割檔
import re
import os
import csv
import pandas as pd

def split_text(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read()
    # 使用正規表達式進行分割（考慮句點、空白和換行符號）
    #pattern = re.compile(r'(\.\s|\n)(?![^\(]*\))(?!(?<=Dr\.\s))(?![^()]*\))(?!(?<=D\.\s))')
    pattern = re.compile(r'(\.\s|\n)(?![^\(]*\))(?![^()]*\))(?!(?<=D\.\s))(?!(?<=Dr\.\s)[^\s]*\s)')
    sentences = re.split(pattern, content)
    sentences = [item for item in sentences if item is not None]
    output = []

    current_position = 0
    for i in range(0, len(sentences)-1, 2):  # 修改範圍，避免超出列表範圍
        sentence = sentences[i] + sentences[i + 1]  # 合併句點和後面的空白或換行符號
        if sentence.strip() or sentence == '\n':  # 跳過空白句子
            start_position = current_position  # 起始位置改為目前位置
            end_position = start_position + len(sentence)
            output.append((start_position, end_position, sentence))
            current_position = end_position

    return output



def print_output_to_dataframe(output):
    input_file_name = os.path.splitext(os.path.basename(file_name))[0]
    input_data = {'InputFile': [], 'LineNumber': [], 'Paragraph': []}

    for index, entry in enumerate(output, start=1):
        paragraph = entry[2]
        if paragraph != '':
            input_data['InputFile'].append(input_file_name)
            input_data['LineNumber'].append(entry[0])
            input_data['Paragraph'].append(paragraph)
    df = pd.DataFrame(input_data)
    return df




folder_path = "/content/drive/MyDrive/opendid_test"#你的文本路徑
files = os.listdir(folder_path)
sorted_files = sorted(files)

result_df = pd.DataFrame(columns=['InputFile', 'LineNumber', 'Paragraph'])

for file_name in sorted_files:
    file_path = os.path.join(folder_path, file_name)
    result = split_text(file_path)
    result_df = pd.concat([result_df, print_output_to_dataframe(result)], ignore_index=True)



# 匯出 DataFrame 到 'Split.tsv' 檔案
print(result_df)
result_df['Paragraph'] = result_df['Paragraph'].str.replace('\t',' ')
result_df['Paragraph'] = result_df['Paragraph'].str.replace('\n',' ')
result_df.to_csv('Split.tsv', sep='\t', index=False, header=False, quoting=csv.QUOTE_NONE, escapechar='\\', doublequote=False)



In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, get_linear_schedule_with_warmup, AdamW
from tqdm import tqdm
from datasets import load_dataset, Features, Value

In [5]:
# 載入預訓練模型和分詞器
pretrained_model_name="/content/drive/MyDrive/epoch_10"
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 分詞器設定
bos = '<|endoftext|>'
eos = '<|END|>'
pad = '<|pad|>'
sep ='\n\n####\n\n'
special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad, 'sep_token': sep}
tokenizer.add_special_tokens(special_tokens_dict)
PAD_IDX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
IGNORED_PAD_IDX = -100
tokenizer.padding_side = 'left'

NameError: ignored

In [None]:
# 載入資料集
valid_data = load_dataset("csv", data_files="/content/Split.tsv", delimiter='\t',
                          features=Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')
                          }),
                          column_names=['fid', 'idx', 'content', 'label'])
valid_list = list(valid_data['train'])

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# 使用GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50280, 768)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-11): 12 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=768, out_features=2304, bias=True)
          (dense): Linear(in_features=768, out_features=768, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=768, out_features=3072, bias=True)
          (dense_4h_to_h): Linear(in_features=3072, out_features=768, bias=True)
          

In [None]:
tokenizer.padding_side = 'left'
def sample_batch(model, tokenizer, input):
    """Generate text from a trained model."""
    model.eval()
    seeds = [f"{bos} {text['content']} {sep}" for text in input]
    texts = tokenizer(seeds, return_tensors = 'pt', padding=True).to(device)
    outputs = []
    #return
    with torch.cuda.amp.autocast():
        output_tokens = model.generate(**texts, max_new_tokens=400, pad_token_id = PAD_IDX,
                                        eos_token_id=tokenizer.convert_tokens_to_ids(eos))
        preds = tokenizer.batch_decode(output_tokens)
        for idx , pred in enumerate(preds):
            pred = pred[pred.index(sep)+len(sep):].replace(pad, "").replace(eos, "").strip()
            if pred == "PHI: NULL":
                continue
            phis = pred.split('\n')
            lidxs = {}
            for p in phis:
                tid = p.find(':')
                if tid > 0:
                    text = p[tid+1:].strip()
                    nv = text.find('=>')
                    normalizedV = None
                    #
                    if nv>0:
                      normalizedV = text[nv+2:]
                      text = text[:nv]


                    #
                    lidx = 0
                    if text in lidxs:
                        lidx = lidxs[text]
                    lidx = input[idx]['content'].find(text, lidx)
                    eidx = lidx+len(text)
                    lidxs[text] = eidx
                    sidx=int(input[idx]['idx'])
                    if normalizedV is None:
                        outputs.append(f'{input[idx]["fid"]}\t{p[:tid]}\t{lidx+sidx}\t{eidx+sidx}\t{text}')
                    else:
                        outputs.append(f'{input[idx]["fid"]}\t{p[:tid]}\t{lidx+sidx}\t{eidx+sidx}\t{text}\t{normalizedV}')
    return outputs

f = open("answer.txt", "w",encoding="UTF-8")
BATCH_SIZE = 8
for i in tqdm(range(0, len(valid_list), BATCH_SIZE)):
    with torch.no_grad():
        seeds = valid_list[i:i+BATCH_SIZE]
        outputs = sample_batch(model, tokenizer, input=seeds)
        for o in outputs:
            f.write(o)
            f.write('\n')
f.close()

100%|██████████| 9873/9873 [35:55<00:00,  4.58it/s]


In [None]:
file_path = 'answer.txt'
output_path = 'answer0.txt'

with open(file_path, 'r') as input_file, open(output_path, 'w', encoding='utf-8') as output_file:
    for line in input_file:
        # 移除行尾的換行符
        line = line.strip()
        # 分割資料行
        data = line.split('\t')
        # 檢查資料行是否缺少最後一項
        if (len(data) >= 5):
            # 如果有最後一項，則寫入到新的檔案中
            output_file.write(line + '\n')


UnicodeDecodeError: ignored

In [None]:
def process_data(input_file, output_file):
    with open(input_file, 'r') as input_file, open(output_file, 'w', encoding='utf-8') as output_file:
        previous_data = set()

        for line in input_file:
            data_identifier = line.split('\t')[:5]
            data_identifier = '\t'.join(data_identifier)

            # 如果這個資料與前一行相同，則跳過這行
            if data_identifier in previous_data:
              #print(line)
              continue

            # 如果資料不同，則將其輸出到輸出檔案中
            output_file.write(line)

            # 更新已經見過的資料集合
            previous_data.add(data_identifier)



A = open("answer1.txt", "w",encoding="ascii")
process_data('answer0.txt', 'answer1.txt')
A.close()

In [None]:
file_path = 'answer1.txt'
output_path = 'answer2.txt'


with open(file_path, 'r') as input_file, open(output_path, 'w') as output_file:
    for line in input_file:
        # 移除行尾的換行符
        line = line.strip()
        # 分割資料行
        data = line.split('\t')

        # 檢查資料行是否缺少最後一項
        if (len(line) >= 150):
             #如果有重複，跳過
             print(f"Error in line : {line}")

