In [2]:
import json
import random
import openpyxl
from openpyxl.styles import Alignment

In [3]:
def get_raw_data(path):
    """Load and return the parsed contents of a JSON file.

    The file is decoded explicitly as UTF-8 so that non-ASCII text is read
    the same way on every platform instead of depending on the locale's
    default encoding.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(path, encoding='utf-8') as f:
        return json.load(f)

# Load the raw dump; later cells iterate it as a mapping whose values are
# lists of records carrying 'post', 'event' and 'comments' fields.
data = get_raw_data('./rawdata/weibo3.json')

### 文本摘要数据导出

In [None]:
# Flatten every record of `data` into two parallel lists:
# events[k] and posts[k] come from the same record.
events, posts = [], []
for group in data.values():
    for record in group:
        events.append(record['event'])
        posts.append(record['post'])

In [None]:
def save_excel(posts, events, output_file_name):
    """
    Write posts and their summaries into a two-column xlsx file.

    Column A receives the original text and column B the label (summary);
    row 1 is the header and data starts at row 2. An event equal to the
    string 'None' leaves its column-B cell without a value. '.xlsx' is
    appended to the file name when it is missing. Always returns True.
    """
    if output_file_name.endswith('.xlsx'):
        target = output_file_name
    else:
        target = output_file_name + '.xlsx'

    # A fresh Workbook already contains one worksheet, exposed as `active`.
    book = openpyxl.Workbook()
    sheet = book.active
    centered = Alignment(horizontal='center', vertical='center', wrap_text=True)
    for letter in ('A', 'B'):
        sheet.column_dimensions[letter].width = 40.0

    # Header row
    for col, title in enumerate(('原文本数据', '标签数据（摘要）'), start=1):
        header = sheet.cell(row=1, column=col)
        header.value = title
        header.alignment = centered

    # Data rows, directly below the header
    for row, post in enumerate(posts, start=2):
        post_cell = sheet.cell(row=row, column=1)
        post_cell.value = post
        post_cell.alignment = centered

        event = events[row - 2]
        event_cell = sheet.cell(row=row, column=2)
        if event != 'None':
            event_cell.value = event
        event_cell.alignment = centered

    # Persist the workbook
    book.save(filename=target)
    return True

In [None]:
# Shuffle posts and events together so every (post, event) pair stays aligned.
tp = list(zip(posts, events))
random.shuffle(tp)
posts = [pair[0] for pair in tp]
events = [pair[1] for pair in tp]

# Split the shuffled samples across the annotators. The first `extra` files get
# one sample more than the others, so nothing is dropped when the total is not
# a multiple of the number of annotators.
names = ['李帅', '周云弈', '刘熠杨', '周芳妍', '刘天一']
lens = len(posts)
t, extra = divmod(lens, len(names))
start = 0
for idx, name in enumerate(names):
    end = start + t + (1 if idx < extra else 0)
    save_excel(posts[start:end], events[start:end], f'./dataset/text_summary/text_summary_{name}.xlsx')
    start = end

#### 通过模型生成文本摘要导出

In [None]:
import torch
import random
from tqdm import tqdm
from transformers import BartForConditionalGeneration, AutoTokenizer

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code; only load checkpoints that come from a trusted source.
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU can still be loaded on a CPU-only machine.
model = torch.load('./models/text_summary.model', map_location=device)
tokenizer = AutoTokenizer.from_pretrained('IDEA-CCNL/Randeng-BART-139M-SUMMARY')

def make_summary_from_execel(model, path):
    """Fill column 2 of a workbook with model-generated summaries of column 1.

    The workbook at ``path`` is loaded, every row from row 2 up to and
    including the last row is summarised with ``model`` (using the
    module-level ``tokenizer`` and ``device``), and the workbook is saved
    back to the same path. Rows whose column-1 cell is empty are left
    untouched.

    Args:
        model: Generation model; it is moved to ``device`` before use.
        path: Path of the xlsx workbook to read and overwrite.

    Returns:
        True once the workbook has been saved.
    """
    model = model.to(device)

    workbook = openpyxl.load_workbook(path)
    table = workbook.active

    # max_row is the index of the last row, so the range end must be
    # max_row + 1; otherwise the final data row is never summarised.
    for row in tqdm(range(2, table.max_row + 1)):
        text = table.cell(row, 1).value
        # Nothing to summarise in an empty cell.
        if text is None or not str(text).strip():
            continue
        inputs = tokenizer.encode_plus(str(text), return_tensors='pt')
        generated = model.generate(inputs['input_ids'].to(device), max_length=128, do_sample=False)[0]
        # Remove the end-of-sequence marker from the decoded text.
        table.cell(row, 2).value = tokenizer.decode(generated).replace('</s>', '').strip()

    workbook.save(path)
    return True

# make_summary_from_execel(model, './dataset/text_summary/self_summary/text_summary_李帅.xlsx')

### 命名实体识别数据导出

In [None]:
import os
from tqdm import tqdm

def get_summary(root):
    """Collect the non-empty column-2 texts from every workbook in a directory.

    Only regular files with an Excel workbook extension are opened;
    sub-directories, unrelated files and Excel's ``~$`` lock files are
    skipped instead of crashing the loader. Files are visited in sorted
    name order so the result is reproducible.

    Args:
        root: Directory containing the workbooks.

    Returns:
        A list with the column-2 value (as text) of every row from row 2
        onwards whose cell is not empty or whitespace-only.
    """
    workbook_exts = ('.xlsx', '.xlsm', '.xltx', '.xltm')
    summary = []
    # os.listdir returns entries in arbitrary order; sort for determinism.
    for p in sorted(os.listdir(root)):
        full_path = os.path.join(root, p)
        if p.startswith('~$') or not p.lower().endswith(workbook_exts):
            continue
        if not os.path.isfile(full_path):
            continue

        workbook = openpyxl.load_workbook(full_path)
        try:
            table = workbook.active
            for row in tqdm(range(2, table.max_row + 1)):
                value = table.cell(row, 2).value
                if value is None:
                    continue
                # Cells typed as numbers come back as int/float; keep them as text.
                text = value if isinstance(value, str) else str(value)
                if text.strip():
                    summary.append(text)
        finally:
            # Close the workbook even if reading a row fails.
            workbook.close()
    return summary

In [None]:
# Gather the labeled summaries and write them out, one per line.
summary = get_summary('./dataset/text_summary/self_summary/labeled/')
with open("./dataset/name_recognition/doccano/summary.txt", "w", encoding='utf-8') as out_file:
    out_file.writelines(text + '\n' for text in summary)

### 从原数据到情感分类标签数据导出

In [None]:
import torch
from tqdm import tqdm
from transformers import AutoTokenizer

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
data = get_raw_data('./rawdata/weibo3.json')
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code; only load checkpoints that come from a trusted source.
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
model = torch.load('./models/text_summary2.model', map_location=device).to(device)
tokenizer = AutoTokenizer.from_pretrained('IDEA-CCNL/Randeng-BART-139M-SUMMARY')

def make_comments(item, top=None):
    """Build '<summary>###<comment>' strings for one post.

    A summary of ``item['post']`` is generated with the module-level
    ``model`` and ``tokenizer``. Each entry of ``item['comments']`` whose
    stripped 'content' is non-empty and not the string 'None' is prefixed
    with that summary and '###'. When ``top`` is truthy, collection stops as
    soon as ``top`` strings have been gathered. An empty summary yields an
    empty list.
    """
    encoded = tokenizer.encode_plus(item['post'], return_tensors='pt')
    generated = model.generate(encoded['input_ids'].to(device), max_length=128, do_sample=False)
    summary = tokenizer.decode(generated[0]).replace('</s>', '').strip()

    comments = []
    if not summary:
        return comments

    for entry in item['comments']:
        content = entry['content'].strip()
        if content and content != 'None':
            comments.append(summary + '###' + content)
        # The limit is checked on every iteration, including skipped comments.
        if top and len(comments) >= top:
            break

    return comments

outputs = []
for column in data.values():
    # Shuffle in place, then take the first 50 posts of each column.
    random.shuffle(column)
    for item in tqdm(column[:50]):
        # Keep at most the first 20 usable comments of each post.
        outputs.extend(make_comments(item, top=20))

outputs[:10]

In [None]:
# Write one record per line. The encoding is fixed to UTF-8 so the Chinese
# text is written identically on every platform, matching the other export
# cells in this notebook.
with open('./dataset/attitude_classify/doccano/commtens4.txt', 'w', encoding='utf-8') as f:
    for line in outputs:
        f.write(line + '\n')

### 从原数据到命名实体识别标签数据导出

In [4]:
import torch
from tqdm import tqdm
from transformers import AutoTokenizer

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
data = get_raw_data('./rawdata/weibo3.json')
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code; only load checkpoints that come from a trusted source.
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
model = torch.load('./models/text_summary2.model', map_location=device).to(device)
tokenizer = AutoTokenizer.from_pretrained('IDEA-CCNL/Randeng-BART-139M-SUMMARY')

def generate_summary(item, top=None):
    """Summarise ``item['post']`` with the module-level model and tokenizer.

    Returns a one-element list holding the summary, or an empty list when
    the generated summary is empty. ``top`` is accepted but not used.
    """
    encoded = tokenizer.encode_plus(item['post'], return_tensors='pt')
    generated = model.generate(encoded['input_ids'].to(device), max_length=128, do_sample=False)
    summary = tokenizer.decode(generated[0]).replace('</s>', '').strip()
    return [summary] if summary else []

outputs = []
for column in data.values():
    # Shuffle in place, then take the first 50 posts of each column.
    random.shuffle(column)
    for item in tqdm(column[:50]):
        # generate_summary returns [] for an empty summary, so extend is a no-op then.
        outputs.extend(generate_summary(item))

outputs[:10]

cuda


100%|██████████| 50/50 [00:11<00:00,  4.42it/s]
100%|██████████| 50/50 [00:10<00:00,  4.95it/s]
100%|██████████| 50/50 [00:09<00:00,  5.20it/s]
100%|██████████| 50/50 [00:10<00:00,  4.82it/s]
 92%|█████████▏| 46/50 [00:09<00:00,  4.06it/s]

In [None]:
# Write one summary per line. The encoding is fixed to UTF-8 so the Chinese
# text is written identically on every platform, matching the other export
# cells in this notebook.
with open('./dataset/name_recognition/doccano/summary2.txt', 'w', encoding='utf-8') as f:
    for line in outputs:
        f.write(line + '\n')

### 命名实体识别 jsonl 数据处理：处理触发词与动作同时出现的标注

In [10]:
import json

def remove_repeat(item):
    """Drop non-trigger labels that cover exactly the same span as a trigger.

    Each entry of ``item['label']`` is read as ``[start, end, tag]``. For
    every label tagged '触发词', any other label with the same ``start`` and
    ``end`` but a different tag is removed. Labels are left untouched when
    there is no trigger or when no other label duplicates a trigger span.

    Args:
        item: Record dict; its 'label' list is modified in place.

    Returns:
        The same ``item`` object.
    """
    labels = item.get('label')
    if not labels:
        return item

    trigger_spans = {(lab[0], lab[1]) for lab in labels if lab[2] == '触发词'}
    if trigger_spans:
        # Slice assignment keeps the original list object, so existing
        # references to item['label'] see the cleaned labels.
        labels[:] = [
            lab for lab in labels
            if lab[2] == '触发词' or (lab[0], lab[1]) not in trigger_spans
        ]
    return item


path = "./dataset/name_recognition/all.jsonl"

# Read the whole file first: parse each line, clean its labels and keep the
# re-serialised JSON text in memory.
with open(path, "r", encoding="utf-8") as f:
    data = [
        json.dumps(remove_repeat(json.loads(raw_line)), ensure_ascii=False)
        for raw_line in f
    ]

# Write the cleaned records back to the same file, one per line.
with open(path, "w", encoding="utf-8") as f:
    f.writelines(record + '\n' for record in data)