In [None]:
import os
import json
import torch
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer

from settings import *
from networks.networks import *

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Load the raw post data. The file contains Chinese text, so the encoding is
# pinned to UTF-8 rather than left to the platform default (which is not UTF-8
# on e.g. Windows); this also matches the UTF-8 writers used when saving results.
with open('./rawdata/weibo3.json', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
model_root_path = './models/'

# Tokenizers are resolved by checkpoint name through AutoTokenizer.
cls_tokenizer = AutoTokenizer.from_pretrained('IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment')
ner_tokenizer = AutoTokenizer.from_pretrained('IDEA-CCNL/Erlangshen-DeBERTa-v2-320M-Chinese', use_fast=False)
smy_tokenizer = AutoTokenizer.from_pretrained('IDEA-CCNL/Randeng-BART-139M-SUMMARY')

# SECURITY: torch.load unpickles complete model objects, which can execute
# arbitrary code. Only load .model files that come from a trusted source.
# map_location=device lets checkpoints that were saved on a GPU be loaded on a
# CPU-only machine (the notebook already falls back to 'cpu' for `device`).
# NOTE(review): recent PyTorch releases (2.6+) default to weights_only=True,
# which rejects full-model pickles; pass weights_only=False there if loading
# fails -- TODO confirm against the installed PyTorch version.
cls_model = torch.load(
    os.path.join(model_root_path, 'attitude_classify.model'), map_location=device
).to(device).eval()
ner_model = torch.load(
    os.path.join(model_root_path, 'named_entity_recognition.model'), map_location=device
).to(device).eval()
smy_model = torch.load(
    os.path.join(model_root_path, 'text_summary.model'), map_location=device
).to(device).eval()

In [None]:
# Module-level accumulators filled while the posts are processed.
# Each EVENTS entry is a dict shaped like
#   {'entities': dict, 'post': str, 'summary': str, 'time': str, 'hot': {int: int}}
# COMMENTS[i] holds the comments collected for EVENTS[i]; REPEATS keeps the
# events that were judged to duplicate an already stored one.
EVENTS, REPEATS, COMMENTS = [], [], []

In [None]:
def is_repeat(event, thresh):
    """Look for a stored event whose entity words overlap enough with `event`.

    All words from every entity list are pooled into one set per event. For
    each entry of the global EVENTS list the overlap ratio is
    ``len(shared words) / min(len(candidate words), len(stored words))``.

    Args:
        event: dict with an 'entities' mapping of label -> list of words.
        thresh: the ratio must be strictly greater than this value to match.

    Returns:
        Index in EVENTS of the first stored event that matches, or -1 when
        no stored event matches.
    """
    def _pooled_words(record):
        # Flatten every entity list of a record into a single set of words.
        return {w for words in record['entities'].values() for w in words}

    candidate_words = _pooled_words(event)

    for idx, stored in enumerate(EVENTS):
        stored_words = _pooled_words(stored)
        shared = candidate_words & stored_words
        if not shared:
            # Nothing in common; this also keeps the division below safe.
            continue
        smaller = min(len(candidate_words), len(stored_words))
        # Overlap above the threshold means the event is a repeat.
        if len(shared) / smaller > thresh:
            return idx
    return -1

In [None]:
def res2entities(res, summary):
    """Group per-token label ids into entity strings.

    Consecutive tokens carrying the same entity label are concatenated into
    one string; a change of label, or a label of 0 or 12, closes the current
    string.

    Args:
        res: object whose ``tolist()`` returns one integer label id per token.
            The first and last ids are dropped (presumably the tokenizer's
            special tokens -- TODO confirm).
        summary: the text the labels were predicted for. After dropping the
            first id, token ``n`` is taken to be ``summary[n]``.
            NOTE(review): this assumes one token per character; confirm
            against the tokenizer, otherwise entities are misaligned or an
            IndexError is raised.

    Returns:
        dict mapping ``ID2ENTITY[1]`` .. ``ID2ENTITY[11]`` to the list of
        extracted strings in order of appearance. All lists are empty when
        there are no tokens between the first and last id.
    """
    entities = {ID2ENTITY[i]: [] for i in range(1, 12)}
    word, last = '', -1

    for n, label in enumerate(res.tolist()[1:-1]):
        if label == 0 or label == 12:
            # Labels 0 and 12 are not entity labels: close any open span.
            if word and last != -1:
                entities[ID2ENTITY[last]].append(word)
                word = ''
            last = -1
            continue

        if label != last and last != -1 and word:
            # The label changed: the previous span is complete.
            entities[ID2ENTITY[last]].append(word)
            word = ''

        word += summary[n]
        last = label

    # Flush the span still open at the end. This relies on the tracked state
    # rather than the loop variable, so an empty label sequence is handled too.
    if word and last != -1:
        entities[ID2ENTITY[last]].append(word)
    return entities

In [None]:
def _merge_entities(target, incoming):
    """Append to each entity list in `target` the words of `incoming` it lacks."""
    for label, words in incoming.items():
        known = target.setdefault(label, [])
        for word in words:
            if word not in known:
                known.append(word)


def static_item(item):
    """Summarise one post, extract its entities and fold it into the globals.

    Steps: compute a heat score, generate a summary with the summary model,
    label the summary with the NER model, then either store the post as a new
    event or merge it into the stored event that `is_repeat` reports.

    Args:
        item: dict with the keys 'post', 'time', 'ip', 'thumbs' and
            'comments' (a list of dicts that each have a 'thumbs' value).

    Returns:
        None. Posts whose stripped text is '' or 'None' are skipped. Otherwise:
        * new event   -> appended to EVENTS, its comments appended to COMMENTS;
        * repeat event -> entities, heat and comments are merged into the
          stored event and the new event is appended to REPEATS.
    """
    # Read the fields of the record.
    post = item['post'].strip()
    post_time = item['time']
    ip = item['ip']
    thumbs = item['thumbs']
    comments = item['comments']

    if post == '' or post == 'None':
        return

    # Heat: post likes times comment count, plus the likes of every comment
    # (comments with a falsy 'thumbs' value are ignored).
    hot = thumbs * len(comments) + sum(c['thumbs'] for c in comments if c['thumbs'])

    # Text summary.
    # NOTE(review): the post is not truncated before encoding; very long posts
    # may exceed what the summary model accepts -- confirm.
    post_inputs = smy_tokenizer.encode_plus(post, return_tensors='pt').to(device)
    generated = smy_model.generate(post_inputs['input_ids'], max_length=128, do_sample=False)[0]
    summary = smy_tokenizer.decode(generated).replace('</s>', '').strip()

    # Named-entity recognition on the summary. This is inference only, so no
    # autograd graph is recorded.
    summary_inputs = ner_tokenizer.encode_plus(
        text=summary,
        return_tensors="pt",
        padding=True,
        add_special_tokens=True,
        return_token_type_ids=False,
    ).to(device)
    with torch.no_grad():
        out = ner_model(**summary_inputs)['cls']
    # NOTE(review): argmax over dim=1 assumes `out` is (tokens, labels) -- confirm.
    res = out.argmax(dim=1)

    # Collect the keywords and build the event record.
    entities = res2entities(res, summary)
    event = {'entities': entities, 'post': post, 'summary': summary, 'time': post_time, 'hot': {ip: hot}}
    idx = is_repeat(event, 0.5)

    if idx == -1:
        # Not a repeat: store the event. The comment list is copied so that
        # later merges do not mutate the caller's input data.
        EVENTS.append(event)
        COMMENTS.append(list(comments))
    else:
        # Repeat: merge into the stored event.
        print('重复')
        stored = EVENTS[idx]
        # Extend the stored entity lists instead of dict.update(), which would
        # replace every list (all labels are always present) and drop the
        # words already collected for the stored event.
        _merge_entities(stored['entities'], event['entities'])
        stored['hot'][ip] = stored['hot'].get(ip, 0) + hot
        COMMENTS[idx].extend(comments)
        REPEATS.append(event)
    

In [None]:
# Feed every record of every top-level group in `data` through static_item,
# showing one progress bar per group.
for key, group_items in data.items():
    for item in tqdm(group_items):
        static_item(item)

### 保存

In [13]:
# Persist the aggregated events and their comments as UTF-8 JSON.
# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError when the folder is missing.
os.makedirs('./output_static', exist_ok=True)

with open('./output_static/events.json', 'w', encoding='utf-8') as f:
    json.dump(EVENTS, f, ensure_ascii=False)

with open('./output_static/comments.json', 'w', encoding='utf-8') as f:
    json.dump(COMMENTS, f, ensure_ascii=False)

In [10]:
# Write one "<post>: <summary>" line per stored event ...
with open('./output_static/events_post.txt', 'w', encoding='utf-8') as f:
    f.writelines(e['post'] + ': ' + e['summary'] + '\n' for e in EVENTS)

# ... and the same for the events that were merged as repeats.
with open('./output_static/repeat.txt', 'w', encoding='utf-8') as f:
    f.writelines(e['post'] + ': ' + e['summary'] + '\n' for e in REPEATS)

In [17]:
# Ten hottest events; an event's heat is the sum of the values in its 'hot' dict.
sorted(EVENTS, key=lambda event: sum(event['hot'].values()), reverse=True)[:10]

[{'entities': {'时间': [],
   '地点': [],
   '触发词': [],
   '人物': ['小宋老师', '大家'],
   '物品': [],
   '动作': ['来', '出'],
   '事物': ['灯谜'],
   '机构': [],
   '数量': [],
   '单位': ['个'],
   '原因': []},
  'post': '提前过元宵，小宋老师来给大家出个灯谜！谜面是元宵[元宵]，谜底是',
  'summary': '小宋老师来给大家出个灯谜',
  'time': '2023-02-04',
  'hot': {0: 664240140}},
 {'entities': {'时间': [],
   '地点': [],
   '触发词': [],
   '人物': [],
   '物品': [],
   '动作': [],
   '事物': ['《少年时代》', '冒险之旅'],
   '机构': [],
   '数量': ['第43'],
   '单位': ['期'],
   '原因': []},
  'post': '概念预告游戏开场，转盘启动。这趟冒险一旦启程，必不会空手而归，让命运带你去找寻少年们沿路埋下的宝藏吧！？！时代少年团队长马嘉祺@时代少年团队长马嘉祺时代少年团丁程鑫@时代少年团丁程鑫时代少年团宋亚轩@时代少年团宋亚轩时代少年团刘耀文@时代少年团刘耀文时代少年团张真源@时代少年团张真源时代少年团严浩翔@时代少年团严浩翔时代少年团贺峻霖@时代少年团贺峻霖',
  'summary': '《少年时代》第43期:冒险之旅',
  'time': '2023-02-03',
  'hot': {18: 496611912}},
 {'entities': {'时间': [],
   '地点': [],
   '触发词': [],
   '人物': [],
   '物品': [],
   '动作': [],
   '事物': ['观点', '博客', '南之默'],
   '机构': [],
   '数量': [],
   '单位': [],
   '原因': []},
  'post': '吃还是不吃这是个问题',
  'summary': '观点·博客·南之默',
  'time': '20