In [1]:
import os
import json
import torch
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer

from settings import *
from networks.networks import *

In [2]:
# Device selection. FORCE_CPU pins inference to the CPU even when CUDA is
# available; set it to False to use the GPU when one is present. (Previously
# the CUDA/CPU choice was computed and then silently overwritten by a second
# assignment, which made the first line dead code.)
FORCE_CPU = True
device = 'cpu' if FORCE_CPU or not torch.cuda.is_available() else 'cuda'

In [3]:
# Load the raw Weibo dump. The file holds Chinese text, so decode it
# explicitly as UTF-8 (the JSON interchange encoding) instead of relying on
# the platform's locale encoding, which is not UTF-8 on every system.
# NOTE(review): assumes the file was written as UTF-8 — confirm with the
# scraper that produced it.
with open('./rawdata/weibo3.json', encoding='utf-8') as f:
    data = json.load(f)

In [7]:
model_root_path = './models/'

# Tokenizers are fetched by name through AutoTokenizer.from_pretrained.
cls_tokenizer = AutoTokenizer.from_pretrained('IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment')
ner_tokenizer = AutoTokenizer.from_pretrained('IDEA-CCNL/Erlangshen-DeBERTa-v2-320M-Chinese', use_fast=False)
smy_tokenizer = AutoTokenizer.from_pretrained('IDEA-CCNL/Randeng-BART-139M-SUMMARY')


def _load_model(filename):
    """Load a whole pickled model from ``model_root_path`` onto ``device`` in eval mode.

    ``map_location=device`` remaps the stored tensors while unpickling, so a
    checkpoint that was saved from a GPU can still be opened on a CPU-only
    machine (without it, ``torch.load`` tries to restore tensors on the
    device they were saved from).

    NOTE(security): these files are full pickles, and unpickling executes
    arbitrary code — only load checkpoints you trust. Newer PyTorch releases
    default ``torch.load`` to ``weights_only=True``, which rejects fully
    pickled modules; on such versions pass ``weights_only=False`` here.
    """
    path = os.path.join(model_root_path, filename)
    return torch.load(path, map_location=device).to(device).eval()


cls_model = _load_model('attitude_classify.model')
ner_model = _load_model('named_entity_recognition.model')
smy_model = _load_model('text_summary.model')

In [8]:
# Module-level registry of events. Each entry is expected to be a dict of the
# form {'entities': dict, 'summary': str}.
EVENTS = list()

In [20]:
def add_event(event):
    """Report keyword overlaps with the stored events, then store ``event``.

    Args:
        event: dict with an ``'entities'`` mapping (entity type -> list of
            strings) and a ``'summary'`` string.

    Side effects:
        * For every event already in the global ``EVENTS`` list that shares
          at least one entity string with ``event``, prints the shared
          strings as a list (order is arbitrary because it comes from a set).
        * Appends ``event`` to ``EVENTS``.

    Returns:
        None.
    """
    # Flatten all entity lists of the new event into one keyword set.
    key_words = set().union(*event['entities'].values())

    for stored in EVENTS:
        shared = key_words & set().union(*stored['entities'].values())
        if shared:
            print(list(shared))

    # Actually register the event. Without this the registry is never
    # populated on a fresh kernel, so no overlap can ever be found.
    # Note: re-running the driver cell appends the same events again;
    # re-run the `EVENTS = []` cell first to start from a clean registry.
    EVENTS.append(event)

In [21]:
def res2entities(res, summary, id2entity=None):
    """Group consecutive, identically-labelled characters of ``summary`` into entities.

    Args:
        res: Per-position label ids, including one leading and one trailing
            position that are dropped before decoding. Either an object with
            ``tolist()`` (e.g. a 1-D tensor) or a plain sequence of ints.
        summary: The text the labels refer to. The label at index ``n``
            (after dropping the first and last positions) is paired with
            ``summary[n]``.
            NOTE(review): this assumes one label per character of
            ``summary`` — confirm against the NER tokenizer; a multi-character
            token would shift the alignment.
        id2entity: Optional mapping from label id to entity-type name.
            Defaults to the module-level ``ID2ENTITY``.

    Returns:
        dict mapping the entity-type name of every id in ``1..11`` to the
        list of extracted strings, in order of appearance.
    """
    if id2entity is None:
        id2entity = ID2ENTITY
    labels = res.tolist() if hasattr(res, 'tolist') else list(res)

    entities = {id2entity[i]: [] for i in range(1, 12)}
    word = ''
    last = -1  # label id of the span currently being built; -1 = no open span

    for n, label in enumerate(labels[1:-1]):
        if label == 0 or label == 12:
            # Ids 0 and 12 are treated as "not part of any entity": they
            # close the open span (if any) and contribute no character.
            if word and last != -1:
                entities[id2entity[last]].append(word)
                word = ''
            last = -1
            continue

        if label != last and last != -1:
            # The label changed without a gap: close the previous span.
            entities[id2entity[last]].append(word)
            word = ''

        word += summary[n]
        last = label

    # Flush a span that runs to the end of the sequence. `last` is used
    # instead of the loop variable so that an input with no inner positions
    # (loop body never executed) does not raise NameError.
    if last != -1:
        entities[id2entity[last]].append(word)
    return entities

In [22]:
def static_item(item):
    """Summarise one post, tag entities in the summary and hand them to ``add_event``.

    Args:
        item: dict describing one scraped post. Only ``item['post']`` is
            read here; the other fields seen in the data (``event``,
            ``time``, ``ip``, ``thumbs``, ``comments``) are not used yet.

    Returns:
        None. Posts that are missing, empty, or the literal string ``'None'``
        are skipped silently.
    """
    # Fetch the post text; tolerate a null value in the JSON.
    post = (item['post'] or '').strip()
    if post == '' or post == 'None':
        return

    # Inference only: disable autograd so no graph is recorded for the
    # forward passes (saves memory and time, results are unchanged).
    with torch.no_grad():
        # Text summarisation.
        post_inputs = smy_tokenizer.encode_plus(post, return_tensors='pt').to(device)
        generated = smy_model.generate(post_inputs['input_ids'], max_length=128, do_sample=False)[0]
        summary = smy_tokenizer.decode(generated).replace('</s>', '').strip()

        # Named-entity recognition on the summary.
        summary_inputs = ner_tokenizer.encode_plus(text=summary, return_tensors="pt", padding=True, add_special_tokens=True, return_token_type_ids=False).to(device)
        out = ner_model(**summary_inputs)['cls']

    # NOTE(review): argmax over dim=1 assumes `out` is (positions, labels)
    # with no batch dimension — confirm against the model's forward().
    res = out.argmax(dim=1)

    # Turn the per-position labels into keyword lists and register the event.
    entities = res2entities(res, summary)
    add_event({'entities': entities, 'summary': summary})
    

In [23]:
# Run the pipeline on the first 10 posts of the '热门' list.
# (This used to be wrapped in `for key in data.keys(): ... break`, which only
# executed the body once and never used `key`; the wrapper is dropped.)
for item in tqdm(data['热门'][0:10]):
    static_item(item)

# Display the last processed item for inspection.
item

 10%|█         | 1/10 [00:01<00:13,  1.45s/it]

['机场航站楼', '偶遇', '鞠婧祎']


 20%|██        | 2/10 [00:02<00:11,  1.46s/it]

['大嫂', '是借位拍', '吻戏']


 30%|███       | 3/10 [00:04<00:10,  1.47s/it]

['后座', '狂补作业', '电动车', '在', '山东', '视频', '小朋友']
['在']
['在']
['视频']


 40%|████      | 4/10 [00:05<00:08,  1.38s/it]

['搞笑', '杨迪']


 50%|█████     | 5/10 [00:07<00:07,  1.42s/it]

['在']
['全球', '即将', '重映', '《泰坦尼克号》', '在']
['在']


 60%|██████    | 6/10 [00:08<00:05,  1.31s/it]

['在']
['在']
['在', '大理双廊', '古镇']


 70%|███████   | 7/10 [00:09<00:03,  1.31s/it]

['等', '家中', '他', '男子', '待', '无人为']


 80%|████████  | 8/10 [00:11<00:02,  1.42s/it]

['女子', '时隔', '找到丢失', '狗狗', '20余天']


 90%|█████████ | 9/10 [00:12<00:01,  1.37s/it]

['姐弟俩', '靠', '垃圾', '捡', '为生']


100%|██████████| 10/10 [00:13<00:00,  1.37s/it]

['视频']
['救援队', '视频']





{'event': '强震已致土耳其叙利亚超4000人遇难',
 'post': '一支在震区的救援队说我们正在向民众民防志愿者和伤员分发食物，然后我们看到这个孩子，护士说他没有家人了，和他在一起的人都死了。我们给了他一根香蕉，他就这样吃了他很累也很渴',
 'time': '2023-02-07',
 'ip': 20,
 'thumbs': 4734,
 'comments': [{'content': '有孩子的人见不得这个，心里堵',
   'time': '2023-02-07',
   'ip': 18,
   'thumbs': 629},
  {'content': '有个疑惑，这个妆容怎么有一点点眼熟啊，灾区孩子被救出来脸上都这样吗？之前白头盔报道的叙利亚难民孩子脸上也是这样',
   'time': '2023-02-07',
   'ip': 0,
   'thumbs': 14},
  {'content': '唉，好可怜，这眼神看着真难受', 'time': '2023-02-07', 'ip': 18, 'thumbs': 335},
  {'content': '妈的看得我要哭了受不了受不了', 'time': '2023-02-07', 'ip': 20, 'thumbs': 371},
  {'content': '希望能有户好人家收养吧', 'time': '2023-02-07', 'ip': 10, 'thumbs': 173},
  {'content': '大量的婴幼儿失去父母，会不会有人把他们贩卖到其他国家。。。',
   'time': '2023-02-07',
   'ip': 14,
   'thumbs': 98},
  {'content': '可怜的娃', 'time': '2023-02-07', 'ip': 0, 'thumbs': 57},
  {'content': '大难不死必有后福', 'time': '2023-02-07', 'ip': 18, 'thumbs': 18},
  {'content': '看不得这个，孩子受罪，亲人没了，今后更难[苦涩][苦涩]',
   'time': '2023-02-07',
   'ip': 18,
   'thumbs': 51},
  {'cont

In [19]:
# Scratch check of the set-intersection idiom: a non-empty intersection is
# truthy, so it can be used directly as the condition.
a = {1, 2, 3, 4, 5, 6}
b = {4, 5, 6, 7, 8, 9}

common = a & b
if common:
    print(list(common))

[4, 5, 6]


In [13]:
# Print every post in the '热门' list whose text mentions '鞠婧祎'.
matching_posts = [entry['post'] for entry in data['热门'] if '鞠婧祎' in entry['post']]
for text in matching_posts:
    print(text)

机场航站楼门口偶遇鞠婧祎鞠婧祎这腰比我腿都细本人真的好美！！


In [24]:
# Inspect the current contents of the global event registry (shown as the
# cell's output because it is the last expression).
EVENTS

[{'entities': {'时间': [],
   '地点': [],
   '触发词': ['偶遇'],
   '人物': ['鞠婧祎'],
   '物品': [],
   '动作': [],
   '事物': ['机场航站楼'],
   '机构': [],
   '数量': [],
   '单位': [],
   '原因': []},
  'summary': '机场航站楼偶遇鞠婧祎'},
 {'entities': {'时间': [],
   '地点': [],
   '触发词': [],
   '人物': ['大嫂'],
   '物品': [],
   '动作': ['是借位拍'],
   '事物': ['吻戏'],
   '机构': [],
   '数量': [],
   '单位': [],
   '原因': []},
  'summary': '大嫂的吻戏是借位拍的'},
 {'entities': {'时间': [],
   '地点': ['山东'],
   '触发词': [],
   '人物': ['小朋友'],
   '物品': ['电动车'],
   '动作': ['在', '狂补作业'],
   '事物': ['视频', '后座'],
   '机构': [],
   '数量': [],
   '单位': [],
   '原因': []},
  'summary': '视频:山东小朋友在电动车后座狂补作业'},
 {'entities': {'时间': [],
   '地点': [],
   '触发词': ['搞笑'],
   '人物': ['杨迪'],
   '物品': [],
   '动作': [],
   '事物': [],
   '机构': [],
   '数量': [],
   '单位': [],
   '原因': []},
  'summary': '杨迪也太搞笑'},
 {'entities': {'时间': ['即将'],
   '地点': [],
   '触发词': ['重映'],
   '人物': [],
   '物品': [],
   '动作': ['在'],
   '事物': ['《泰坦尼克号》', '全球'],
   '机构': [],
   '数量': [],
   '单位': [],
   '原因': []},
