In [1]:
import os
import json
import torch
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer

from settings import *
from networks.networks import *

In [2]:
# Pick the devices the models will run on; fall back to CPU when CUDA is absent.
# device2 prefers GPU index 2 but is clamped to the highest GPU index that
# actually exists, so machines with fewer than three GPUs still work.
if torch.cuda.is_available():
    device1 = 'cuda:0'
    device2 = 'cuda:{}'.format(min(2, torch.cuda.device_count() - 1))
else:
    device1 = device2 = 'cpu'

### 导入数据与模型

In [3]:
# Load the raw data.
# The encoding is given explicitly (matching the UTF-8 used when this notebook
# writes its outputs) so the read does not depend on the platform default.
with open('./rawdata/weibo3.json', encoding='utf-8') as f:
    data = json.load(f)

In [4]:
# Load the tokenizers and the fine-tuned models
model_root_path = './models/'
cls_tokenizer=AutoTokenizer.from_pretrained('IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment')
ner_tokenizer=AutoTokenizer.from_pretrained('IDEA-CCNL/Erlangshen-DeBERTa-v2-320M-Chinese', use_fast=False)
smy_tokenizer=AutoTokenizer.from_pretrained('IDEA-CCNL/Randeng-BART-139M-SUMMARY')


def _load_model(file_name, device):
    """Load a pickled model from model_root_path, move it to `device`, set eval mode.

    SECURITY: torch.load unpickles the file and can execute arbitrary code;
    only load model files from a trusted source.
    """
    path = os.path.join(model_root_path, file_name)
    # map_location='cpu' lets the checkpoint load even when the GPU it was
    # saved from does not exist here; .to(device) then places it explicitly.
    return torch.load(path, map_location='cpu').to(device).eval()


cls_model = _load_model('attitude_classify.model', device1)
ner_model = _load_model('named_entity_recognition.model', device2)
smy_model = _load_model('text_summary.model', device2)

### 统计事件和心态

In [5]:
# Module-level accumulators filled by static_item.
# Each EVENTS entry has the form:
#   {'entities': dict, 'post': str, 'summary': str, 'time': str, 'hot': {int: int}}
EVENTS = list()    # events judged to be new
REPEATS = list()   # events judged to repeat an entry of EVENTS
COMMENTS = list()  # COMMENTS[i] collects the comments belonging to EVENTS[i]

In [6]:
# Decide whether an event duplicates one already stored in EVENTS
# (returns the index of the matching stored event, or -1 when there is none)
def is_repeat(event, thresh):
    """Return the index of the first stored event that overlaps `event`, else -1.

    Overlap is measured on key words: all words of all entity types are pooled
    into a set for each event, and the size of the intersection is divided by
    the size of the smaller set. A ratio strictly greater than `thresh` counts
    as a repeat.

    Args:
        event: dict whose 'entities' value maps entity type -> list of words.
        thresh: overlap ratio that must be exceeded.

    Returns:
        Index into EVENTS of the first match, or -1.
    """
    incoming = {word for words in event['entities'].values() for word in words}

    for index, known_event in enumerate(EVENTS):
        existing = {word for words in known_event['entities'].values() for word in words}
        shared = incoming.intersection(existing)
        # No common key word at all: cannot be a repeat (also keeps the
        # denominator below from being zero).
        if not shared:
            continue
        smaller = min(len(incoming), len(existing))
        if len(shared) / smaller > thresh:
            return index
    return -1

In [7]:
# Convert the NER model output into a dict mapping entity type -> key words
def res2entities(res, summary):
    """Group consecutive same-label characters of `summary` into entity words.

    Args:
        res: per-token label ids exposing `.tolist()`; the first and last
            positions are dropped before alignment.
            NOTE(review): position n of the remaining labels is paired with
            summary[n], which assumes one token per character - confirm
            against the tokenizer.
        summary: the text the labels were predicted for.

    Returns:
        dict with one key per entity type (ID2ENTITY[1] .. ID2ENTITY[11]),
        each mapping to the list of words found for that type.
    """
    entities = {ID2ENTITY[i]: [] for i in range(1,12)}
    word = ''   # characters of the entity currently being built
    last = -1   # label id of that entity; -1 means no entity is open

    for n, label in enumerate(res.tolist()[1:-1]):
        # 0 is None, 12 is [CLS] / [PAD] / [SEP]: these close any open entity
        if(label == 0 or label == 12):
            if(len(word) != 0 and last != -1):
                entities[ID2ENTITY[last]].append(word)
                word = ''
            last = -1
            continue
        # The label changed from one entity type to another: close the old one
        if(label != last and last != -1 and len(word) != 0):
            entities[ID2ENTITY[last]].append(word)
            word = ''

        word += summary[n]
        last = label

    # Flush the entity still open at the end. This is decided from
    # `last`/`word`, not the loop variable, so an empty label sequence
    # (e.g. an empty summary) no longer raises NameError.
    if(last != -1 and len(word) != 0):
        entities[ID2ENTITY[last]].append(word)
    return entities

In [8]:
# Classify the attitude of every comment attached to one event
def att_cls(event, comments):
    """Annotate each comment dict with an 'attitude' label, in place.

    Args:
        event: text describing the event; paired with every comment's
            'content' as the classifier input.
        comments: list of comment dicts, each with a 'content' entry.

    Returns:
        The same `comments` list, each dict now carrying an 'attitude' entry
        taken from ID2ATTITUDE.
    """
    # Collate one mini-batch: tokenize the (event, comment) pairs together
    def collate_fn(event, data):
        inputs = cls_tokenizer.batch_encode_plus(
                [[event, c['content']] for c in data], return_tensors='pt', padding=True, add_special_tokens=True, return_token_type_ids=True
            ).to(device1)

        return data, inputs
    # Feed the comments in batches (avoids running out of GPU memory)
    loader = torch.utils.data.DataLoader(
        dataset=comments, batch_size=32, collate_fn=lambda batch: collate_fn(event, batch), shuffle=False, drop_last=False
        )

    # Pure inference: disable autograd so no graph is kept for each batch
    with torch.no_grad():
        for batch_comments, inputs in loader:
            out = cls_model(**inputs)['cls']
            # One host transfer per batch instead of one .item() per comment
            labels = out.argmax(dim=1).tolist()
            # Write each predicted attitude back into its comment dict
            for comment, label in zip(batch_comments, labels):
                comment['attitude'] = ID2ATTITUDE[label]

    return comments

In [9]:
def static_item(item):
    """Process one raw post and fold it into EVENTS / COMMENTS / REPEATS.

    Steps: compute a heat score, summarize the post, run NER on the summary to
    get key words, check whether the event repeats a stored one, classify the
    comments' attitudes, then store the event as new or merge it into the
    matching stored event.

    Args:
        item: dict with 'post', 'time', 'ip', 'thumbs' and 'comments' entries;
            each comment is a dict with 'content' and 'thumbs'.

    Returns:
        None. Posts that are empty or the literal string 'None' are skipped.
    """
    # Unpack the fields that are used
    post = item['post'].strip()
    time = item['time']
    ip = item['ip']
    thumbs = item['thumbs']
    comments = item['comments']

    if(post == '' or post == 'None'):
        return

    # Heat score: post thumbs weighted by comment count, plus comment thumbs
    # (comments whose thumbs value is falsy, e.g. None, contribute nothing)
    hot = thumbs * len(comments) + sum(c['thumbs'] for c in comments if(c['thumbs']))

    # Pure inference: disable autograd for the forward passes
    with torch.no_grad():
        # Text summary
        post_inputs = smy_tokenizer.encode_plus(post, return_tensors='pt').to(device2)
        generated = smy_model.generate(post_inputs['input_ids'], max_length=128, do_sample=False)[0]
        summary = smy_tokenizer.decode(generated).replace('</s>', '').strip()
        # Named entity recognition on the summary
        summary_inputs = ner_tokenizer.encode_plus(text=summary, return_tensors="pt", padding=True, add_special_tokens=True, return_token_type_ids=False).to(device2)
        out = ner_model(**summary_inputs)['cls']
        res = out.argmax(dim=1)

    # Key words per entity type
    entities = res2entities(res, summary)
    event = {'entities': entities, 'post':post, 'summary': summary, 'time': time, 'hot': {ip: hot}}
    idx = is_repeat(event, 0.5)
    # Attitude analysis of the comments
    comments = att_cls(summary, comments)
    # Not a repeat: store as a new event with its comments
    if(idx == -1):
        EVENTS.append(event)
        COMMENTS.append(comments)
    # Repeat: merge into the stored event
    else:
        known = EVENTS[idx]
        # Merge key words per entity type. dict.update would replace the stored
        # lists with the new event's lists and drop the earlier key words.
        for ent_type, words in event['entities'].items():
            merged = known['entities'].setdefault(ent_type, [])
            for word in words:
                if word not in merged:
                    merged.append(word)
        # Accumulate the heat score under this ip key
        known['hot'][ip] = known['hot'].get(ip, 0) + hot
        COMMENTS[idx] += comments
        REPEATS.append(event)
    

In [10]:
# Walk through every group in data and fold each item into the event statistics
for items in data.values():
    for item in tqdm(items):
        static_item(item)

100%|██████████| 284/284 [02:50<00:00,  1.66it/s]
100%|██████████| 484/484 [03:48<00:00,  2.12it/s]
100%|██████████| 114/114 [00:40<00:00,  2.79it/s]
100%|██████████| 692/692 [06:51<00:00,  1.68it/s]
100%|██████████| 363/363 [02:52<00:00,  2.10it/s]
100%|██████████| 282/282 [01:47<00:00,  2.61it/s]
100%|██████████| 265/265 [01:27<00:00,  3.02it/s]
100%|██████████| 680/680 [04:57<00:00,  2.29it/s]


### 统计心态

In [12]:
# Preview only the first ten analysed comments of the first event,
# rather than dumping the whole list into the notebook output
COMMENTS[0][:10]

[{'content': '是真的头大，矮啊',
  'time': '2023-02-06',
  'ip': 18,
  'thumbs': 27,
  'attitude': '肯定'},
 {'content': '是那个三千年美少女？？？别扯淡了不是国家承认的就别丢人现眼',
  'time': '2023-02-07',
  'ip': 6,
  'thumbs': 2,
  'attitude': '无聊'},
 {'content': '对矮的女的没感觉',
  'time': '2023-02-07',
  'ip': 9,
  'thumbs': 0,
  'attitude': '肯定'},
 {'content': '快快乐乐',
  'time': '2023-02-07',
  'ip': 21,
  'thumbs': 1,
  'attitude': '高兴'},
 {'content': '像个小孩瘦的只剩骨架了',
  'time': '2023-02-07',
  'ip': 22,
  'thumbs': 0,
  'attitude': '惊讶'},
 {'content': '很早的吧',
  'time': '2023-02-07',
  'ip': 13,
  'thumbs': 0,
  'attitude': '肯定'},
 {'content': '在人群中也就是正常身高，没必要嘲笑人家矮',
  'time': '2023-02-07',
  'ip': 18,
  'thumbs': 4,
  'attitude': '无所谓'},
 {'content': '精神小妹',
  'time': '2023-02-07',
  'ip': 18,
  'thumbs': 4,
  'attitude': '肯定'},
 {'content': '不认识',
  'time': '2023-02-07',
  'ip': 9,
  'thumbs': 2,
  'attitude': '无聊'},
 {'content': '小鞠美美哒，就是穿的不冷麻，宝贝要注意保暖哦',
  'time': '2023-02-07',
  'ip': 9,
  'thumbs': 1,
  'attitude': '肯定'},

### 保存

In [11]:
# Make sure the output directory exists; open(..., 'w') does not create it
os.makedirs('./output_static', exist_ok=True)

# ensure_ascii=False keeps the non-ASCII text readable in the saved files
with open('./output_static/events.json', 'w', encoding='utf-8') as f:
    json.dump(EVENTS, f, ensure_ascii=False)

with open('./output_static/comments.json', 'w', encoding='utf-8') as f:
    json.dump(COMMENTS, f, ensure_ascii=False)

In [None]:
# Make sure the output directory exists; open(..., 'w') does not create it
os.makedirs('./output_static', exist_ok=True)

# One "post: summary" line per event, for events kept as new ...
with open('./output_static/events_post.txt', 'w', encoding='utf-8') as f:
    for e in EVENTS:
        f.write(e['post'] + ': ' + e['summary'] + '\n')

# ... and for events that were judged to be repeats
with open('./output_static/repeat.txt', 'w', encoding='utf-8') as f:
    for e in REPEATS:
        f.write(e['post'] + ': ' + e['summary'] + '\n')

In [None]:
# Ten hottest events, ranked by the heat score summed over all ip keys
sorted(EVENTS, key=lambda event: sum(event['hot'].values()), reverse=True)[:10]