In [None]:
import os
import json
import torch
import pandas
from ltp import LTP
from tqdm import tqdm

### 读取本地数据

In [None]:
def get_raw_data_sentences(path):
    """Collect the non-empty post and comment texts from one raw-data JSON file.

    The decoded JSON must be a mapping whose values are iterables of records.
    Each record is read through its ``'post'`` string and its ``'comments'``
    list, where every comment exposes a ``'content'`` string.

    Args:
        path: Path of a UTF-8 encoded JSON file.

    Returns:
        A list of stripped strings in file order: each post is followed by its
        own comments. Texts that are empty, or equal to the literal ``'None'``
        after stripping, are skipped.
    """
    # Use a context manager so the file handle is closed deterministically;
    # this function is called once per raw file and must not leak handles.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    sentences = []
    for records in data.values():
        for record in records:
            # The post comes first, then its comments, so output order
            # mirrors the order in the file.
            texts = [record['post']]
            texts.extend(comment['content'] for comment in record['comments'])
            for text in texts:
                sentence = text.strip()
                # 'None' is treated as a placeholder for missing text.
                if sentence != '' and sentence != 'None':
                    sentences.append(sentence)
    return sentences


# Walk every entry of the raw-data directory and pool the sentences of all files.
root = './rawdata/'
sentences = []
for p in os.listdir(root):
    path = os.path.join(root, p)
    sentences.extend(get_raw_data_sentences(path))

### 统计词频

In [None]:
# Instantiate LTP with its default constructor arguments.
ltp = LTP()
# Move the model to the GPU only when PyTorch reports a usable CUDA device.
if torch.cuda.is_available():
    ltp.to("cuda")

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Minimal map-style dataset that exposes an indexable container as-is."""

    def __init__(self, data):
        """Keep a reference to ``data``; no copy or preprocessing is done."""
        super().__init__()
        self.data = data

    def __len__(self):
        """Return the number of items in the wrapped container."""
        size = len(self.data)
        return size

    def __getitem__(self, idx):
        """Return the item stored at position ``idx``, unchanged."""
        item = self.data[idx]
        return item
    
# Wrap the collected sentences and serve them in fixed-size batches;
# the final, possibly shorter batch is kept (drop_last=False).
dataset = Dataset(sentences)
loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=1024,
    drop_last=False,
)

In [None]:
# Accumulates one pair per token across all batches.
final_words = []
for inputs in tqdm(loader):
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        words = ltp.pipeline(inputs, tasks=["cws", "pos"], return_dict=False)
    # NOTE(review): presumably words[0] holds the per-sentence token lists and
    # words[1] the matching POS-tag lists (same order as `tasks`) - confirm
    # against the LTP version in use.
    for w in zip(words[0], words[1]):
        final_words.extend(zip(w[0], w[1]))

# Count identical pairs; the result is a list of ((col0, col1), count) tuples
# in the order produced by DataFrame.value_counts().
final_words = list(pandas.DataFrame(final_words).value_counts().items())

# with open("./dataset/key_words/keywords1.txt", "w", encoding="utf-8") as f:
#     for i in final_words:
#         f.write(i[0][0] + ',' + i[0][1] + ',' + str(i[1]) + "\n")

### 处理关键词

In [None]:
# Load the stop words: one entry per line, surrounding whitespace removed,
# stored as a set for fast membership tests.
with open('./dataset/key_words/stopwords/stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = {line.strip() for line in f}

In [None]:
# Load the keyword table: every line is stripped and split on commas into a
# list of string fields.
# NOTE(review): a field that itself contains a comma would be split into extra
# columns - verify the file never contains such entries.
with open('./dataset/key_words/keywords.txt', 'r', encoding='utf-8') as f:
    keywords = [line.strip().split(',') for line in f]

In [None]:
# Keep rows whose first field is not a stop word and is longer than one
# character, and whose second field is one of the listed tags; then order the
# survivors by the integer value of the third field, largest first.
# NOTE(review): the tags are presumably LTP POS labels (place name, noun,
# person name, verb) - confirm against the LTP tag set.
filted = sorted(
    (
        row
        for row in keywords
        if row[0] not in stopwords
        and row[1] in ['ns', 'n', 'nh', 'v']
        and len(row[0]) > 1
    ),
    key=lambda row: int(row[2]),
    reverse=True,
)

In [None]:
# Show the filtered, sorted keyword rows as this cell's output.
filted