In [59]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import json
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
from tqdm import tqdm
import matplotlib
matplotlib.style.use('ggplot')
pd.set_option('max_colwidth',50)
import torch
from collections import Counter

from loguru import logger

# 数据的预处理

In [2]:
fname = 'data/processed/train.jsonl'

In [3]:
d = list(json.loads(x) for x in open(fname, 'r', encoding='utf-8').read().strip().split('\n'))
df = pd.DataFrame(d)

## 寻找共现标签并删除

### 寻找共现标签

##### 读取

In [5]:
d = list(json.loads(x) for x in open(fname, 'r', encoding='utf-8').read().strip().split('\n'))
df = pd.DataFrame(d)

In [11]:
b = df['label'].apply(lambda x: 'label_123' in x)

In [15]:
b.to_numpy().astype(int)

array([0, 0, 0, ..., 0, 0, 0])

In [17]:
# 获取所有label的set
label_set = set()
for e in df['label']:
    for e_label in e: label_set.add(e_label)

# 每一个label在句子中出现的布尔向量
# 将布尔向量改为10，作为二进制数转化为int，作为该label的fingerprint
s = df.shape[0]
label2binary = {}  # label -> fingerprint
fp2label = {}
for e in tqdm(label_set):
    pos_bool = df['label'].apply(lambda x: e in x).to_numpy().astype(int).tolist()
    fingerprint = int(''.join(list(str(x) for x in pos_bool)))
    if fingerprint not in fp2label:
        fp2label[fingerprint] = [e]
    else:
        fp2label[fingerprint].append(e)

# 找到fingerprint相同的label
same_groups = []
for v in fp2label.values():
    if len(v) >= 2:
        same_groups.append(v)

100%|██████████| 1399/1399 [00:51<00:00, 27.24it/s]


In [19]:
label2equivalent = {}
for e in same_groups:
    label2equivalent[e[0]] = e[1:]

In [20]:
json.dump(same_groups, open('temp_data/equivalent_labels.json', 'w', encoding='utf-8'))
json.dump(label2equivalent, open('temp_data/label2equivalent.json', 'w', encoding='utf-8'))

### 读取共现标签

In [21]:
same_groups = json.load(open('temp_data/equivalent_labels.json', 'r', encoding='utf-8'))
label2equivalent = json.load(open('temp_data/label2equivalent.json', 'r', encoding='utf-8'))

### 删除df中的一致标签

In [22]:
delete_set = set()
for e in same_groups:
    for d in e[1:]:
        delete_set.add(d)

In [23]:
def update_list(l):
    new_list = []
    for e in l:
        if e not in delete_set:
            new_list.append(e)
    return new_list

In [24]:
filtered_df = df.copy()
filtered_df['label'] = filtered_df['label'].apply(update_list)

In [25]:
filtered_df.to_pickle('temp_data/filtered_df.pkl')

### 读取

In [28]:
filtered_df = pd.read_pickle('temp_data/filtered_df.pkl')

## 分词

### 直接分词

In [29]:
import jieba

In [30]:
df_splitted = filtered_df.copy()
df_splitted['splitted'] = df_splitted['text'].apply(lambda x: list(jieba.cut(x)))

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/v2/_14t7h052kgc9sf4crk2qb000000gn/T/jieba.cache
Loading model cost 0.430 seconds.
Prefix dict has been built successfully.


In [57]:
df_splitted.to_pickle('temp_data/df_splitted.pkl')

### 构造词表

In [32]:
vocabulary = set()
for e in tqdm(df_splitted['splitted']):
    vocabulary = vocabulary.union(set(e))
vocabulary = sorted(list(vocabulary))
vocabulary_idx = {x: i for i, x in enumerate(vocabulary)}
json.dump(vocabulary, open('temp_data/vocabulary.json', 'w', encoding='utf-8'))
json.dump(vocabulary_idx, open('temp_data/vocabulary_idx.json', 'w', encoding='utf-8'))

100%|██████████| 77314/77314 [01:24<00:00, 913.20it/s] 


In [33]:
vocabulary = json.load(open('temp_data/vocabulary.json', 'r', encoding='utf-8'))
vocabulary_idx = json.load(open('temp_data/vocabulary_idx.json', 'r', encoding='utf-8'))

### 构造词语在句子中的出现矩阵

In [46]:
df_splitted['splitted'][2]

['这个',
 '七星',
 '连珠',
 '的',
 '现象',
 '居然',
 '真的',
 '存在',
 '，',
 '我',
 '只',
 '在',
 '小说',
 '上面',
 '看过',
 '，',
 '太',
 '神奇',
 '了',
 '吧',
 '，',
 '至于',
 '其他',
 '的',
 '我',
 '也',
 '不',
 '太',
 '懂',
 '，',
 '科学',
 '现象',
 '还是',
 '需要',
 '很多',
 '依据',
 '证明',
 '的']

In [36]:
# shape=(vocab, sentence cnt)，词语在句子中的出现矩阵
vocab_appear = np.zeros((len(vocabulary), df_splitted.shape[0]), dtype=bool)
for i, row in tqdm(enumerate(df_splitted['splitted'])):
    for w in row:
        w_idx = vocabulary_idx[w]
        vocab_appear[w_idx][i] = True
csr_vocab_appear = csr_matrix(vocab_appear)

77314it [00:01, 61839.15it/s]


In [38]:
np.save('temp_data/csr_vocab_appear', csr_vocab_appear)

# 关键词方案

In [56]:
csr_vocab_appear = np.load('temp_data/csr_vocab_appear.npy', allow_pickle=True)
vocab_appear = csr_vocab_appear.todense()
df_splitted = pd.read_pickle('temp_data/df_splitted.pkl')

ValueError: Object arrays cannot be loaded when allow_pickle=False

## 构造标签矩阵

### 标签表

In [63]:
label_vocabulary = []
for e in tqdm(df_splitted['label']):
    label_vocabulary.extend(e)
label_count = sorted(list(Counter(label_vocabulary).items()))
label_vocabulary = list(x[0] for x in label_count)
label_count = list(x[1] for x in label_count)
label_vocabulary_idx = {x: i for i, x in enumerate(label_vocabulary)}

json.dump(label_vocabulary, open('temp_data/label_vocabulary.json', 'w', encoding='utf-8'))
json.dump(label_vocabulary_idx, open('temp_data/label_vocabulary_idx.json', 'w', encoding='utf-8'))
json.dump(label_count, open('temp_data/label_count.json', 'w', encoding='utf-8'))

100%|██████████| 77314/77314 [00:00<00:00, 1561049.72it/s]


### 标签在句子中的出现

In [64]:
# shape=(label_vocab, sentence cnt) 标签在句子中的出现矩阵
label_appear = np.zeros((len(label_vocabulary), df_splitted.shape[0]), dtype=bool)
for i, row in tqdm(enumerate(df_splitted['label'])):
    for l in row:
        l_idx = label_vocabulary_idx[l]
        label_appear[l_idx][i] = True
csr_label_appear = csr_matrix(label_appear)

77314it [00:00, 897204.51it/s]


In [65]:
np.save('temp_data/csr_label_appear', csr_label_appear)

In [66]:
vocab_appear.shape, label_appear.shape

((87875, 77314), (1023, 77314))

### 验证标签矩阵和词矩阵的正确和一致性

In [None]:
vocab_info = np.where(vocab_appear)
v_idx, s_idx = vocab_info[0], vocab_info[1]
for elem_vidx, elem_sidx in tqdm(list(zip(v_idx, s_idx))):
    cur_word = vocabulary[elem_vidx]
    assert cur_word in df_splitted['splitted'][elem_sidx]

 18%|█▊        | 365590/2012049 [00:01<00:06, 259624.31it/s]

In [None]:
label_info = np.where(label_appear)
l_idx, s_idx = label_info[0], label_info[1]
for elem_lidx, elem_sidx in tqdm(list(zip(l_idx, s_idx))):
    cur_label = label_vocabulary[elem_lidx]
    assert cur_label in df_splitted['label'][elem_sidx]

In [None]:
def frag_matmul(a, b, max_dim=100, verbose=False, use_cuda=False):
    # a = (a1, m), b = (m, b2)
    # 将a1和a2化为多个长为100的block
    a_rows, b_columns = [], []
    
    if verbose:
        logger.info('正在碎片化')
    dim_a, dim_b = a.shape[0], b.shape[1]
    a_frag_cnt = (dim_a + max_dim - 1) // max_dim
    b_frag_cnt = (dim_b + max_dim - 1) // max_dim
    for ia in range(a_frag_cnt):
        a_rows.append(a[ia * max_dim: min((ia + 1) * max_dim, a.shape[0])])
    for ib in range(b_frag_cnt):
        b_columns.append(b[:, ib * max_dim: min((ib + 1) * max_dim, b.shape[1])])
    
    results = []
    if verbose:
        logger.info('正在将碎片进行矩阵乘法')
    if use_cuda:
        for i_r in tqdm(range(a_frag_cnt)):
            results.append([])
            for i_c in range(b_frag_cnt):
                cur_a_row = torch.tensor(a_rows[i_r]).to(torch.float).to('cuda')
                cur_b_column = torch.tensor(b_columns[i_c]).to(torch.float).to('cuda')
                res = torch.matmul(cur_a_row, cur_b_column)
                results[-1].append(np.array(res.cpu(), dtype=int))
    else:
        for i_r in tqdm(range(a_frag_cnt)):
            results.append([])
            for i_c in range(b_frag_cnt):
                results[-1].append(np.matmul(a_rows[i_r], b_columns[i_c]))
    
    
    if verbose: logger.info('正在合并碎片')
    np_rows = []
    for erow in tqdm(results):
        np_rows.append(np.concatenate(erow, axis=1))
    np_result = np.concatenate(np_rows)
    return np_result
    

In [None]:
a = np.random.rand(100, 200)
b = np.random.rand(200, 100)
c = np.matmul(a, b)
d = frag_matmul(a, b, max_dim=30)
(c != d).sum()