In [9]:
import torch 
import pandas as pd
import numpy as np
from __init__ import *
from src.utils import config
from transformers import BertModel, BertTokenizer, BertConfig, BertForSequenceClassification
from src.DL.models.bert import Config, Model
import json 


In [10]:
df = pd.concat([pd.read_csv(config.root_path + '/data/test_clean.tsv', sep='\t'),
                pd.read_csv(config.root_path + '/data/dev_clean.tsv', sep='\t'),
                pd.read_csv(config.root_path + '/data/train_clean.tsv', sep='\t')]).dropna().reset_index(drop=True)
df.head()

Unnamed: 0,text,label,category_id
0,勇闯 法兰西 此书 的 主人公 罗维孝是 国网 的 一名 退休工人 他 曾 骑车 登上 世界...,文学,0
1,历代 茶 诗集 成宋 金卷 本书 主要 内容 包括 : 丁开 摘句 一首 、 丁带 茶 诗 ...,文学,0
2,"蜗牛 作者 用 整整 一部 诗集 在 探索 旧词 新意 的 核心 问题 , 作者 在 后记 ...",文学,0
3,"点石成金 雕塑 实验教学 美术 实验教学 丛书 点石成金 : 雕塑 实验教学 的 普及 , ...",艺术,8
4,文学 原理 新释 这本 文学 原理 新释 在 历经 寒暑 瑞至 岁末 的 时候 终于 脱稿 ...,文学,0


In [11]:
class Predict(object):
    def __init__(self, model_path=config.root_path + '/model/saved_dict/bert.ckpt', 
                 bert_path=config.root_path + '/model/bert-wwm/',
                 is_cuda=config.is_cuda):
        self.model_path = model_path
        self.tokenizer = BertTokenizer.from_pretrained(bert_path)
        self.is_cuda = is_cuda
        conf = Config(dataset=config.root_path + '/')
        self.model = Model(conf).to(config.device)
        checkpoint = torch.load(self.model_path)
        self.model.load_state_dict(checkpoint, strict=False)
        self.model.eval()
        
    def process_data(self, text, is_cuda=config.is_cuda):
        def padding(indice, max_length, pad_idx=0):
            """
            pad 函数
            注意 token type id 右侧pad是添加1而不是0，1表示属于句子B
            """
            pad_indice = [item + [pad_idx] * max(0, max_length - len(item)) for item in indice]
            return torch.tensor(pad_indice)
        text_dict = self.tokenizer.encode_plus(text,                      # Sentence to encode.
                   add_special_tokens=True,   # Add '[CLS]' and '[SEP]'
                   max_length=config.max_length,             # Pad & truncate all sentences.
                   ad_to_max_length=True,
                   return_attention_mask=True,   # Construct attn. masks.
#                                                    return_tensors='pt',     # Return pytorch tensors.
                   )
    
        input_ids, attention_mask, token_type_ids = text_dict['input_ids'], text_dict['attention_mask'], text_dict['token_type_ids']

        token_ids_padded = padding([input_ids], config.max_length)
        token_type_ids_padded = padding([token_type_ids], config.max_length)
        attention_mask_padded = padding([attention_mask], config.max_length)
        return token_ids_padded, token_type_ids_padded, attention_mask_padded
        
    def predict(self, text):
        token_ids_padded, token_type_ids_padded, attention_mask_padded = self.process_data(text)
        if self.is_cuda:
            token_ids_padded = token_ids_padded.to(torch.device('cuda'))
            token_type_ids_padded = token_type_ids_padded.to(torch.device('cuda'))
            attention_mask_padded = attention_mask_padded.to(torch.device('cuda'))
        outputs = self.model((token_ids_padded, attention_mask_padded, token_type_ids_padded))
        label = torch.max(outputs.data, 1)[1].cpu().numpy()[0]
        score = outputs.data[0][torch.max(outputs.data, 1)[1].cpu().numpy()[0]].cpu().numpy().tolist()
        return label, score

In [12]:
pred = Predict()

I0511 10:15:32.291856 140495557105408 tokenization_utils.py:420] Model name '/home/user10000254/notespace/textClassification/model/bert-wwm/' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased, bert-base-finnish-cased-v1, bert-base-finnish-uncased-v1, bert-base-dutch-cased). Assuming '/home/user10000254/notespace/textClassification/model/bert-wwm/' is a path, a model identifier, or url to a directory containing tokenizer files.
I0511 10:15:32.303939 140495557105408 tokenization_utils.py:449] Didn't find file /home/user10000254/notespace/textClassific

In [13]:
from tqdm import tqdm 
tqdm.pandas()
df1 = df['text'].progress_apply(lambda x: pred.predict(x))

 10%|█         | 29577/294738 [50:45<9:03:19,  8.13it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 14%|█▍        | 41948/294738 [1:12:04<8:27:17,  8.31it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 62%|██████▏   | 183411/294738 [5:24:05<3:18:32,  9.35it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable


In [15]:
df['pred_label'] = df1.apply(lambda x: int(x[0]))
df['pred_score'] = df1.apply(lambda x: x[1])
cat2id = json.load(open(config.root_path + '/data/label2id.json'))
id2cat = {v:k for k, v in cat2id.items()}
df['pred_label_name'] = df['pred_label'].map(id2cat)
# df['label'] = df['label'].map(dct)
# df['pred_label_name'] = df['pred_label_name'].map(dct)
df.head()

Unnamed: 0,text,label,category_id,pred_label,pred_score,pred_label_name
0,勇闯 法兰西 此书 的 主人公 罗维孝是 国网 的 一名 退休工人 他 曾 骑车 登上 世界...,文学,0,0,8.291163,文学
1,历代 茶 诗集 成宋 金卷 本书 主要 内容 包括 : 丁开 摘句 一首 、 丁带 茶 诗 ...,文学,0,0,10.474462,文学
2,"蜗牛 作者 用 整整 一部 诗集 在 探索 旧词 新意 的 核心 问题 , 作者 在 后记 ...",文学,0,0,10.4044,文学
3,"点石成金 雕塑 实验教学 美术 实验教学 丛书 点石成金 : 雕塑 实验教学 的 普及 , ...",艺术,8,24,7.28434,工业技术
4,文学 原理 新释 这本 文学 原理 新释 在 历经 寒暑 瑞至 岁末 的 时候 终于 脱稿 ...,文学,0,0,9.938389,文学


In [16]:
df2 = df[df['pred_label_name'] != df['label']]

In [17]:
df[df['pred_label_name'] != df['label']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37336 entries, 3 to 294714
Data columns (total 6 columns):
text               37336 non-null object
label              37336 non-null object
category_id        37336 non-null int64
pred_label         37336 non-null int64
pred_score         37336 non-null float64
pred_label_name    37336 non-null object
dtypes: float64(1), int64(2), object(3)
memory usage: 2.0+ MB


In [23]:
from sklearn import metrics
print(metrics.accuracy_score(df['category_id'], df['pred_label']))
print(metrics.classification_report(df['category_id'], df['pred_label'], target_names=[x.strip() for x in open(
            '../../data/class.txt').readlines()]   , digits=4))
metrics.confusion_matrix(df['category_id'], df['pred_label'])

0.8733247833669225
             precision    recall  f1-score   support

         少儿     0.9290    0.9669    0.9476     68307
         教材     0.8809    0.7923    0.8343      8336
         文学     0.6503    0.8422    0.7339      5686
       工业技术     0.8556    0.9050    0.8796     31237
      中小学教辅     0.9340    0.8800    0.9062      7458
         艺术     0.9334    0.8152    0.8703      9113
       社会科学     0.6859    0.3603    0.4725      3594
         小说     0.8338    0.8554    0.8445     10048
      计算机网络     0.8766    0.8711    0.8738     12106
         管理     0.8427    0.8005    0.8211      1860
         建筑     0.9378    0.9443    0.9410     30154
         外语     0.8245    0.7804    0.8018      1571
         历史     0.7958    0.8352    0.8150      6937
         法律     0.7218    0.6339    0.6750     10471
       政治军事     0.9682    0.8990    0.9323      4099
       哲学宗教     0.9312    0.8764    0.9030      4524
         经济     0.8301    0.8018    0.8157      2346
         医学     0.9138    

  .format(len(labels), len(target_names))


array([[66046,    10,   190, ...,    25,     2,     7],
       [   25,  6605,   178, ...,     1,     0,     0],
       [   90,    17,  4789, ...,     0,     1,     0],
       ...,
       [   24,     1,     0, ...,  1592,    18,     1],
       [    9,     0,     0, ...,    44,  1758,     1],
       [   47,     0,     0, ...,     1,     3,   171]])

In [39]:
def change_label(row):
    text = row['text']
    pred_score = row['pred_score']
    pred_label_name = row['pred_label_name']
    label = row['label']
    res = label
    if ('规划 教材' in text) and (pred_label_name == '大中专教材教辅'):
        res = '大中专教材教辅'
    elif ('文学' == label) and (pred_label_name == '小说'):
        res = '小说'
    elif (('高等职业' in text) or ('大学' in text) or ('高职' in text) or \
          ('中等职业' in text) or ('教育部' in text) or ('高等数学' in text) or\
          ('高等院校' in text) or ('教程' in text) or ('教材' in text) or \
          ('高等院校' in text) or (pred_score > 8.))\
        and (pred_label_name == '大中专教材教辅') and (label != '大中专教材教辅'):
        res = '大中专教材教辅'
    elif ('小说' == label) and (pred_label_name == '文学'):
        res = '文学'
    elif (pred_score > 8.) and (pred_label_name != label):
        res = pred_label_name
    elif ('小说' == label) and (pred_label_name == '文学'):
        res = '文学'
    return res

df['label'] = df.apply(lambda row: change_label(row), axis=1)

In [86]:
df[df['pred_label_name'] != df['label']].shape

(57308, 6)

In [88]:
df = df[(df['pred_score'] > 6.5) & (df['pred_label_name'] == df['label'])]

In [89]:
df.shape

(145734, 6)

In [90]:
df1 = pd.concat([pd.read_csv(config.root_path + '/data/test_clean.tsv', sep='\t'),
                pd.read_csv(config.root_path + '/data/dev_clean.tsv', sep='\t'),
                pd.read_csv(config.root_path + '/data/train_clean.tsv', sep='\t')]).reset_index(drop=True)
df1.shape

(230401, 2)

In [43]:
df.shape

(302730, 6)

In [8]:
df = df[['text', 'label', 'category_id']]
train, dev, test = np.split(df.sample(frac=1), [int(df.shape[0] * 0.7), int(df.shape[0] * 0.9)])
train.to_csv('../../data/train_clean.tsv', sep='\t', index=False)
dev.to_csv('../../data/dev_clean.tsv', sep='\t', index=False)
test.to_csv('../../data/test_clean.tsv', sep='\t', index=False)


In [13]:
import json
df['category_id'] = df['label'].factorize()[0]
category_id_df = df[['label', 'category_id']].drop_duplicates()
with open('../../data/label2id.json', 'w') as f:
    json.dump({k: v for k, v in zip(category_id_df['label'], category_id_df['category_id'])}, f)
        

In [3]:
df = df[~df['label'].isin(['传记', '经济', '两性关系', '社会科学', '孕产妇育儿', '家庭教育'])]

In [24]:
df['label'].value_counts()

文学         68307
大中专教材教辅    31237
童书         30154
工业技术       15579
中小学教辅      13002
艺术         12106
社会科学       10471
小说         10048
计算机与互联网     9211
建筑          9113
管理          8336
外语学习        7458
科学与自然       6937
历史          6891
法律          5908
政治/军事       5686
哲学/宗教       5013
医学          4524
励志与成功       4426
考试          4099
青春文学        3933
文化          3594
农业/林业       3369
动漫          2346
健身与保健       1955
育儿/家教       1860
烹饪/美食       1714
国学/古籍       1623
旅游/地图       1571
科普读物        1503
孕产/胎教       1391
金融与投资       1075
婚恋与两性        298
Name: label, dtype: int64