In [36]:
import json
import jieba
import numpy as np
from pypinyin import lazy_pinyin
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras import optimizers
import numpy as np

# Read File

In [37]:
def read_json(file):
    f = open(file,'r')
    data = []
    for i in f:
        data.append(json.loads(i))
    f.close()
    return data

In [38]:
def parse_data(file):
    data = read_json(file)
    topics = []
    sentences = []
    for i in data:
        topics.append(i['label'])
        sentences.append(i['sentence'])
    return topics, sentences

In [39]:
# show topics
data = read_json('labels.json')
dic_topic = {}
for i in data:
    dic_topic[i['label']] = i['label_desc']
dic_topic

{'100': 'news_story',
 '101': 'news_culture',
 '102': 'news_entertainment',
 '103': 'news_sports',
 '104': 'news_finance',
 '106': 'news_house',
 '107': 'news_car',
 '108': 'news_edu',
 '109': 'news_tech',
 '110': 'news_military',
 '112': 'news_travel',
 '113': 'news_world',
 '114': 'news_stock',
 '115': 'news_agriculture',
 '116': 'news_game'}

In [40]:
train_topics, train_sentences = parse_data('train.json')
test_topics, test_sentences = parse_data('dev.json')

# Create Dictionary and Tokenizer of Words That Ever Appear in Either Dataset

In [41]:
def create_dic(text):
    dic = {}
    string = ' '.join(jieba.cut(text, HMM=False))
    list_of_terms = string.split()
    for term in list_of_terms:
        if term not in dic and term.isalpha():
            ls = lazy_pinyin(term)
            dic[term] = ''.join(ls)
    return dic

In [42]:
tot_dic = create_dic(''.join(train_sentences+test_sentences)) # dictionary over both whole dataset
tot_dic

{'上课时': 'shangkeshi',
 '学生': 'xuesheng',
 '手机': 'shouji',
 '响': 'xiang',
 '个': 'ge',
 '不停': 'buting',
 '老师': 'laoshi',
 '一怒之下': 'yinuzhixia',
 '把': 'ba',
 '摔': 'shuai',
 '了': 'le',
 '家长': 'jiazhang',
 '拿': 'na',
 '发票': 'fapiao',
 '让': 'rang',
 '赔': 'pei',
 '大家': 'dajia',
 '怎么': 'zenme',
 '看待': 'kandai',
 '这种': 'zhezhong',
 '事': 'shi',
 '商': 'shang',
 '赢': 'ying',
 '环球': 'huanqiu',
 '股份': 'gufen',
 '有限公司': 'youxiangongsi',
 '关于': 'guanyu',
 '延期': 'yanqi',
 '回复': 'huifu',
 '上海证券交易所': 'shanghaizhengquanjiaoyisuo',
 '对': 'dui',
 '公司': 'gongsi',
 '年': 'nian',
 '年度报告': 'niandubaogao',
 '的': 'de',
 '事后': 'shihou',
 '审核': 'shenhe',
 '问询': 'wenxun',
 '函': 'han',
 '公告': 'gonggao',
 '通过': 'tongguo',
 '中介': 'zhongjie',
 '买': 'mai',
 '二手房': 'ershoufang',
 '首付': 'shoufu',
 '都': 'dou',
 '付': 'fu',
 '现在': 'xianzai',
 '卖家': 'maijia',
 '不想': 'buxiang',
 '卖': 'mai',
 '处理': 'chuli',
 '去': 'qu',
 '俄罗斯': 'eluosi',
 '看': 'kan',
 '世界杯': 'shijiebei',
 '得': 'de',
 '花': 'hua',
 '多少': 'duoshao',
 '钱': 'qian',
 '剃

In [44]:
len(tot_dic)

51647

In [45]:
def tokenization(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [46]:
tot_pinyin = []
for i in tot_dic:
    tot_pinyin.append(tot_dic[i])
pinyin_tokenizer = tokenization(tot_pinyin)
pinyin_vocab_size = len(pinyin_tokenizer.word_index)+1
pinyin_tokenizer.word_index

{'yi': 1,
 'ji': 2,
 'shi': 3,
 'yu': 4,
 'fu': 5,
 'wei': 6,
 'jian': 7,
 'zhi': 8,
 'qi': 9,
 'yan': 10,
 'xi': 11,
 'li': 12,
 'shu': 13,
 'wu': 14,
 'ju': 15,
 'jing': 16,
 'xian': 17,
 'you': 18,
 'jie': 19,
 'xiao': 20,
 'jiao': 21,
 'yuan': 22,
 'zhu': 23,
 'han': 24,
 'ying': 25,
 'jin': 26,
 'lu': 27,
 'gu': 28,
 'di': 29,
 'yao': 30,
 'yin': 31,
 'qian': 32,
 'xu': 33,
 'shishi': 34,
 'xiang': 35,
 'hui': 36,
 'bi': 37,
 'zhen': 38,
 'ba': 39,
 'hu': 40,
 'cheng': 41,
 'he': 42,
 'hao': 43,
 'mi': 44,
 'xun': 45,
 'ke': 46,
 'xie': 47,
 'ya': 48,
 'bo': 49,
 'mo': 50,
 'ye': 51,
 'mei': 52,
 'du': 53,
 'jia': 54,
 'ge': 55,
 'ban': 56,
 'yang': 57,
 'jiang': 58,
 'zheng': 59,
 'yong': 60,
 'feng': 61,
 'dai': 62,
 'xuan': 63,
 'fan': 64,
 'lv': 65,
 'pu': 66,
 'ling': 67,
 'tan': 68,
 'qu': 69,
 'lian': 70,
 'huan': 71,
 'tong': 72,
 'chi': 73,
 'pi': 74,
 'zhang': 75,
 'ai': 76,
 'bian': 77,
 'ni': 78,
 'xin': 79,
 'bu': 80,
 'shou': 81,
 'bei': 82,
 'e': 83,
 'wan': 84,
 'b

In [47]:
pinyin_vocab_size

40887

In [48]:
hanzi_tokenizer = tokenization(tot_dic)
hanzi_vocab_size = len(hanzi_tokenizer.word_index)+1
pinyin_tokenizer.word_index

{'yi': 1,
 'ji': 2,
 'shi': 3,
 'yu': 4,
 'fu': 5,
 'wei': 6,
 'jian': 7,
 'zhi': 8,
 'qi': 9,
 'yan': 10,
 'xi': 11,
 'li': 12,
 'shu': 13,
 'wu': 14,
 'ju': 15,
 'jing': 16,
 'xian': 17,
 'you': 18,
 'jie': 19,
 'xiao': 20,
 'jiao': 21,
 'yuan': 22,
 'zhu': 23,
 'han': 24,
 'ying': 25,
 'jin': 26,
 'lu': 27,
 'gu': 28,
 'di': 29,
 'yao': 30,
 'yin': 31,
 'qian': 32,
 'xu': 33,
 'shishi': 34,
 'xiang': 35,
 'hui': 36,
 'bi': 37,
 'zhen': 38,
 'ba': 39,
 'hu': 40,
 'cheng': 41,
 'he': 42,
 'hao': 43,
 'mi': 44,
 'xun': 45,
 'ke': 46,
 'xie': 47,
 'ya': 48,
 'bo': 49,
 'mo': 50,
 'ye': 51,
 'mei': 52,
 'du': 53,
 'jia': 54,
 'ge': 55,
 'ban': 56,
 'yang': 57,
 'jiang': 58,
 'zheng': 59,
 'yong': 60,
 'feng': 61,
 'dai': 62,
 'xuan': 63,
 'fan': 64,
 'lv': 65,
 'pu': 66,
 'ling': 67,
 'tan': 68,
 'qu': 69,
 'lian': 70,
 'huan': 71,
 'tong': 72,
 'chi': 73,
 'pi': 74,
 'zhang': 75,
 'ai': 76,
 'bian': 77,
 'ni': 78,
 'xin': 79,
 'bu': 80,
 'shou': 81,
 'bei': 82,
 'e': 83,
 'wan': 84,
 'b

In [49]:
hanzi_vocab_size

51297

# Prepare Train and Test Dataframes

In [50]:
def by_topic(topics, sentences):
    dataset_by_topic = {}
    for i in range(len(topics)):
        try:
            dataset_by_topic[topics[i]].append(sentences[i])
        except:
            dataset_by_topic[topics[i]] = [sentences[i]]
    return dataset_by_topic

In [60]:
train_by_topic = by_topic(train_topics, train_sentences)
test_by_topic = by_topic(test_topics, test_sentences)
train_by_topic['108']

['上课时学生手机响个不停，老师一怒之下把手机摔了，家长拿发票让老师赔，大家怎么看待这种事？',
 '肥乡区：让文明新风吹进千家万户',
 '学生党买什么笔记本电脑好？',
 '如何看待人民日报发文痛批“沉睡中的大学生:你不失业，天理难容！”这件事？',
 '南京晓庄五年一贯制专转本培训简章 2019年博大五年制专转本辅导',
 '驾驶证考试预约是按什么规则进行排序的？',
 '省“双创计划”项目申报工作启动',
 '关于人工智能方面的研究生专业有哪些？',
 '大揭秘：特种部队“手势语”图解',
 '学校或老师有没有权利没收学生手机？',
 '. Sandro Botticelli',
 '人之初性本善、还是性本恶？',
 '在加拿大可以赖账不还？3万名学生赖帐2亿，加拿大政府一笔勾销',
 '河南灵宝：「实验二小」首届“励耘杯”优秀教师事迹展播（七）',
 '当孩子问读书有什么用？这是我见过最好的回答！',
 '上联：大学生，研究生，博士生，三生有幸。下联应如何对？',
 '九江学院附属医院顺利开展2018年兼职班主任竞聘选拔',
 '整体上讲，中国大学同美国大学比较起来，主要的差距在哪？',
 '有哪些能让人流泪的句子？',
 '济南一小学要设"无作业日"，家长怎么看？',
 '蒲城县林则徐纪念馆李永红喜获渭南市“五一”劳动奖章',
 '如何轻松过四级？',
 '初中生上什么技校？',
 '丰安轮滑，尽显风采',
 '学计算机的已经烂大街了吗？',
 '如何看待有骨气的曹汝霖？',
 '告别午后枯燥，全新南昌教育资讯',
 '初中毕业零基础，去技术学校学计算机编程怎么样？这个行业现在可行吗？',
 '十里春风不如你，小玉为何自杀？',
 '你见过最努力的人现在都混成什么样子了？',
 '有哪些申请签证的技巧？',
 '高一因病在家休学一年，应该做什么事？',
 '“汉语桥”阿富汗赛区比赛落幕',
 '全国职业教育“奥林匹克”开赛 福州三学子摘金',
 '云梦县会融入孝感，成为孝感的一个区吗？',
 '省教育厅公示：南通这些教师拟选拔为优秀青年骨干、中青年学术带头人！资助科研经费4-10万元不等',
 '奥巴马任总统时每年都要去几次的“公立常春藤”大学就是它',
 '云波小学首届“校长杯”校园足球联赛开赛',
 '2018初级会计职称考试题型有哪些？怎么答？',

In [52]:
def X_Y_split(sentences):
    Y = []
    for sentence in sentences:
        Y.append([])
        string = ' '.join(jieba.cut(sentence, HMM=False))
        words = string.split()
        for word in words:
            if word.isalpha():
                Y[-1].append(word)
    X = []
    for sentence in Y:
        X.append([])
        for word in sentence:
            ls = lazy_pinyin(word)
            X[-1].append(''.join(ls))
    return X, Y

In [53]:
def encode_sequences(tokenizer, length, lines):
    seq = tokenizer.texts_to_sequences(lines)
    seq = pad_sequences(seq, maxlen=length, padding='post')
    return seq

In [54]:
X_length = 68 # max number of words in a sentence.
Y_length = 68

def X_Y_by_topic(dataset_by_topic):
    X_by_topic = {}
    Y_by_topic = {}
    for i in dataset_by_topic:
        X, Y = X_Y_split(dataset_by_topic[i])
        X_seq = encode_sequences(pinyin_tokenizer, X_length, X)
        Y_seq = encode_sequences(hanzi_tokenizer, Y_length, Y)
        X_by_topic[i] = X_seq
        Y_by_topic[i] = Y_seq
    return X_by_topic, Y_by_topic

In [55]:
TrainX_by_topic, TrainY_by_topic = X_Y_by_topic(train_by_topic)
TestX_by_topic, TestY_by_topic = X_Y_by_topic(test_by_topic)

In [61]:
TrainX_by_topic['108']

array([[ 5600,  5601,   325, ...,     0,     0,     0],
       [ 1791,    69,   459, ...,     0,     0,     0],
       [ 5601,   226,   257, ...,     0,     0,     0],
       ...,
       [  156,    16,   452, ...,     0,     0,     0],
       [30819, 38083,  4344, ...,     0,     0,     0],
       [38098, 38099,   184, ...,     0,     0,     0]], dtype=int32)

# RNN Models by Topic

In [57]:
def define_model(in_vocab, out_vocab, in_timesteps, out_timesteps, neurons): # num of neurons is chosen
    model = Sequential()
    model.add(Embedding(in_vocab, neurons, input_length=in_timesteps, mask_zero=True))
    model.add(LSTM(neurons))
    model.add(RepeatVector(out_timesteps))
    model.add(LSTM(neurons, return_sequences=True))
    model.add(Dense(out_vocab, activation='Softmax'))
    return model

In [58]:
models = {}
for i in TrainX_by_topic:
    model = define_model(pinyin_vocab_size, hanzi_vocab_size, X_length, Y_length, 100)
    rms = optimizers.RMSprop(lr=0.001)
    model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')
    models[i] = model

# Train Models

Here is a short version for only one topic (108), limited by the computation capability of my computer.

In [62]:
for i in TrainX_by_topic:
    history = models[i].fit(TrainX_by_topic[i], TrainY_by_topic[i], epochs=5, validation_split=0.2)
    break

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Test Models

In [69]:
for i in TrainX_by_topic:
    preds = models[i].predict(TestX_by_topic[i])
    break
preds.shape # 646 sample sentences, at most 68 words in a sentence, 51297 hanzi word choices



(646, 68, 51297)

In [66]:
def get_word(n, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == n:
            return word
    return None

In [73]:
preds_classes = np.argmax(preds, axis=2)
preds_classes

array([[  0,   0,   0, ...,   0,   0,   0],
       [333, 333, 333, ...,   0,   0,   0],
       [333, 333, 333, ...,   0,   0,   0],
       ...,
       [333, 333, 333, ...,   0,   0,   0],
       [398, 333, 333, ...,   0,   0,   0],
       [333, 333, 333, ...,   0,   0,   0]])

In [72]:
# restore pinyin tokens into hanzi words
preds_text = []
for i in preds_classes:
    temp = []
    for j in range(len(i)):
        t = get_word(i[j], hanzi_tokenizer)
        if j > 0:
            if (t == get_word(i[j-1], hanzi_tokenizer)) or (t == None):
                temp.append('')
            else:
                temp.append(t)
        else:
            if (t == None):
                temp.append('')
            else:
                temp.append(t)
        preds_text.append(''.join(temp))

In [74]:
preds_text

['',
 ' ',
 '  ',
 '   ',
 '    ',
 '     ',
 '      ',
 '       ',
 '        ',
 '         ',
 '          ',
 '           ',
 '            ',
 '             ',
 '              ',
 '               ',
 '                ',
 '                 ',
 '                  ',
 '                   ',
 '                    ',
 '                     ',
 '                      ',
 '                       ',
 '                        ',
 '                         ',
 '                          ',
 '                           ',
 '                            ',
 '                             ',
 '                              ',
 '                               ',
 '                                ',
 '                                 ',
 '                                  ',
 '                                   ',
 '                                    ',
 '                                     ',
 '                                      ',
 '                                       ',
 '                  

Full train and test version for all topics.

In [None]:
# train
for i in TrainX_by_topic:
    print('———————————————————— for topic '+i+': '+dic_topic[i]+' ————————————————————')
    history = models[i].fit(TrainX_by_topic[i], TrainY_by_topic[i], epochs=10, validation_split=0.2)

In [None]:
# predict
preds = {}
for i in TrainX_by_topic:
    print('———————————————————— for topic '+i+': '+dic_topic[i]+' ————————————————————')
    preds[i] = models[i].predict(TestX_by_topic[i])