In [104]:
from preprocess import *

In [105]:
data, labels = load_data('THUCNews-Title')

In [106]:
len(data),len(labels)

(836075, 836075)

In [107]:
wvm = create_vector(data, labels, 64)
pickle_dump('wvm', wvm)

In [108]:
train_data, test_data, train_labels, test_labels = split_train_test_data(data, labels, test_size=0.3)

In [109]:
spider_test_data, spider_test_labels = test_data, test_labels

In [110]:
labelset = list(set(train_labels))
idx2label = dict(enumerate(labelset))
label2idx = dict([(v, k) for k, v in enumerate(labelset)])

train_data_id = [text_to_id(text, wvm, 30) for text in train_data ]
train_labels_idx = np.array([label2idx[label] for label in train_labels])


test_data_id = [text_to_id(text, wvm, 30) for text in test_data ]
test_labels_idx = np.array([label2idx[label] for label in test_labels])

In [111]:
embedding_matrix = np.zeros((len(wvm.wv.index_to_key), wvm.vector_size))
for i, word in enumerate(wvm.wv.index_to_key):
    embedding_matrix[i] = wvm.wv[word]

In [112]:
pickle_dump('idx2label', idx2label)
pickle_dump('label2idx', label2idx)
pickle_dump('wvm', wvm)
pickle_dump('embedding_matrix', embedding_matrix)

In [113]:
class BiLSTMClassifier(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, num_classes):
        super(BiLSTMClassifier, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=True)
        self.lstm = nn.LSTM(embedding_matrix.shape[1], hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        avg_pool = torch.mean(lstm_out, dim=1)
        output = self.fc(avg_pool)
        return output

In [114]:


hidden_dim = 64
num_classes = len(label2idx)
model = BiLSTMClassifier(embedding_matrix, hidden_dim, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [115]:
train_data_id_tensor = torch.LongTensor(train_data_id)
train_labels_idx_tensor = torch.LongTensor(train_labels_idx)

test_data_id_tensor = torch.LongTensor(test_data_id)
test_labels_idx_tensor = torch.LongTensor(test_labels_idx)

In [116]:
model = model.to('cuda')

train_data_id_tensor = train_data_id_tensor.to('cuda')
train_labels_idx_tensor = train_labels_idx_tensor.to('cuda')


test_data_id_tensor = test_data_id_tensor.to('cuda')
test_labels_idx_tensor = test_labels_idx_tensor.to('cuda')

In [117]:
batch_size = 128

train_dataset = torch.utils.data.TensorDataset(train_data_id_tensor, train_labels_idx_tensor)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

In [118]:
model.train()
num_epochs = 5
model.to('cuda')
epoch_loss = []
for epoch in range(num_epochs):
    total_loss = 0.0
    i = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        i+=1
        if(i % 1000 == 0):
            print('\r batch: {} loss: {}'.format(i, loss.item()))
    
    average_loss = total_loss / len(train_loader)
    fpath = os.path.join('pth',f'epoch_{epoch}.pth')
    torch.save(model,fpath)
    print(f'Epoch [{epoch + 1}/{num_epochs}] Loss: {average_loss:.4f}')
    epoch_loss.append(average_loss)

 batch: 1000 loss: 0.6418344378471375
 batch: 2000 loss: 0.4478532671928406
 batch: 3000 loss: 0.40352609753608704
 batch: 4000 loss: 0.2960279881954193
Epoch [1/5] Loss: 0.5205
 batch: 1000 loss: 0.4813195466995239
 batch: 2000 loss: 0.2731283903121948
 batch: 3000 loss: 0.27901607751846313
 batch: 4000 loss: 0.23022904992103577
Epoch [2/5] Loss: 0.3485
 batch: 1000 loss: 0.42411744594573975
 batch: 2000 loss: 0.2144313007593155
 batch: 3000 loss: 0.22340022027492523
 batch: 4000 loss: 0.2099044919013977
Epoch [3/5] Loss: 0.3118
 batch: 1000 loss: 0.40490540862083435
 batch: 2000 loss: 0.1889411211013794
 batch: 3000 loss: 0.20495672523975372
 batch: 4000 loss: 0.1910036951303482
Epoch [4/5] Loss: 0.2910
 batch: 1000 loss: 0.3726951479911804
 batch: 2000 loss: 0.16988888382911682
 batch: 3000 loss: 0.1881778985261917
 batch: 4000 loss: 0.18660196661949158
Epoch [5/5] Loss: 0.2762


In [125]:
model.eval()
a = 0
b = 200
with torch.no_grad():
    test_outputs = model(test_data_id_tensor[a:b])
    _, predicted = torch.max(test_outputs, 1)
    predicted = predicted.to('cpu')
    accuracy = accuracy_score(test_labels_idx_tensor.to('cpu')[a:b], predicted.numpy())
    print(f'Test Accuracy: {accuracy * 100:.2f}%')

Test Accuracy: 90.50%


In [126]:
print(idx2label)

{0: '社会', 1: '科技', 2: '娱乐', 3: '时政', 4: '房产', 5: '教育', 6: '时尚', 7: '体育', 8: '财经', 9: '彩票', 10: '星座', 11: '家居', 12: '游戏', 13: '股票'}


In [121]:
model.eval()
a = 0
b = 200
with torch.no_grad():
    test_outputs = model(test_data_id_tensor[a:b])
    _, predicted = torch.max(test_outputs, 1)
    predicted = predicted.to('cpu')
    accuracy = accuracy_score(test_labels_idx_tensor.to('cpu')[a:b], predicted.numpy())
    print(f'Test Accuracy: {accuracy * 100:.2f}%')

Test Accuracy: 90.50%


In [127]:
def predict(text):
    with torch.no_grad():
        test_outputs = model(torch.LongTensor([text_to_id(text, wvm, 30)]).to('cuda'))
        _, predicted = torch.max(test_outputs, 1)
        predicted = predicted.to('cpu').numpy()
        predicted = idx2label[predicted[0]]
    return predicted

In [128]:
spider_test_data[:100]

['沃尔沃第三季度盈利4.23亿美元 超预期',
 '大和维持中国石化H股强于大盘评级',
 '温家宝赴河南考察旱情强调今年兴修农村水利',
 '以色列将释放25名埃及囚犯换回1名以色列人',
 '两市周五放量普涨 形态上破',
 '恐慌气氛继续蔓延 考验2319只是时间问题',
 '直击-三点高空过度杨旭重炮得手 辽宁1-0领先杭州',
 '男篮大名单争夺进入乱世 生死时刻广东帮因何失宠',
 'PS3《无双大蛇Z》PC移植版将11月推出',
 '耗资千万 《明珠三国》特色大揭秘',
 '戴尔提升收购3PAR价格至每股24.30美元',
 '美股周一低收中国概念股多数下跌',
 '黄加李泡世界杯：网络原创成人礼',
 '揭秘首发阵容缘何大变 国足开创主力从未合练先河',
 '各国第一夫人时尚穿衣经 谁是你榜样(组图)',
 '全国第一家非公企业党建展览馆',
 '爱情测试：爱情占你生命中的比重(图)',
 '微软幻灯片泄露天机 Windows 8或2012年上市',
 '汇丰晋信大盘股票基金即将发行',
 '梅婷为灾区学校捐建“心联小屋” 用于心理援助',
 '神奇教练战术却腐朽 穆里尼奥的足球属于20年前？',
 '全球142座摩天楼疑受金融危机影响停建',
 '奢华装饰法 打造一个高品质的家(组图)',
 '和黄获李李嘉诚增持29万股 每股70.093港元',
 '王菲杭州灵隐寺再求子(图)',
 '双头套机不足4K 宾得k-x超低价3920元',
 '天下天天谈：白宫啤酒峰会系奥巴马政治秀',
 '快讯：股指午后强势上扬',
 '《变形金刚塞伯坦之战》宣传视频欣赏',
 '趋势科技第三季净利5500万美元',
 '蓝筹井喷推大盘大涨 震荡或将加剧',
 '房地产业：成交量回升或将导致房价过快上涨',
 '研究发现硫化氢可有效治疗阳痿 有望开发新药',
 '分析师：火灾不会影响Mariner能源生产',
 '西班牙国王授予博斯克侯爵头衔 儿子将来可继承爵位',
 '男子养儿17年方知并非自己亲生 起诉前妻索赔',
 '美银美林指361度销售增长或超预期',
 '国内网游2011年渐趋回暖 业界大佬变高调',
 '国米6场4球王牌联赛0出场！ 标王天才竟已被加帅用废',
 '1亿欧元将成卡卡人生拐点？ 球场外也需要有大智慧',
 'CIT集团破产或使高盛获益10

In [132]:
test_data = ['民营企业进出口规模不断壮大 外贸活力充分展现',
'今年前8个月我国外贸进出口总值27.08万亿元',
'开通倒计时！复兴号智能动车组正式在福厦高铁试跑',
'外交部：李强总理在雅加达应约与岸田文雄简短交谈',
'财政部副部长王东伟：目前年收入10万元以下个人基本不缴个税',
'我国成功发射遥感三十三号03星',
'世界杯半决赛：美国VS德国 塞尔维亚VS加拿大',
'23年金球奖30人候选名单公布 有梅西没C罗内马尔',
'新手4元擒体彩1864万不急兑',
'C罗：和梅西的竞争早就过去了 球迷喜欢这个故事',
'10记三分，巨星对决！东契奇大爆发，斯洛文尼亚埋下隐患',
'郑思维被竖大拇哥！冯彦哲受伤，凤凰组合被淘汰，女单2人出局',
'世界杯四强诞生！FIBA第一人出局！美国队迎挑战，最强黑马崛起',
]

In [133]:
predicted = [predict(d) for d in test_data[:100]]
for t, p in zip(test_data, predicted):
    print(p,'\t' ,t)

科技 	 民营企业进出口规模不断壮大 外贸活力充分展现
时政 	 今年前8个月我国外贸进出口总值27.08万亿元
体育 	 开通倒计时！复兴号智能动车组正式在福厦高铁试跑
时政 	 外交部：李强总理在雅加达应约与岸田文雄简短交谈
体育 	 财政部副部长王东伟：目前年收入10万元以下个人基本不缴个税
时政 	 我国成功发射遥感三十三号03星
彩票 	 世界杯半决赛：美国VS德国 塞尔维亚VS加拿大
体育 	 23年金球奖30人候选名单公布 有梅西没C罗内马尔
科技 	 新手4元擒体彩1864万不急兑
体育 	 C罗：和梅西的竞争早就过去了 球迷喜欢这个故事
