In [195]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
import gensim
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score

In [251]:

def load_data(data_dir):
    data = []
    labels = []

    # 遍历data目录下的所有txt文件
    for category in os.listdir(data_dir):
        category_path = os.path.join(data_dir, category)
        print(category_path)
        if category_path.endswith(".txt"):
            with open(category_path, "r", encoding="utf-8") as file:
                lines = file.readlines()
                lines = [line.split('\t')[1].strip() for line in lines]
                data += lines
                labels += [category[:-4]] * len(lines)

    return data, labels

def split_train_test_data(data, labels, test_size=0.2, random_state=42):
    # 划分训练集和测试集
    train_data, test_data, train_labels, test_labels = train_test_split(
        data, labels, test_size=test_size, random_state=random_state
    )

    return train_data, test_data, train_labels, test_labels

# 使用示例
data_dir = "THU"  # 数据目录路径
data, labels = load_data(data_dir)
print(len(data), len(labels))
train_data, test_data, train_labels, test_labels = split_train_test_data(data, labels)


THU\体育.txt
THU\娱乐.txt
THU\家居.txt
THU\彩票.txt
THU\房产.txt
THU\教育.txt
THU\时尚.txt
THU\时政.txt
THU\星座.txt
THU\游戏.txt
THU\社会.txt
THU\科技.txt
THU\股票.txt
THU\财经.txt
836075 836075


In [252]:
train_data[1:10]

['女子3年借给老赖150万 讨债无果欲跳楼\n',
 '台阳改建成的小舞台(图)\n',
 '保值相机马上停产 理光R10低价仅售1850\n',
 '《掌上明珠》累坏陶大宇 暴瘦10斤\n',
 '快讯：动视暴雪第一季度净利润5美元同比增32%\n',
 '带笔记本开越野车 龙门寺住持读EMBA(组图)\n',
 '儿童血铅从何来 小心幼儿园室内环境污染\n',
 '王全安：我跟汪小菲是各取所爱皆大欢喜\n',
 '未雨绸缪：高中生留学资金和语言是重点\n']

In [253]:
def pickle_dump(path, obj):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def pickle_load(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

In [198]:
pickle_dump('trd', train_data)
pickle_dump('trl', train_labels)
pickle_dump('ted', test_data)
pickle_dump('tel', test_labels)


In [210]:
sentences = [list(line) for line in train_data]


In [211]:
len(sentences)

668860

In [212]:
word2vec_model = gensim.models.Word2Vec(sentences, vector_size=64, window=5, min_count=1, sg=0)
word2vec_model.wv['<UNK>'] = np.zeros((64))

In [213]:
embedding_matrix = np.zeros((len(word2vec_model.wv.index_to_key), word2vec_model.vector_size))
for i, word in enumerate(word2vec_model.wv.index_to_key):
    embedding_matrix[i] = word2vec_model.wv[word]

In [214]:
embedding_matrix.shape

(5128, 64)

In [215]:
def text_to_vector(text, word2vec_model, max_length):
    words = list(text)  # 将文本拆分为单词
    vector = []
    for word in words:
        if word in word2vec_model.wv:
            vector.append(word2vec_model.wv[word])  # 如果单词在Word2Vec模型中存在，将其词向量添加到序列中
        else:
            vector.append(word2vec_model.wv['<UNK>'])  # 否则，使用<UNK>标记的向量代替未登录词
    # 填充或截断向量以适应固定长度
    if len(vector) < max_length:
        vector.extend([word2vec_model.wv['<UNK>']] * (max_length - len(vector)))  # 填充向量
    else:
        vector = vector[:max_length]  # 截断向量以达到最大长度
    return vector


In [216]:
def text_to_id(text, word2vec_model, max_length):
    words = list(text)  # 将文本拆分为单词
    vector = []
    for word in words:
        if word in word2vec_model.wv:
            vector.append(word2vec_model.wv.key_to_index[word])  # 如果单词在Word2Vec模型中存在，将其词向量添加到序列中
        else:
            vector.append(word2vec_model.wv.key_to_index['<UNK>'])  # 否则，使用<UNK>标记的向量代替未登录词
    # 填充或截断向量以适应固定长度
    if len(vector) < max_length:
        vector.extend([word2vec_model.wv.key_to_index['<UNK>']] * (max_length - len(vector)))  # 填充向量
    else:
        vector = vector[:max_length]  # 截断向量以达到最大长度
    return vector

In [217]:
labelset = list(set(train_labels))

In [218]:
idx2label = dict(enumerate(labelset))
label2idx = dict([(v, k) for k, v in enumerate(labelset)])

In [219]:
X_train = np.array([text_to_id(text, word2vec_model, 30) for text in train_data])
y_train = np.array([label2idx[label] for label in train_labels])

X_test = np.array([text_to_id(text, word2vec_model, 30) for text in test_data])
y_test = np.array([label2idx[label] for label in test_labels])

In [220]:
pickle_dump('wv', word2vec_model)

In [221]:
class BiLSTMClassifier(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, num_classes):
        super(BiLSTMClassifier, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=True)
        self.lstm = nn.LSTM(embedding_matrix.shape[1], hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        avg_pool = torch.mean(lstm_out, dim=1)
        output = self.fc(avg_pool)
        return output

In [222]:
hidden_dim = 64
num_classes = len(label2idx)
model = BiLSTMClassifier(embedding_matrix, hidden_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [223]:
X_train_tensor = torch.LongTensor(X_train)
y_train_tensor = torch.LongTensor(y_train)
X_test_tensor = torch.LongTensor(X_test)
y_test_tensor = torch.LongTensor(y_test)

In [224]:
model.embedding

Embedding(5128, 64)

In [225]:
y_train_tensor.device

device(type='cpu')

In [226]:
model = model.to('cuda')
X_train_tensor = X_train_tensor.to('cuda')
y_train_tensor = y_train_tensor.to('cuda')
X_test_tensor = X_test_tensor.to('cuda')
y_test_tensor = y_test_tensor.to('cuda')

In [230]:
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
batch_size = 128
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [231]:
model.train()
num_epochs = 5
model.to('cuda')
for epoch in range(num_epochs):
    total_loss = 0.0
    i = 0
    for inputs, labels in train_loader:
        i+=1
        if(i % 100 == 0):
            print('\r batch {}'.format(i))
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    average_loss = total_loss / len(train_loader)
    print(f'Epoch [{epoch + 1}/{num_epochs}] Loss: {average_loss:.4f}')

 batch 100
 batch 200
 batch 300
 batch 400
 batch 500
 batch 600
 batch 700
 batch 800
 batch 900
 batch 1000
 batch 1100
 batch 1200
 batch 1300
 batch 1400
 batch 1500
 batch 1600
 batch 1700
 batch 1800
 batch 1900
 batch 2000
 batch 2100
 batch 2200
 batch 2300
 batch 2400
 batch 2500
 batch 2600
 batch 2700
 batch 2800
 batch 2900
 batch 3000
 batch 3100
 batch 3200
 batch 3300
 batch 3400
 batch 3500
 batch 3600
 batch 3700
 batch 3800
 batch 3900
 batch 4000
 batch 4100
 batch 4200
 batch 4300
 batch 4400
 batch 4500
 batch 4600
 batch 4700
 batch 4800
 batch 4900
 batch 5000
 batch 5100
 batch 5200
Epoch [1/5] Loss: 0.5673
 batch 100
 batch 200
 batch 300
 batch 400
 batch 500
 batch 600
 batch 700
 batch 800
 batch 900
 batch 1000
 batch 1100
 batch 1200
 batch 1300
 batch 1400
 batch 1500
 batch 1600
 batch 1700
 batch 1800
 batch 1900
 batch 2000
 batch 2100
 batch 2200
 batch 2300
 batch 2400
 batch 2500
 batch 2600
 batch 2700
 batch 2800
 batch 2900
 batch 3000
 batch 31

In [112]:
num_epochs = 100
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    print(f'Epoch [{epoch + 1}/{num_epochs}] Loss: {loss.item()}')
    torch.save(model, os.path.join(prefix, f'{epoch}_.pth'))

RuntimeError: CUDA out of memory. Tried to allocate 2.00 GiB (GPU 0; 2.00 GiB total capacity; 393.71 MiB already allocated; 0 bytes free; 1.08 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [248]:
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor[:1000])
    _, predicted = torch.max(test_outputs, 1)
    predicted = predicted.to('cpu')
    accuracy = accuracy_score(y_test[:1000], predicted.numpy()[:1000])
    print(f'Test Accuracy: {accuracy * 100:.2f}%')

Test Accuracy: 80.40%


In [250]:
print(test_data[10:20])

['戴尔提升收购3PAR价格至每股24.30美元', '美股周一低收中国概念股多数下跌', '黄加李泡世界杯：网络原创成人礼', '国足开创主力从未合练先河', '谁是你榜样(组图)', '全国第一家非公企业党建展览馆', '爱情测试：爱情占你生命中的比重(图)', '8或2012年上市', '汇丰晋信大盘股票基金即将发行', '用于心理援助']


In [249]:
y_pred = predicted.numpy()
label_pred = [idx2label[idx] for idx in y_pred]
print(label_pred[10:20])
print(test_labels[10:20])

['股票', '股票', '科技', '体育', '家居', '时政', '星座', '科技', '财经', '股票']
['科技', '科技', '科技', '体育', '时尚', '家居', '星座', '科技', '财经', '娱乐']


In [236]:
X_test_tensor[1:2].shape
with torch.no_grad():
    test_outputs = model(X_test_tensor[1:2])
    _, predicted = torch.max(test_outputs, 1)
predicted = predicted.to('cpu')

In [245]:
t = text_to_id("讨债无果欲跳楼", word2vec_model, 50)
t = torch.LongTensor([t]).to('cuda')
with torch.no_grad():
    test_outputs = model(t)
    _, predicted = torch.max(test_outputs, 1)
predicted = predicted.to('cpu').numpy()
idx2label[predicted[0]]


'家居'