In [3]:
import jieba
from tqdm import tqdm
import torch


# Load the stop-word list
def get_stopwords(stop_file_name):
    """Read a UTF-8 stop-word file and return its lines as a list.

    Args:
        stop_file_name: path of the text file, one stop word per line.

    Returns:
        A list with one entry per line of the file, each stripped of leading
        and trailing whitespace (blank lines therefore become "").
    """
    with open(stop_file_name, "r", encoding="utf-8") as handle:
        # Iterating the handle yields the same lines readlines() would.
        return [entry.strip() for entry in handle]

# Character cleaning:
def text_cleaning(text):
    """Return `text` with every character outside U+4E00..U+9FA5 removed.

    Args:
        text: the string to filter.

    Returns:
        A string made of the characters of `text` that fall inside the range,
        in their original order.
    """
    def _in_range(ch):
        return '\u4e00' <= ch <= '\u9fa5'

    return ''.join(filter(_in_range, text))

# Data preprocessing
def co_data(dataset_path,stopwords):
    """Read the dataset file and return tokenised texts with their labels.

    Every non-blank line is split on "_!_": field 1 is parsed as the integer
    label id, field 2 is the label name and field 3 is the text. Each text is
    passed through text_cleaning, segmented with jieba (cut_all=False) and
    stripped of stop words.

    Args:
        dataset_path: path of the UTF-8 dataset file.
        stopwords: iterable of words to drop from the segmented text.

    Returns:
        (texts, labels, labels_idx): the token list of every sample, its label
        name and its integer label id, aligned by position.

    Raises:
        IndexError: if a non-blank line has fewer than four fields.
        ValueError: if field 1 of a line is not an integer.
    """
    # Membership is tested once per token; a set makes that O(1) instead of a
    # linear scan over the whole stop-word list.
    stopword_set = set(stopwords)

    labels=[]
    labels_idx=[]
    raw_texts=[]

    with open(dataset_path, "r", encoding="utf-8") as file:
        # Stream the file instead of materialising every line with readlines().
        for line in file:
            if not line.strip():
                # A blank line (e.g. a trailing newline at the end of the file)
                # carries no sample and would otherwise raise IndexError below.
                continue
            parts = line.split("_!_")
            labels_idx.append(int(parts[1]))
            labels.append(parts[2])
            raw_texts.append(parts[3])

    # Character cleaning, segmentation and stop-word removal
    texts=[]
    for raw_text in tqdm(raw_texts):
        cleaned=text_cleaning(raw_text)
        tokens=[token for token in jieba.cut(cleaned, cut_all=False) if token not in stopword_set]
        texts.append(tokens)

    return texts,labels,labels_idx

# Input locations: the dataset file and the stop-word list read by get_stopwords.
dataset_path='./dataset/dataset.txt'
stopwords=get_stopwords('./dataset/cn_stopwords.txt')
# Data preprocessing: returns the token lists, label names and label ids
texts,labels,labels_idx=co_data(dataset_path,stopwords)
# Print the first tokenised text as a quick sanity check.
print(texts[0])

# Build the lookup tables between label ids and label names
def co_labeldict(labels,labels_idx):
    """Pair up label names and label ids into two dictionaries.

    Args:
        labels: sequence of label names.
        labels_idx: sequence of label ids, aligned with `labels`.

    Returns:
        (id2labels, labels2id): a dict from id to name and a dict from name to
        id. When a key occurs more than once, the last pairing wins.
    """
    id2labels = dict(zip(labels_idx, labels))
    labels2id = dict(zip(labels, labels_idx))
    return id2labels,labels2id
# Build both lookup tables from the parsed labels and print the id -> name one.
id2labels,labels2id=co_labeldict(labels,labels_idx)
print(id2labels)

100%|██████████| 382688/382688 [01:25<00:00, 4471.62it/s]


['京城', '值得', '来场', '文化', '之旅', '博物馆']
{101: 'news_culture', 102: 'news_entertainment', 103: 'news_sports', 104: 'news_finance', 106: 'news_house', 107: 'news_car', 108: 'news_edu', 109: 'news_tech', 110: 'news_military', 112: 'news_travel', 113: 'news_world', 115: 'news_agriculture', 116: 'news_game', 114: 'stock', 100: 'news_story'}


## 加载库里的模型自己训练

In [2]:
# The block below is disabled by being wrapped in a bare string literal. When
# enabled it trains a Word2Vec model on `texts` (vector_size=100) and saves it
# as a gensim model file plus binary and text word2vec-format files.
'''
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import datapath
from gensim import utils

model = Word2Vec(texts, vector_size=100, window=5, min_count=1, workers=4)

model.save(r"model/word2vec.model")
model.wv.save_word2vec_format(r'dataset/word2vec.bin', binary=True)
model.wv.save_word2vec_format(r'dataset/word2vec.txt', binary=False)
'''
from gensim.models import Word2Vec

# Load the saved model
model = Word2Vec.load("model/word2vec.model")

In [4]:
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import numpy as np

# Turn every tokenised text into a fixed-length list of vocabulary indices.
text2indexs=[]
# Longest token count over all texts; every sequence is padded to this length.
max_length=max(len(text) for text in texts)
for text in tqdm(texts):
    # Words missing from the Word2Vec vocabulary are dropped.
    text2index = [model.wv.key_to_index[word] for word in text if word in model.wv.key_to_index]
    # -1 is the padding marker; it is not a vocabulary index.
    padded_text2index = text2index + [-1] * (max_length - len(text2index))
    text2indexs.append(padded_text2index)
print(text2indexs[0])
print(len(text2indexs[0]))

# 60 % train, then the remaining 40 % is halved into validation and test.
train_data, rest_data, train_labels, rest_labels = train_test_split(text2indexs, labels_idx, test_size=0.4, random_state=42)
val_data, test_data, val_labels, test_labels = train_test_split(rest_data, rest_labels, test_size=0.5, random_state=42)

# Convert indices to vectors; the padding index -1 becomes an all-zero vector
def text_to_vector(text, model, embedding_dim=100):
    """Look up the embedding of every index in `text`.

    Args:
        text: sequence of integer vocabulary indices, with -1 marking padding.
        model: object exposing the embedding matrix as `model.wv.vectors`.
        embedding_dim: width of the returned matrix.

    Returns:
        A float array of shape (len(text), embedding_dim). Row i holds
        `model.wv.vectors[text[i]]`, or zeros where `text[i]` is -1.
    """
    indices = np.asarray(text, dtype=np.intp)
    matrix = np.zeros((len(text), embedding_dim))
    known = indices != -1
    # Skip the assignment when every position is padding, so nothing is looked up.
    if known.any():
        matrix[known] = model.wv.vectors[indices[known]]
    return matrix

# Expand every padded index list of each split into its embedding matrix
# (one array of shape (sequence length, embedding_dim) per sample).
train_vec = [text_to_vector(text, model) for text in train_data]
val_vec = [text_to_vector(text, model) for text in val_data]
test_vec = [text_to_vector(text, model) for text in test_data]


# Custom dataset class
class TextDataset(Dataset):
    """In-memory dataset that pairs each sample's feature tensor with its label.

    Args:
        data: the features, either a sequence of equally-shaped per-sample
            arrays (or nested lists) or a single array whose first axis
            indexes the samples.
        labels: sequence of labels, one per sample.

    Attributes:
        data: tensor holding all samples; float64 input is stored as float32.
        labels: tensor holding all labels.
    """
    def __init__(self, data, labels):
        # Stack into a single ndarray first: torch.tensor() applied to a list of
        # ndarrays converts it element by element, which is very slow.
        stacked = np.asarray(data)
        # numpy creates float64 by default, while torch.nn layers hold float32
        # parameters and reject float64 input; other dtypes are kept as-is.
        target_dtype = torch.float32 if stacked.dtype == np.float64 else None
        self.data = torch.tensor(stacked, dtype=target_dtype)
        self.labels=torch.tensor(labels)

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the (features, label) pair stored at `index`."""
        text = self.data[index]
        label = self.labels[index]
        return text, label

# Wrap each split in a TextDataset.
train_dataset=TextDataset(train_vec,train_labels)
val_dataset=TextDataset(val_vec,val_labels)
test_dataset=TextDataset(test_vec,test_labels)

# Batches of 32; only the training loader shuffles. No loader is built for
# test_dataset in this cell.
train_dataloader=DataLoader(train_dataset,32,shuffle=True)
val_dataloader=DataLoader(val_dataset,32,shuffle=False)

100%|██████████| 382688/382688 [00:02<00:00, 168004.87it/s]

[12345, 80, 22741, 227, 1826, 1829, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
55





## 加载训练好的模型

In [5]:
# Load the word-embedding model
# NOTE: this rebinds `model`, which an earlier cell bound to a Word2Vec object;
# from here on it is a KeyedVectors instance read from a text-format file.
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format('./sogou/sgns.sogou.word', binary=False)

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Turn every tokenised text into a fixed-length list of vocabulary indices.
text2indexs=[]
# Longest token count over all texts; every sequence is padded to this length.
max_length=max(len(text) for text in texts)
for text in tqdm(texts):
    # Words missing from the embedding vocabulary are dropped.
    text2index = [model.key_to_index[word] for word in text if word in model.key_to_index]
    # -1 is the padding marker; it is not a vocabulary index.
    padded_text2index = text2index + [-1] * (max_length - len(text2index))
    text2indexs.append(padded_text2index)
print(text2indexs[0])
print(len(text2indexs[0]))

# 60 % train, then the remaining 40 % is halved into validation and test.
train_data, rest_data, train_labels, rest_labels = train_test_split(text2indexs, labels_idx, test_size=0.4, random_state=42)
val_data, test_data, val_labels, test_labels = train_test_split(rest_data, rest_labels, test_size=0.5, random_state=42)

# Convert indices to vectors; the padding index -1 becomes an all-zero vector
def text_to_vector(text, model, embedding_dim=300):
    """Look up the embedding of every index in `text`.

    Args:
        text: sequence of integer vocabulary indices, with -1 marking padding.
        model: object that returns an embedding vector for `model[index]`.
        embedding_dim: width of the returned matrix.

    Returns:
        A float array of shape (len(text), embedding_dim). Row i holds
        `model[text[i]]`, or zeros where `text[i]` is -1.
    """
    matrix = np.zeros((len(text), embedding_dim))
    # Iterating a 2-D array yields writable views of its rows.
    for row, token_index in zip(matrix, text):
        if token_index == -1:
            continue
        row[:] = model[token_index]
    return matrix

# Expand every padded index list of each split into its embedding matrix
# (one array of shape (sequence length, embedding_dim) per sample).
train_vec = [text_to_vector(text, model) for text in train_data]
val_vec = [text_to_vector(text, model) for text in val_data]
test_vec = [text_to_vector(text, model) for text in test_data]


# Custom dataset class
class TextDataset(Dataset):
    """In-memory dataset that pairs each sample's feature tensor with its label.

    Args:
        data: the features, either a sequence of equally-shaped per-sample
            arrays (or nested lists) or a single array whose first axis
            indexes the samples.
        labels: sequence of labels, one per sample.

    Attributes:
        data: tensor holding all samples; float64 input is stored as float32.
        labels: tensor holding all labels.
    """
    def __init__(self, data, labels):
        # Stack into a single ndarray first: torch.tensor() applied to a list of
        # ndarrays converts it element by element, which is very slow.
        stacked = np.asarray(data)
        # numpy creates float64 by default, while torch.nn layers hold float32
        # parameters and reject float64 input; other dtypes are kept as-is.
        target_dtype = torch.float32 if stacked.dtype == np.float64 else None
        self.data = torch.tensor(stacked, dtype=target_dtype)
        self.labels=torch.tensor(labels)

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the (features, label) pair stored at `index`."""
        text = self.data[index]
        label = self.labels[index]
        return text, label

# Wrap each split in a TextDataset.
train_dataset=TextDataset(train_vec,train_labels)
val_dataset=TextDataset(val_vec,val_labels)
test_dataset=TextDataset(test_vec,test_labels)

# Batches of 32; only the training loader shuffles. No loader is built for
# test_dataset in this cell.
# NOTE: DataLoader is not imported in this cell; it comes from the import in
# an earlier cell, which must have been run first.
train_dataloader=DataLoader(train_dataset,32,shuffle=True)
val_dataloader=DataLoader(val_dataset,32,shuffle=False)

In [None]:
# Inspect the shape and type of the first training batch only.
# NOTE(review): the loop variable `labels` rebinds the notebook-level `labels`
# list returned by co_data to the batch's label tensor.
for inputs, labels in train_dataloader:
    
    print(inputs.shape)
    print(type(inputs))
    break

## 问题：转换后的 inputs 已经是浮点型词向量，而嵌入层（nn.Embedding）只接受整数索引，所以 CNN 中无法直接使用嵌入层。之所以没有把 inputs 转换为 int 型，是担心取整后一些原本不同的值会变得相同，从而降低数据的有效性。能否把嵌入层改为全连接层或其他层，或者直接去掉？


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class CNN_LSTM(nn.Module):
    """Text classifier: two Conv1d + ReLU + MaxPool1d stages, an LSTM and a linear head.

    forward() accepts either integer token indices of shape (batch, seq_len),
    which are looked up in the module's own embedding table, or floating-point
    token vectors of shape (batch, seq_len, embedding_dim), which bypass the
    table because nn.Embedding only accepts integer indices.

    Args:
        num_classes: number of output logits.
        embedding_dim: size of each token vector (input channels of conv1).
        vocab_size: number of rows of the embedding table.
    """
    def __init__(self,num_classes,embedding_dim=100,vocab_size=7000+2) :
        super(CNN_LSTM,self).__init__()
        self.embedding_dim = embedding_dim
        # Embedding table; only used when forward() receives integer indices
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv1 = nn.Conv1d(embedding_dim, 64, kernel_size=3, padding=1)
        self.maxpool1 = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(64, 64, kernel_size=3, padding=1)
        self.maxpool2 = nn.MaxPool1d(kernel_size=2)
        # No `dropout` argument: nn.LSTM applies it only between stacked
        # layers, so on this single-layer LSTM it had no effect and only
        # triggered a warning.
        self.lstm = nn.LSTM(64, 100, batch_first=True)
        self.fc = nn.Linear(100, num_classes)


    def forward(self, inputs):
        """Compute class logits for a batch.

        Args:
            inputs: integer tensor (batch, seq_len) of token indices, or
                floating-point tensor (batch, seq_len, embedding_dim) of token
                vectors. seq_len must be at least 4 so that a time step is
                left after the two pooling stages.

        Returns:
            Tensor of shape (batch, num_classes).

        Raises:
            ValueError: if a floating-point input is not of shape
                (batch, seq_len, embedding_dim).
        """
        if torch.is_floating_point(inputs):
            if inputs.dim() != 3 or inputs.size(-1) != self.embedding_dim:
                raise ValueError(
                    "floating-point inputs must have shape "
                    f"(batch, seq_len, {self.embedding_dim}), got {tuple(inputs.shape)}"
                )
            # Match the parameter dtype (e.g. float64 numpy data -> float32).
            embedded = inputs.to(self.conv1.weight.dtype)
        else:
            embedded = self.embedding(inputs)
        # Conv1d expects (batch, channels, seq_len)
        embedded = embedded.permute(0, 2, 1)
        out = self.conv1(embedded)
        out = torch.relu(out)
        out = self.maxpool1(out)
        out = self.conv2(out)
        out = torch.relu(out)
        out = self.maxpool2(out)
        # Back to (batch, seq_len, channels) for the batch_first LSTM
        out, _ = self.lstm(out.transpose(1, 2))
        out = out[:, -1, :]  # keep the last time step
        output = self.fc(out)
        return output
    
# CrossEntropyLoss needs class indices in [0, num_classes), but the dataset
# labels are the raw category ids that key `id2labels`. Build a lookup tensor
# that maps each raw id to its rank among the sorted ids.
class_ids = sorted(id2labels)
num_classes = len(class_ids)
class_index = torch.full((max(class_ids) + 1,), -1, dtype=torch.long)
class_index[torch.tensor(class_ids)] = torch.arange(num_classes)

# Width of the token vectors actually stored in the dataset.
embedding_dim = train_dataset.data.shape[-1]

# Create the model instance.
# NOTE: this rebinds `model`, which earlier cells use for the word-embedding
# model; re-run the embedding cell before re-running the data preparation.
model = CNN_LSTM(num_classes, embedding_dim=embedding_dim)
loss_fn = nn.CrossEntropyLoss()  # loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)  # optimiser

# Step 3: train the model
max_epochs = 10

for epoch in range(max_epochs):
    model.train()  # training mode
    # Loop variables are named batch_* so that the notebook-level `labels`
    # list is not overwritten.
    for batch_inputs, batch_labels in train_dataloader:
        # Keep the (batch, seq_len, embedding_dim) layout: flattening would
        # merge the token axis that the convolutions and the LSTM run over.
        batch_inputs = batch_inputs.float()
        targets = class_index[batch_labels]

        optimizer.zero_grad()  # reset gradients

        # Forward pass
        outputs = model(batch_inputs)
        loss = loss_fn(outputs, targets)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

    # Step 4: evaluate on the validation split
    model.eval()  # evaluation mode

    with torch.no_grad():
        total_correct = 0
        total_samples = 0

        for batch_inputs, batch_labels in val_dataloader:
            batch_inputs = batch_inputs.float()
            targets = class_index[batch_labels]
            # Forward pass
            outputs = model(batch_inputs)
            predicted = outputs.argmax(dim=1)

            # Compare in class-index space; the raw category ids never equal
            # an argmax over num_classes logits.
            total_correct += (predicted == targets).sum().item()
            total_samples += targets.size(0)

        accuracy = total_correct / total_samples
        print(f"Epoch {epoch+1}/{max_epochs}, Validation Accuracy: {accuracy:.4f}")