README

本程序要求的包如下：
    ipykernel (为了可以在ipynb中运行)
    tqdm (为了可视化训练的进度)
    gensim (为了导入wiki_word2vec50.bin)
    torch (为了使用pytorch框架)
请确认可以import这四个库

如果只是要验收模型的实验效果，可以直接移步文件最后的三个模块：先运行“验收模型————即用版”，再依次运行标有“验收旧模型”的“模型选择”和“测试”两个模块
如果要调整参数，训练新模型，可以在调整模型参数后，运行有关的函数定义模块，再依次运行标有“训练新模型”的三个模块

In [947]:
# 模型参数

# Shared parameters
sentence_len = 50  # tokens per sentence after truncation / zero-padding
embedding_size = 50  # width of one word vector; presumably must match the loaded word2vec file
batch_size = 50  # batch size handed to the DataLoader in load_data
learning_rate = 0.001  # learning rate handed to the Adam optimizer
max_epoch = 10  # number of passes over the training loader
# Raw embedding matrix (the ``.vectors`` array of the loaded KeyedVectors);
# the models wrap it with torch.from_numpy / nn.Embedding.from_pretrained.
word2vec = gensim.models.KeyedVectors.load_word2vec_format('Dataset/wiki_word2vec_50.bin', binary=True).vectors
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# TextCNN parameters
kernel_sizes = [3, 5, 7, 9]  # kernel heights; one Conv2d layer is built per entry
kernel_nums = 20  # out_channels of every Conv2d layer

# TextRNN parameters
hidden_size = 100  # hidden state size of the LSTM / GRU
hidden_nums = 2  # num_layers of the LSTM / GRU

# TextMLP parameters
fc_size = 100  # out_features of the first fully connected layer


In [45]:
# 数据预处理
# 生成6个json文件，如果修改了sentence_len参数，需要重新运行本段代码，否则直接使用已生成的6个json文件即可

import gensim
import numpy as np
import json

# Load the pretrained word vectors from the binary word2vec file.
gensim_model = gensim.models.KeyedVectors.load_word2vec_format('Dataset/wiki_word2vec_50.bin', binary=True)
# The same vectors as a float tensor.
# NOTE(review): this line uses ``torch``, which this cell does not import
# itself, and the name is not referenced elsewhere in the visible source —
# confirm whether it is still needed.
pretrained_word2vec = torch.FloatTensor(gensim_model.vectors)
# Map every vocabulary word to its position in ``index_to_key``.
word2index = {word: i for i, word in enumerate(gensim_model.index_to_key)}

def get_sentences_and_labels(name):
    """Convert ``./Dataset/<name>.txt`` into index sequences and labels.

    Every non-blank line is split on whitespace: the first token is parsed
    as the integer label, the remaining tokens are the words of the
    sentence. Words are mapped through the module-level ``word2index``
    (words missing from it become 0), and each sentence is truncated to
    ``sentence_len`` words or right-padded with 0 up to that length.

    Note that index 0 therefore stands for padding, for unknown words and
    for whichever vocabulary word happens to have index 0.

    Args:
        name: Dataset split name, i.e. the file stem under ``./Dataset/``.

    Returns:
        A ``(sentences, labels)`` tuple: a list of index lists, each of
        length ``sentence_len``, and the parallel list of int labels.
    """
    sentences = []
    labels = []
    # The word2vec vocabulary is decoded as UTF-8 by gensim, so read the
    # corpus the same way instead of relying on the platform default.
    with open("./Dataset/" + name + ".txt", encoding="utf-8") as txt_file:
        for txt_line in txt_file:
            tokens = txt_line.split()
            if not tokens:
                # A blank line carries neither a label nor words.
                continue
            words = tokens[1:sentence_len + 1]
            # Dict lookup instead of scanning the vocabulary list per word.
            sentence = [word2index.get(word, 0) for word in words]
            sentence += [0] * (sentence_len - len(sentence))
            sentences.append(sentence)
            labels.append(int(tokens[0]))
    return sentences, labels

def gen_and_save_sentences_and_labels(name):
    """Preprocess one dataset split and store the result as two JSON files.

    Writes ``<name>_sentences.json`` and ``<name>_label.json`` into the
    current working directory.

    Args:
        name: Dataset split name, forwarded to ``get_sentences_and_labels``.
    """
    sentences, labels = get_sentences_and_labels(name)
    # ``with`` closes each file even if serialisation fails part-way.
    with open(name + "_sentences.json", "w") as sentence_file:
        json.dump(sentences, sentence_file)
    with open(name + "_label.json", "w") as label_file:
        json.dump(labels, label_file)

# Preprocess the three dataset splits in the same order as before.
for _split in ("train", "validation", "test"):
    gen_and_save_sentences_and_labels(_split)

In [878]:
# TextCNN模型定义

import torch
import torch.nn as nn
import torch.nn.functional as F

class TextCNN(nn.Module):
    """Convolutional text classifier over pretrained word embeddings.

    One Conv2d layer is built per entry of ``kernel_sizes``; each kernel
    spans the full embedding width. The max-pooled features of all layers
    are concatenated and fed to a 2-way linear layer followed by softmax.
    """

    def __init__(self):
        super().__init__()
        pretrained = torch.from_numpy(word2vec)
        self.embedding = nn.Embedding.from_pretrained(pretrained)
        conv_layers = []
        for height in kernel_sizes:
            conv_layers.append(
                nn.Conv2d(
                    in_channels=1,
                    out_channels=kernel_nums,
                    kernel_size=(height, embedding_size),
                )
            )
        self.convs = nn.ModuleList(conv_layers)
        self.full_connect = nn.Linear(
            in_features=len(kernel_sizes) * kernel_nums, out_features=2
        )
        self.softmax = nn.Softmax(dim=1)

    def forward(self, text):
        """Return class probabilities for a batch of index sequences.

        Args:
            text: Integer tensor of word indices, expected to be
                (batch, sentence length) as produced by the data loader.

        Returns:
            Tensor with one row of two softmax probabilities per sample.
        """
        # Add the single input channel that Conv2d expects.
        images = self.embedding(text).unsqueeze(1)
        features = []
        for conv in self.convs:
            activated = F.relu(conv(images).squeeze(3))
            # Max over all positions of the sequence.
            features.append(F.max_pool1d(activated, activated.size(2)).squeeze(2))
        scores = self.full_connect(torch.cat(features, dim=1))
        return self.softmax(scores)

In [950]:
# TextRNN_LSTM模型定义

import torch
import torch.nn as nn

class TextRNN_LSTM(nn.Module):
    """Bidirectional multi-layer LSTM classifier over pretrained embeddings.

    The final hidden states of the last layer (both directions) are
    concatenated and fed to a 2-way linear layer followed by softmax.
    """

    def __init__(self):
        super().__init__()
        pretrained = torch.from_numpy(word2vec)
        self.embedding = nn.Embedding.from_pretrained(pretrained)
        self.rnn = nn.LSTM(
            embedding_size,
            hidden_size,
            num_layers=hidden_nums,
            bidirectional=True,
        )
        self.full_connect = nn.Linear(in_features=2 * hidden_size, out_features=2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, text):
        """Return class probabilities for a batch of index sequences.

        Args:
            text: Integer tensor of word indices, expected to be
                (batch, sentence length) as produced by the data loader.

        Returns:
            Tensor with one row of two softmax probabilities per sample.
        """
        # The LSTM is not batch_first, so swap the first two axes.
        time_major = self.embedding(text).permute(1, 0, 2)
        _, (hidden, _) = self.rnn(time_major)
        # Separate the layer axis from the direction axis.
        per_layer = hidden.view(hidden_nums, 2, -1, hidden_size)
        last_layer = (per_layer[-1, 0], per_layer[-1, 1])
        scores = self.full_connect(torch.cat(last_layer, dim=1))
        return self.softmax(scores)

In [953]:
# TextRNN_GRU 模型定义

import torch
import torch.nn as nn

class TextRNN_GRU(nn.Module):
    """Bidirectional multi-layer GRU classifier over pretrained embeddings.

    The final hidden states of the last layer (both directions) are
    concatenated and fed to a 2-way linear layer followed by softmax.
    """

    def __init__(self):
        super().__init__()
        pretrained = torch.from_numpy(word2vec)
        self.embedding = nn.Embedding.from_pretrained(pretrained)
        self.rnn = nn.GRU(
            embedding_size,
            hidden_size,
            num_layers=hidden_nums,
            bidirectional=True,
        )
        self.full_connect = nn.Linear(in_features=2 * hidden_size, out_features=2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, text):
        """Return class probabilities for a batch of index sequences.

        Args:
            text: Integer tensor of word indices, expected to be
                (batch, sentence length) as produced by the data loader.

        Returns:
            Tensor with one row of two softmax probabilities per sample.
        """
        # The GRU is not batch_first, so swap the first two axes.
        time_major = self.embedding(text).permute(1, 0, 2)
        _, hidden = self.rnn(time_major)
        # Separate the layer axis from the direction axis.
        per_layer = hidden.view(hidden_nums, 2, -1, hidden_size)
        last_layer = (per_layer[-1, 0], per_layer[-1, 1])
        scores = self.full_connect(torch.cat(last_layer, dim=1))
        return self.softmax(scores)

In [948]:
# TextMLP模型定义

import torch
import torch.nn as nn

class TextMLP(nn.Module):
    """Two-layer perceptron over the flattened sentence embedding.

    The embedded sentence is flattened to ``embedding_size * sentence_len``
    values, passed through two linear layers (each followed by ReLU) and
    finally through softmax.
    """

    def __init__(self):
        super().__init__()
        pretrained = torch.from_numpy(word2vec)
        self.embedding = nn.Embedding.from_pretrained(pretrained)
        self.full_connect_1 = nn.Linear(
            in_features=embedding_size * sentence_len, out_features=fc_size
        )
        self.full_connect_2 = nn.Linear(in_features=fc_size, out_features=2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, text):
        """Return class probabilities for a batch of index sequences.

        Args:
            text: Integer tensor of word indices; every sample must hold
                ``sentence_len`` indices so the flattening below lines up.

        Returns:
            Tensor with one row of two softmax probabilities per sample.
        """
        flat = self.embedding(text).view(-1, embedding_size * sentence_len)
        hidden = F.relu(self.full_connect_1(flat))
        # ReLU is also applied to the 2-unit output before the softmax.
        scores = F.relu(self.full_connect_2(hidden))
        return self.softmax(scores)

In [327]:
# 生成DataLoader的函数定义

import json
import torch
from torch.utils.data import Dataset, DataLoader

class EmoDataset(Dataset):
    """Dataset pairing each index sequence with its label."""

    def __init__(self, sentences, labels):
        # Parallel sequences: sentences[i] belongs to labels[i].
        self.sentences, self.labels = sentences, labels

    def __len__(self):
        # The label list defines the number of samples.
        return len(self.labels)

    def __getitem__(self, index):
        """Return the ``(sentence, label)`` pair at ``index`` as tensors."""
        sentence = torch.tensor(self.sentences[index])
        label = torch.tensor(self.labels[index])
        return sentence, label

def load_data(name):
    """Build a shuffling DataLoader from the preprocessed JSON files.

    Reads ``<name>_sentences.json`` and ``<name>_label.json`` from the
    current working directory and wraps them in an ``EmoDataset``.

    Args:
        name: Dataset split name used as the file-name prefix.

    Returns:
        A ``DataLoader`` over the split with batch size ``batch_size`` and
        ``shuffle=True``.
    """
    # ``with`` closes each file even if JSON parsing fails.
    with open(name + "_sentences.json") as sentence_file:
        sentences = json.load(sentence_file)
    with open(name + "_label.json") as label_file:
        labels = json.load(label_file)

    dataset = EmoDataset(sentences, labels)
    return DataLoader(dataset, batch_size, shuffle=True)

In [808]:
# 测试与检验的函数定义

import torch

def evaluate(loader):
    """Score the module-level ``model`` on ``loader`` and print the metrics.

    Label 0 is the class for which precision, recall and F1 are computed.
    Accuracy is printed as a percentage; the other metrics as fractions.

    Args:
        loader: Iterable of ``(sentences, labels)`` batches, such as the
            DataLoader returned by ``load_data``. ``labels`` is expected to
            hold one integer per sample.

    Returns:
        The F1 score as a float. A metric whose denominator is zero (for
        example when nothing was predicted as class 0) is reported as 0.0
        instead of raising ZeroDivisionError.
    """
    total = 0
    correct = 0
    true_pos = 0
    false_pos = 0
    actual_pos = 0

    with torch.no_grad():
        for sentences, labels in loader:
            outputs = model(sentences.to(device))
            # Compare on the labels' device; counting with tensor ops avoids
            # one Python-level comparison per sample.
            predicts = torch.argmax(outputs, dim=1).to(labels.device)
            predicted_pos = predicts == 0
            labelled_pos = labels == 0
            total += len(predicts)
            correct += int((predicts == labels).sum())
            true_pos += int((predicted_pos & labelled_pos).sum())
            false_pos += int((predicted_pos & ~labelled_pos).sum())
            actual_pos += int(labelled_pos.sum())

    accuracy = float(correct) / total * 100 if total else 0.0
    predicted_total = true_pos + false_pos
    precision = float(true_pos) / predicted_total if predicted_total else 0.0
    recall = float(true_pos) / actual_pos if actual_pos else 0.0
    if precision and recall:
        f1_score = float(2) / (1 / precision + 1 / recall)
    else:
        # The harmonic mean is undefined when either term is zero.
        f1_score = 0.0
    print("accuracy = %f%%" % accuracy)
    print("precision = %f" % precision)
    print("recall = %f" % recall)
    print("f1-score = %f" % f1_score)
    return f1_score

In [769]:
# 初始化函数定义，一般使用pytorch默认的初始化方法即可，该模块仅在实验比较不同初始化方法的时候使用

import torch

def _init_weights(m, init_fn):
    """Apply ``init_fn`` in place to every weight tensor owned directly by ``m``.

    Parameters are selected by name prefix ``"weight"``. This covers the
    single ``weight`` of Linear/Conv layers as well as the per-layer
    ``weight_ih_l*`` / ``weight_hh_l*`` tensors of LSTM/GRU modules, which
    have no plain ``weight`` attribute. Biases are left untouched, and a
    module without weight parameters is a no-op.
    """
    for name, param in m.named_parameters(recurse=False):
        if name.startswith("weight"):
            init_fn(param.data)


def zero(m):
    """Fill the weights of module ``m`` with zeros."""
    _init_weights(m, torch.nn.init.zeros_)


def normal(m):
    """Fill the weights of module ``m`` with standard-normal samples."""
    _init_weights(m, torch.nn.init.normal_)


def orthogonal(m):
    """Fill the weights of module ``m`` with (semi-)orthogonal matrices."""
    _init_weights(m, torch.nn.init.orthogonal_)


def init(init_type):
    """Re-initialise the layers of the module-level ``model``.

    Args:
        init_type: 1 for zeros, 2 for normal, 3 for orthogonal. Any other
            value leaves the model untouched.
    """
    initializers = {1: zero, 2: normal, 3: orthogonal}
    try:
        f = initializers[init_type]
    except (KeyError, TypeError):
        # Unknown (or unhashable) selector: keep PyTorch's default init.
        return
    target_layers = (torch.nn.Conv2d, torch.nn.LSTM, torch.nn.GRU, torch.nn.Linear)
    # modules() walks the whole tree, so layers nested in containers such as
    # the ModuleList of convolutions are reached too; children() would stop
    # at the container itself.
    for m in model.modules():
        if isinstance(m, target_layers):
            f(m)

In [5]:
# 训练新模型————模型选择
# 如果要训练新的模型，在此处选择模型；如果只是要验收已训练的模型的结果，不需要执行该模块

path ="tmp.model"  # file the training cell saves the best model to

# Uncomment exactly one line to choose the architecture to train.
# model = TextCNN()
model = TextRNN_LSTM()
# model = TextRNN_GRU()
# model = TextMLP()

# Manual initialisation is normally unnecessary, i.e. the init call below
# can stay commented out.
# init(0)

In [None]:
# 训练新模型————训练

import tqdm
import torch.optim as optim

train_loader = load_data("train")
validation_loader = load_data("validation")
# Every model defined in this notebook ends its forward pass with nn.Softmax,
# so the outputs are already probabilities. nn.CrossEntropyLoss expects raw
# logits and applies log-softmax itself, which would normalise twice; NLLLoss
# over log-probabilities is the matching loss for probability outputs.
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), learning_rate)
model.to(device)

# Start below any attainable F1 so the first epoch is always checkpointed and
# the reload after the loop never picks up a stale file from an earlier run.
f1_max = -1.0
for epoch in range(max_epoch):
    for sentences, labels in tqdm.tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(sentences.to(device))
        # Clamp before the log so a probability that underflowed to zero
        # cannot produce an infinite loss.
        log_probs = torch.log(outputs.clamp_min(1e-12))
        loss = criterion(log_probs, labels.to(device))
        loss.backward()
        optimizer.step()
    # Keep the checkpoint with the best validation F1 seen so far.
    f1 = evaluate(validation_loader)
    if f1 > f1_max:
        f1_max = f1
        torch.save(model, path)

# Restore the best checkpoint. The file holds a whole pickled model, so it
# must be loaded with weights_only=False on PyTorch versions whose default
# is True; releases that predate the argument reject it with TypeError.
# NOTE: unpickling executes code from the file — only load trusted files.
try:
    model = torch.load(path, map_location=device, weights_only=False)
except TypeError:
    model = torch.load(path, map_location=device)

In [None]:
# 训练新模型————测试

# Print the metrics of the current ``model`` on the test split; the returned
# F1 score is the value of the cell.
evaluate(load_data("test"))

In [1]:
# 验收模型————即用版
# 主要是对前面一些模型和函数定义的整合，如果直接验收的话，不需要管前面的模块，直接先运行本模块，再在下一模块中选择模型类型，在最后一个模块中进行测试即可

import json
import gensim
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# 模型参数

# Shared parameters
sentence_len = 50  # tokens per sentence after truncation / zero-padding
embedding_size = 50  # width of one word vector; presumably must match the loaded word2vec file
batch_size = 50  # batch size handed to the DataLoader in load_data
learning_rate = 0.001  # learning rate (not read inside this cell)
max_epoch = 10  # number of training epochs (not read inside this cell)
# Raw embedding matrix (the ``.vectors`` array of the loaded KeyedVectors);
# the models wrap it with torch.from_numpy / nn.Embedding.from_pretrained.
word2vec = gensim.models.KeyedVectors.load_word2vec_format('Dataset/wiki_word2vec_50.bin', binary=True).vectors
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# TextCNN parameters
kernel_sizes = [3, 5, 7, 9]  # kernel heights; one Conv2d layer is built per entry
kernel_nums = 20  # out_channels of every Conv2d layer

# TextRNN parameters
hidden_size = 100  # hidden state size of the LSTM / GRU
hidden_nums = 2  # num_layers of the LSTM / GRU

# TextMLP parameters
fc_size = 100  # out_features of the first fully connected layer

# TextCNN模型定义

class TextCNN(nn.Module):
    """Convolutional text classifier over pretrained word embeddings.

    One Conv2d layer is built per entry of ``kernel_sizes``; each kernel
    spans the full embedding width. The max-pooled features of all layers
    are concatenated and fed to a 2-way linear layer followed by softmax.
    """

    def __init__(self):
        super().__init__()
        pretrained = torch.from_numpy(word2vec)
        self.embedding = nn.Embedding.from_pretrained(pretrained)
        conv_layers = []
        for height in kernel_sizes:
            conv_layers.append(
                nn.Conv2d(
                    in_channels=1,
                    out_channels=kernel_nums,
                    kernel_size=(height, embedding_size),
                )
            )
        self.convs = nn.ModuleList(conv_layers)
        self.full_connect = nn.Linear(
            in_features=len(kernel_sizes) * kernel_nums, out_features=2
        )
        self.softmax = nn.Softmax(dim=1)

    def forward(self, text):
        """Return class probabilities for a batch of index sequences.

        Args:
            text: Integer tensor of word indices, expected to be
                (batch, sentence length) as produced by the data loader.

        Returns:
            Tensor with one row of two softmax probabilities per sample.
        """
        # Add the single input channel that Conv2d expects.
        images = self.embedding(text).unsqueeze(1)
        features = []
        for conv in self.convs:
            activated = F.relu(conv(images).squeeze(3))
            # Max over all positions of the sequence.
            features.append(F.max_pool1d(activated, activated.size(2)).squeeze(2))
        scores = self.full_connect(torch.cat(features, dim=1))
        return self.softmax(scores)

# TextRNN_LSTM模型定义

class TextRNN_LSTM(nn.Module):
    """Bidirectional multi-layer LSTM classifier over pretrained embeddings.

    The final hidden states of the last layer (both directions) are
    concatenated and fed to a 2-way linear layer followed by softmax.
    """

    def __init__(self):
        super().__init__()
        pretrained = torch.from_numpy(word2vec)
        self.embedding = nn.Embedding.from_pretrained(pretrained)
        self.rnn = nn.LSTM(
            embedding_size,
            hidden_size,
            num_layers=hidden_nums,
            bidirectional=True,
        )
        self.full_connect = nn.Linear(in_features=2 * hidden_size, out_features=2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, text):
        """Return class probabilities for a batch of index sequences.

        Args:
            text: Integer tensor of word indices, expected to be
                (batch, sentence length) as produced by the data loader.

        Returns:
            Tensor with one row of two softmax probabilities per sample.
        """
        # The LSTM is not batch_first, so swap the first two axes.
        time_major = self.embedding(text).permute(1, 0, 2)
        _, (hidden, _) = self.rnn(time_major)
        # Separate the layer axis from the direction axis.
        per_layer = hidden.view(hidden_nums, 2, -1, hidden_size)
        last_layer = (per_layer[-1, 0], per_layer[-1, 1])
        scores = self.full_connect(torch.cat(last_layer, dim=1))
        return self.softmax(scores)

# TextRNN_GRU 模型定义

class TextRNN_GRU(nn.Module):
    """Bidirectional multi-layer GRU classifier over pretrained embeddings.

    The final hidden states of the last layer (both directions) are
    concatenated and fed to a 2-way linear layer followed by softmax.
    """

    def __init__(self):
        super().__init__()
        pretrained = torch.from_numpy(word2vec)
        self.embedding = nn.Embedding.from_pretrained(pretrained)
        self.rnn = nn.GRU(
            embedding_size,
            hidden_size,
            num_layers=hidden_nums,
            bidirectional=True,
        )
        self.full_connect = nn.Linear(in_features=2 * hidden_size, out_features=2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, text):
        """Return class probabilities for a batch of index sequences.

        Args:
            text: Integer tensor of word indices, expected to be
                (batch, sentence length) as produced by the data loader.

        Returns:
            Tensor with one row of two softmax probabilities per sample.
        """
        # The GRU is not batch_first, so swap the first two axes.
        time_major = self.embedding(text).permute(1, 0, 2)
        _, hidden = self.rnn(time_major)
        # Separate the layer axis from the direction axis.
        per_layer = hidden.view(hidden_nums, 2, -1, hidden_size)
        last_layer = (per_layer[-1, 0], per_layer[-1, 1])
        scores = self.full_connect(torch.cat(last_layer, dim=1))
        return self.softmax(scores)

# TextMLP模型定义

class TextMLP(nn.Module):
    """Two-layer perceptron over the flattened sentence embedding.

    The embedded sentence is flattened to ``embedding_size * sentence_len``
    values, passed through two linear layers (each followed by ReLU) and
    finally through softmax.
    """

    def __init__(self):
        super().__init__()
        pretrained = torch.from_numpy(word2vec)
        self.embedding = nn.Embedding.from_pretrained(pretrained)
        self.full_connect_1 = nn.Linear(
            in_features=embedding_size * sentence_len, out_features=fc_size
        )
        self.full_connect_2 = nn.Linear(in_features=fc_size, out_features=2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, text):
        """Return class probabilities for a batch of index sequences.

        Args:
            text: Integer tensor of word indices; every sample must hold
                ``sentence_len`` indices so the flattening below lines up.

        Returns:
            Tensor with one row of two softmax probabilities per sample.
        """
        flat = self.embedding(text).view(-1, embedding_size * sentence_len)
        hidden = F.relu(self.full_connect_1(flat))
        # ReLU is also applied to the 2-unit output before the softmax.
        scores = F.relu(self.full_connect_2(hidden))
        return self.softmax(scores)

# 生成DataLoader的函数定义

class EmoDataset(Dataset):
    """Dataset pairing each index sequence with its label."""

    def __init__(self, sentences, labels):
        # Parallel sequences: sentences[i] belongs to labels[i].
        self.sentences, self.labels = sentences, labels

    def __len__(self):
        # The label list defines the number of samples.
        return len(self.labels)

    def __getitem__(self, index):
        """Return the ``(sentence, label)`` pair at ``index`` as tensors."""
        sentence = torch.tensor(self.sentences[index])
        label = torch.tensor(self.labels[index])
        return sentence, label

def load_data(name):
    """Build a shuffling DataLoader from the preprocessed JSON files.

    Reads ``<name>_sentences.json`` and ``<name>_label.json`` from the
    current working directory and wraps them in an ``EmoDataset``.

    Args:
        name: Dataset split name used as the file-name prefix.

    Returns:
        A ``DataLoader`` over the split with batch size ``batch_size`` and
        ``shuffle=True``.
    """
    # ``with`` closes each file even if JSON parsing fails.
    with open(name + "_sentences.json") as sentence_file:
        sentences = json.load(sentence_file)
    with open(name + "_label.json") as label_file:
        labels = json.load(label_file)

    dataset = EmoDataset(sentences, labels)
    return DataLoader(dataset, batch_size, shuffle=True)

# 测试与检验的函数定义

def evaluate(loader):
    """Score the module-level ``model`` on ``loader`` and print the metrics.

    Label 0 is the class for which precision, recall and F1 are computed.
    Accuracy is printed as a percentage; the other metrics as fractions.

    Args:
        loader: Iterable of ``(sentences, labels)`` batches, such as the
            DataLoader returned by ``load_data``. ``labels`` is expected to
            hold one integer per sample.

    Returns:
        The F1 score as a float. A metric whose denominator is zero (for
        example when nothing was predicted as class 0) is reported as 0.0
        instead of raising ZeroDivisionError.
    """
    total = 0
    correct = 0
    true_pos = 0
    false_pos = 0
    actual_pos = 0

    with torch.no_grad():
        for sentences, labels in loader:
            outputs = model(sentences.to(device))
            # Compare on the labels' device; counting with tensor ops avoids
            # one Python-level comparison per sample.
            predicts = torch.argmax(outputs, dim=1).to(labels.device)
            predicted_pos = predicts == 0
            labelled_pos = labels == 0
            total += len(predicts)
            correct += int((predicts == labels).sum())
            true_pos += int((predicted_pos & labelled_pos).sum())
            false_pos += int((predicted_pos & ~labelled_pos).sum())
            actual_pos += int(labelled_pos.sum())

    accuracy = float(correct) / total * 100 if total else 0.0
    predicted_total = true_pos + false_pos
    precision = float(true_pos) / predicted_total if predicted_total else 0.0
    recall = float(true_pos) / actual_pos if actual_pos else 0.0
    if precision and recall:
        f1_score = float(2) / (1 / precision + 1 / recall)
    else:
        # The harmonic mean is undefined when either term is zero.
        f1_score = 0.0
    print("accuracy = %f%%" % accuracy)
    print("precision = %f" % precision)
    print("recall = %f" % recall)
    print("f1-score = %f" % f1_score)
    return f1_score

In [4]:
# 验收旧模型————模型选择

# Uncomment exactly one line to choose which saved model file is loaded
# by the test cell.
# path = "./TextCNN.model"
# path = "./TextRNN_LSTM.model"
path = "./TextRNN_GRU.model"
# path = "./TextMLP.model"

In [5]:
# 验收旧模型————测试

import torch

# Load the whole pickled model onto ``device``: evaluate() sends its inputs
# there, so a model saved on another device would otherwise fail to load or
# to run. weights_only=False is required on PyTorch versions whose default
# is True; releases that predate the argument reject it with TypeError.
# NOTE: unpickling executes code from the file — only load trusted files.
try:
    model = torch.load(path, map_location=device, weights_only=False)
except TypeError:
    model = torch.load(path, map_location=device)
# Print the test-split metrics; the returned F1 score is the cell's value.
evaluate(load_data("test"))

accuracy = 82.655827%
precision = 0.804124
recall = 0.857143
f1-score = 0.829787


0.8297872340425531