In [1]:
import logging
import os
import sys
import pickle
import time

import pandas as pd
import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
from torch.autograd import Variable
from tqdm import tqdm


from sklearn.metrics import accuracy_score


# test = pd.read_csv(r"corpus\imdb\testData.tsv", header=0, delimiter="\t", quoting=3)


# ---- Hyper-parameters and run configuration ----
num_epochs = 100      # full passes over the training set
embed_size = 300      # embedding width; used as the Conv1d input-channel count
num_filter = 128      # number of convolution output channels
filter_size = 3       # convolution kernel width (in tokens)
bidirectional = True  # not referenced by the CNN code visible in this notebook
batch_size = 256
labels = 2            # number of output classes
lr = 0.02             # SGD learning rate
# Prefer the first GPU, but fall back to the CPU so the notebook still runs
# on machines without CUDA instead of failing when the model is moved.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
use_gpu = device.type == 'cuda'


# CNNs and transformers are actually fairly similar:
# a CNN can be seen as an attention-like mechanism over word vectors (fixed-length input).

class SentimentNet(nn.Module):
    """Text classifier: frozen pre-trained embeddings -> Conv1d -> ReLU -> global max-pool -> Linear.

    Args:
        embed_size: width of each embedding vector; used as the Conv1d input-channel count.
        num_filter: number of convolution output channels.
        filter_size: convolution kernel width. Padding is fixed at 1, so the
            sequence length is preserved only when ``filter_size == 3``.
        weight: pre-trained embedding matrix (the word-index -> vector lookup table).
        labels: number of output classes.
        use_gpu: stored on the instance; not read by the code in this class.
        **kwargs: forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, embed_size, num_filter, filter_size, weight, labels, use_gpu, **kwargs):
        super().__init__(**kwargs)

        self.use_gpu = use_gpu

        # Lookup table built from the pre-trained vectors; freeze=True keeps it
        # from being updated during training.
        self.embedding = nn.Embedding.from_pretrained(weight, freeze=True)

        # 1-D convolution sliding along the token axis.
        self.conv1d = nn.Conv1d(
            in_channels=embed_size,
            out_channels=num_filter,
            kernel_size=filter_size,
            padding=1,
        )
        self.activate = F.relu
        self.decoder = nn.Linear(in_features=num_filter, out_features=labels)

    def forward(self, inputs):
        """Return class scores of shape (batch, labels) for a (batch, seq_len) tensor of token indices."""
        embedded = self.embedding(inputs)
        # Conv1d wants channels before length: (batch, seq, embed) -> (batch, embed, seq).
        channels_first = embedded.transpose(1, 2)
        feature_maps = self.activate(self.conv1d(channels_first))

        # Global max over the whole sequence: one value per filter.
        pooled = F.max_pool1d(feature_maps, kernel_size=feature_maps.size(2))

        return self.decoder(pooled.squeeze(dim=2))





In [2]:
# Name the logger after the launching script (the ipykernel launcher inside Jupyter).
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)

logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
# Separate the arguments with spaces; joining on '' glued them into one
# unreadable token in the log line.
logger.info("running %s", ' '.join(sys.argv))

logging.info('loading data...')


2024-04-18 10:04:54,218: INFO: running D:\SoftWare\Anaconda\envs\pytorch\lib\site-packages\ipykernel_launcher.py-fC:\Users\FlashBlack7\AppData\Roaming\jupyter\runtime\kernel-1f8d313a-b8ba-477c-acd0-b1649f836ecf.json
2024-04-18 10:04:54,219: INFO: loading data...


In [3]:
pickle_file = 'depression_glove_42B_300d.pkl'
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that you produced yourself or otherwise trust.
# The context manager closes the file handle even if unpickling fails.
with open(pickle_file, 'rb') as pickle_fh:
    [train_features, train_labels, val_features, val_labels, test_features, weight, word_to_idx, idx_to_word, vocab] = pickle.load(pickle_fh)

In [4]:
# Build the network and place it on the configured device.
net = SentimentNet(
    embed_size=embed_size,
    num_filter=num_filter,
    filter_size=filter_size,
    weight=weight,
    labels=labels,
    use_gpu=use_gpu,
).to(device)

# Cross-entropy over the class scores, optimised with plain SGD.
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=lr)

# Wrap the tensors as datasets; the test split carries features only.
train_set = torch.utils.data.TensorDataset(train_features, train_labels)
val_set = torch.utils.data.TensorDataset(val_features, val_labels)
test_set = torch.utils.data.TensorDataset(test_features)

# Only the training loader shuffles; validation and test keep their original order.
train_iter = torch.utils.data.DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)
val_iter = torch.utils.data.DataLoader(dataset=val_set, batch_size=batch_size, shuffle=False)
test_iter = torch.utils.data.DataLoader(dataset=test_set, batch_size=batch_size, shuffle=False)


In [5]:
# Train for `num_epochs` epochs, evaluating on the validation split after each one.
for epoch in range(num_epochs):
    start = time.time()
    # Running totals are weighted by batch size so that a smaller final batch
    # does not skew the epoch-level loss/accuracy.
    train_loss_sum, val_loss_sum = 0.0, 0.0
    train_correct, val_correct = 0, 0
    train_seen, val_seen = 0, 0
    with tqdm(total=len(train_iter), desc='Epoch %d' % epoch) as pbar:
        net.train()
        for feature, label in train_iter:
            # Move the batch to the same device as the model.
            feature = feature.to(device)
            label = label.to(device)

            optimizer.zero_grad()
            score = net(feature)
            loss = loss_function(score, label)
            loss.backward()
            optimizer.step()

            batch_len = label.size(0)
            train_seen += batch_len
            # .item() yields a plain float; summing the loss tensor itself would
            # keep every batch's autograd history alive for the whole epoch.
            train_loss_sum += loss.item() * batch_len
            train_correct += (score.argmax(dim=1) == label).sum().item()

            pbar.set_postfix({'epoch': '%d' % (epoch),
                              'train loss': '%.4f' % (train_loss_sum / train_seen),
                              'train acc': '%.2f' % (train_correct / train_seen)
                              })
            pbar.update(1)

        # Validation pass: inference mode, no gradient tracking.
        net.eval()
        with torch.no_grad():
            for val_feature, val_label in val_iter:
                val_feature = val_feature.to(device)
                val_label = val_label.to(device)
                val_score = net(val_feature)
                val_loss = loss_function(val_score, val_label)

                val_batch_len = val_label.size(0)
                val_seen += val_batch_len
                val_loss_sum += val_loss.item() * val_batch_len
                val_correct += (val_score.argmax(dim=1) == val_label).sum().item()
        end = time.time()
        runtime = end - start
        # max(..., 1) guards against an empty loader.
        pbar.set_postfix({'epoch': '%d' % (epoch),
                          'train loss': '%.4f' % (train_loss_sum / max(train_seen, 1)),
                          'train acc': '%.2f' % (train_correct / max(train_seen, 1)),
                          'val loss': '%.4f' % (val_loss_sum / max(val_seen, 1)),
                          'val acc': '%.2f' % (val_correct / max(val_seen, 1)),
                          'time': '%.2f' % (runtime)})

        # tqdm.write('{epoch: %d, train loss: %.4f, train acc: %.2f, val loss: %.4f, val acc: %.2f, time: %.2f}' %
        #       (epoch, train_loss.data / n, train_acc / n, val_losses.data / m, val_acc / m, runtime))

# Predict a class index for every test sample, in loader order.
test_pred = []
net.eval()
with torch.no_grad():
    with tqdm(total=len(test_iter), desc='Prediction') as pbar:
        # Each batch is a 1-tuple because the test dataset holds features only.
        for test_feature, in test_iter:
            # Use the configured device rather than a hard-coded .cuda().
            test_feature = test_feature.to(device)
            test_score = net(test_feature)
            test_pred.extend(test_score.argmax(dim=1).tolist())
            pbar.update(1)

# result_output = pd.DataFrame(data={"id": test["id"], "sentiment": test_pred})
# result_output.to_csv("./result/cnn.csv", index=False, quoting=3)
# logging.info('result saved!')

Epoch 0: 100%|██████████| 14/14 [00:01<00:00, 11.97it/s, epoch=0, train loss=0.5049, train acc=0.79, val loss=0.4934, val acc=0.81, time=1.17]
Epoch 1: 100%|██████████| 14/14 [00:00<00:00, 31.71it/s, epoch=1, train loss=0.4621, train acc=0.83, val loss=0.4892, val acc=0.81, time=0.44]
Epoch 2: 100%|██████████| 14/14 [00:00<00:00, 31.80it/s, epoch=2, train loss=0.4564, train acc=0.84, val loss=0.4852, val acc=0.81, time=0.44]
Epoch 3: 100%|██████████| 14/14 [00:00<00:00, 31.58it/s, epoch=3, train loss=0.4463, train acc=0.84, val loss=0.4868, val acc=0.81, time=0.44]
Epoch 4: 100%|██████████| 14/14 [00:00<00:00, 31.72it/s, epoch=4, train loss=0.4465, train acc=0.84, val loss=0.4828, val acc=0.81, time=0.44]
Epoch 5: 100%|██████████| 14/14 [00:00<00:00, 31.83it/s, epoch=5, train loss=0.4430, train acc=0.84, val loss=0.4791, val acc=0.81, time=0.44]
Epoch 6: 100%|██████████| 14/14 [00:00<00:00, 31.75it/s, epoch=6, train loss=0.4410, train acc=0.84, val loss=0.4742, val acc=0.81, time=0.44]

In [6]:
# Save the trained model object to disk.
torch.save(obj=net, f='model_cnn.pkl')
# To load it back later:
# model = torch.load('model_cnn.pkl')




In [7]:
import numpy as np
import torch
import pandas as pd
# import numpy as np
from torch import nn
from torch.nn import functional as F




from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,  f1_score
import xgboost as xgb
from sklearn.metrics import precision_score,recall_score,roc_auc_score,roc_curve
import pickle


def metrics_sklearn( y_pred_,y_valid, y_score=None):
    """Print classification metrics for predictions on a validation/test set.

    Args:
        y_pred_: predicted class labels. Note that predictions come FIRST here,
            the reverse of scikit-learn's (y_true, y_pred) order.
        y_valid: ground-truth labels.
        y_score: optional continuous scores for the positive class (for
            example the predicted probability of class 1). When supplied, AUC
            and KS are computed from these scores. When omitted, they fall
            back to the hard labels in ``y_pred_`` as before, which collapses
            the ROC curve to a single operating point.

    Returns:
        None. Accuracy, precision, recall, F1, AUC and KS are printed as percentages.
    """
    # Ranking input for the threshold-based metrics (AUC / KS).
    ranking = y_pred_ if y_score is None else y_score

    # Accuracy
    accuracy = accuracy_score(y_valid, y_pred_)
    print('Accuracy：%.6f%%' % (accuracy * 100))

    # Precision
    precision = precision_score(y_valid, y_pred_)
    print('Precision：%.6f%%' % (precision * 100))

    # Recall
    recall = recall_score(y_valid, y_pred_)
    print('Recall：%.6f%%' % (recall * 100))

    # F1 score
    f1 = f1_score(y_valid, y_pred_)
    print('F1：%.6f%%' % (f1 * 100))

    # Area under the ROC curve
    auc = roc_auc_score(y_valid, ranking)
    print('AUC：%.6f%%' % (auc * 100))

    # KS statistic: largest gap between the TPR and FPR curves
    fpr, tpr, thresholds = roc_curve(y_valid, ranking)
    ks = max(abs(fpr - tpr))
    print('KS：%.6f%%' % (ks * 100))


In [8]:
# Switch the model to inference mode.
net.eval()

# Move the validation inputs to the configured device (the one the model was
# moved to) instead of hard-coding .cuda().
val_features = val_features.to(device)

# Forward pass without gradient tracking.
with torch.no_grad():
    output = net(val_features)
    # Convert the raw class scores to probabilities.
    probabilities = torch.nn.functional.softmax(output, dim=1)

# Predicted class = index of the highest probability in each row.
y_pred_cnn = probabilities.argmax(dim=1)

In [9]:
# Bring labels and predictions onto the CPU before handing them to the metric helpers.
val_labels, y_pred_cnn = val_labels.cpu(), y_pred_cnn.cpu()
print(val_labels.device)

cpu


In [10]:
# Report validation metrics; the helper takes predictions first, ground truth second.
metrics_sklearn(y_pred_=y_pred_cnn, y_valid=np.array(val_labels))

Accuracy：85.596708%
Precision：71.034483%
Recall：38.007380%
F1：49.519231%
AUC：67.234524%
KS：34.469048%
