# 数据准备

In [66]:
import numpy as np
import pandas as pd

In [67]:
data = np.load(r'D:\model\web\nlp05\data\data.npz', allow_pickle=True)

In [68]:
data.files

['x', 'y']

In [69]:
X,y = data['x'],data['y']
print(X.shape,y.shape)

(28720,) (28720,)


## 词嵌入

In [70]:
crops = pd.Series(X).apply(lambda x: list(map(str,x))).tolist()
crops[0]

['45293',
 '34076',
 '15125',
 '22530',
 '39738',
 '65278',
 '16847',
 '5882',
 '33107',
 '16847',
 '5270',
 '16847',
 '65578']

In [71]:
from gensim.models.word2vec import Word2Vec

In [72]:
w2v_model = Word2Vec(sentences=crops,vector_size=128,min_count=3,window=2)
w2v_model

<gensim.models.word2vec.Word2Vec at 0x16a48116e60>

## 词向量矩阵

In [73]:
# 语料库
vocab_list = w2v_model.wv.index_to_key
len(vocab_list)

25455

In [74]:
word_matrix = np.zeros([len(vocab_list)+1,w2v_model.vector_size],dtype=np.float32)
for i in range(len(vocab_list)):
    word_matrix[i+1,:]=w2v_model.wv[i]
word_matrix = pd.DataFrame(word_matrix,index=['0']+vocab_list).values
word_matrix.shape

(25456, 128)

In [75]:
# 编码后的语料库
key_to_idex = w2v_model.wv.key_to_index

In [76]:
crops_encoder = pd.Series(crops).apply(lambda x:[np.int32(key_to_idex.get(i,-1)+1) for i in x])
crops_encoder

0        [4083, 14181, 10669, 44, 10028, 10020, 541, 0,...
1        [0, 8663, 2428, 44, 584, 723, 5743, 20181, 341...
2                    [351, 773, 145, 44, 0, 563, 1005, 81]
3        [13377, 5842, 721, 7203, 4289, 10645, 44, 14, ...
4           [81, 1246, 44, 3114, 0, 2528, 2528, 1770, 806]
                               ...                        
28715    [76, 8476, 15, 13731, 10688, 560, 4979, 0, 15,...
28716    [1022, 14238, 1448, 7213, 1177, 391, 1022, 142...
28717    [0, 956, 0, 1413, 0, 6218, 4282, 108, 6862, 83...
28718    [76, 916, 0, 4681, 0, 3947, 23867, 25401, 0, 3...
28719    [761, 2, 20157, 136, 22070, 10918, 4608, 25455...
Length: 28720, dtype: object

## 统一句子长度

In [77]:
max_len = 60     # 固定词长
crops_encoder_new = crops_encoder.apply(lambda x: ([0]*max_len+x)[-max_len:])
crops_encoder_new

0        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                               ...                        
28715    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
28716    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
28717    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
28718    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
28719    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Length: 28720, dtype: object

## 划分数据 

In [79]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(crops_encoder_new,y,test_size=0.2,stratify=y, random_state = 42)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(22976,) (5744,) (22976,) (5744,)


## 封装到数据集中

In [80]:
X_train_tensor = torch.tensor(X_train.to_list(),dtype=torch.int32)
X_test_tensor = torch.tensor(X_test.to_list(),dtype=torch.int32)
y_train_tensor = torch.tensor(y_train)

train_ds = torch.utils.data.TensorDataset(X_train_tensor,y_train_tensor)
train_dl = torch.utils.data.DataLoader(train_ds,batch_size=32,shuffle=True)

In [81]:
for i in train_ds:
    print(i)
    break

(tensor([    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,  2621,   285,  6529,  1555,     8,     0, 13597, 22655,   370,
            8,  6355,   285,     0,   613,     0,    13, 12534,  9425,   164,
            3,     0,  7156, 15485, 14804,     0,  6066,  1287,   720,     0],
       dtype=torch.int32), tensor(0))


# 建模

In [82]:
import torch
class BiLSTM(torch.nn.Module):
    def __init__(self,embeddings=None,hidden_size=16,max_len=60,output_size=2):
        super(BiLSTM, self).__init__()
        vocab_size, embedding_size = embeddings.shape     
        self.embedding = torch.nn.Embedding(
            num_embeddings=vocab_size,       
            embedding_dim=embedding_size,    
            _weight=torch.tensor(embeddings, dtype=torch.float32),    
            padding_idx=0
        )
        self.lstm =torch.nn.LSTM(
            input_size=embedding_size,#词向量大小
            hidden_size=hidden_size,#隐藏神经元
            batch_first=True,
            bidirectional=True,#双向
        )
        self.fc1 = torch.nn.Linear(
            in_features=max_len*2*hidden_size,
            out_features=16
        )
        self.fc2 = torch.nn.Linear(
            in_features=16,
            out_features=output_size,
        )
        self.flatten = torch.nn.Flatten()
    def forward(self,x):
        x = self.embedding(x)
        x,_ = self.lstm(x)
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.fc2(x)
        return x

In [83]:
network = BiLSTM(embeddings=word_matrix) #实例模型
loss_fn = torch.nn.CrossEntropyLoss() #损失函数
optimizer = torch.optim.Adam(network.parameters(),lr=0.001) #优化器

In [84]:
network

BiLSTM(
  (embedding): Embedding(25456, 128, padding_idx=0)
  (lstm): LSTM(128, 16, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=1920, out_features=16, bias=True)
  (fc2): Linear(in_features=16, out_features=2, bias=True)
  (flatten): Flatten(start_dim=1, end_dim=-1)
)

# 构建训练函数

In [85]:
def fif(epochs):
    for epoch in range(epochs):
        for x_batch,label in train_dl:
            y_pre = network(x_batch)
            loss = loss_fn(y_pre,label)
            network.zero_grad()
            loss.backward()
            optimizer.step()
        print('第{}轮训练，最后一批样本的训练损失值为：{}'.format(epoch+1, loss.item()))
fif(epochs=3)

第1轮训练，最后一批样本的训练损失值为：0.10237821191549301
第2轮训练，最后一批样本的训练损失值为：0.15801899135112762
第3轮训练，最后一批样本的训练损失值为：0.10594882071018219


## 模型性能评估

In [86]:
pre = network(X_test_tensor)
result = pre.data.numpy().argmax(axis=1)
acc_test = (result== y_test).mean()
acc_test

0.8687325905292479