In [11]:
import os

import pandas as pd
import torch
from torch.utils.data import DataLoader
# Load the preprocessed dataset; every sentence is 15 tokens long.
# Each row is later unpacked into a comma-separated token-id string and a label.
data = pd.read_csv('sst2/data.csv')

In [12]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Dataset definition
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that yields ``(token_ids, label)`` tensor pairs.

    Each row of the backing frame must unpack into exactly two values: a
    comma-separated string of integer token ids and an integer-like label.

    Args:
        frame: DataFrame to read rows from. When omitted, the notebook-level
            ``data`` frame is used, so ``Dataset()`` keeps working as before.
        target_device: Device the returned tensors are moved to. When
            omitted, the notebook-level ``device`` is used.
    """

    def __init__(self, frame=None, target_device=None):
        super().__init__()
        # Only fall back to the notebook globals when no explicit value was
        # passed, so the class can also wrap other splits (e.g. held-out data).
        self.frame = data if frame is None else frame
        self.target_device = device if target_device is None else target_device

    def __len__(self):
        """Return the number of rows (samples) in the backing frame."""
        return len(self.frame)

    def __getitem__(self, i):
        """Return sample ``i`` as ``(long tensor of token ids, scalar long label)``."""
        # Fetch the row: token-id string and label
        x, y = self.frame.iloc[i]

        # Split on commas and convert every piece to an integer token id
        token_ids = [int(tok) for tok in x.split(',')]
        x = torch.tensor(token_ids, dtype=torch.long).to(self.target_device)

        # The label only needs casting to a scalar long tensor
        y = torch.tensor(int(y), dtype=torch.long).to(self.target_device)
        return x, y


# Instantiate the dataset over the loaded frame
dataset = Dataset()

# Show the sample count and the first (token_ids, label) pair
len(dataset), dataset[0]

(65000,
 (tensor([  101,  5342,  2047,  3595,  8496,  2013,  1996, 18643,  3197,   102,
              0,     0,     0,     0,     0], device='cuda:0'),
  tensor(0, device='cuda:0')))

In [13]:
# Data loader: batches of 8 samples, shuffled; drop_last discards a final
# incomplete batch so every batch has exactly 8 rows
loader = DataLoader(dataset=dataset,batch_size=8,shuffle=True,drop_last=True)
# Show the number of batches and one example batch
len(loader), next(iter(loader))

(8125,
 [tensor([[  101,  1996,  3185,  2003,  8990,  2302,  6123,  1011,  1011,  4988,
            2594,  2030,  3439,  1012,   102],
          [  101,  5248,  1998,  5729, 21407,   102,     0,     0,     0,     0,
               0,     0,     0,     0,     0],
          [  101, 19957,   102,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0],
          [  101, 10628,  5876,   102,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0],
          [  101, 23438,   102,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0],
          [  101, 24970,  2015,  1999,  1996,  2396,  1997,  5263, 14489,  1013,
            2128, 29098, 27242,  4490,   102],
          [  101,  1037,  2882,  2778,   102,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0],
          [  101,  2003,  2738,  3653,  6528, 20771,   102,     0,     0,     0,
               0

In [14]:
# GRU-based recurrent sentence classifier
class Model(torch.nn.Module):
    """Embed token ids, run a GRU cell over them left to right, classify the final state.

    Args:
        num_embeddings: Vocabulary size of the embedding table.
        embedding_dim: Size of each token embedding vector.
        hidden_size: Size of the GRU hidden state.
        num_classes: Number of output logits.

    The defaults reproduce the original fixed configuration
    (30522-token vocabulary, 100-dim embeddings, 512-dim state, 2 classes).
    """

    # Model initialisation
    def __init__(self, num_embeddings=30522, embedding_dim=100,
                 hidden_size=512, num_classes=2):
        super().__init__()

        # Token embedding table
        self.embed = torch.nn.Embedding(num_embeddings=num_embeddings,
                                        embedding_dim=embedding_dim)

        # Recurrent cell, applied once per token position
        self.cell = torch.nn.GRUCell(input_size=embedding_dim,
                                     hidden_size=hidden_size)

        # Linear classification head
        self.fc = torch.nn.Linear(in_features=hidden_size,
                                  out_features=num_classes)

    # Forward computation
    def forward(self, x):
        """Return logits ``[batch, num_classes]`` for token ids ``x`` of shape ``[batch, seq_len]``."""
        # [batch, seq_len] -> [batch, seq_len, embedding_dim]
        x = self.embed(x)

        # Passing None makes GRUCell start from an all-zero hidden state
        h = None

        # Read the sentence front to back; every position is consumed, so
        # trailing 0 ids in a row also update the state
        for step in x.unbind(dim=1):
            # [batch, embedding_dim], [batch, hidden_size] -> [batch, hidden_size]
            h = self.cell(step, h)

        # Classify the whole sentence from the state after the last position
        # [batch, hidden_size] -> [batch, num_classes]
        return self.fc(h)


# Build the model on the selected device and run one smoke-test forward pass
model = Model()
model = model.to(device)
# Dummy batch of shape [8, 15] filled with token id 1
input_tensor = torch.ones((8, 15), dtype=torch.long, device=device)
output = model(input_tensor)

In [15]:
# Training
def train(epochs=2, lr=0.001, log_every=2000, save_path='model/7.model'):
    """Train the notebook-level ``model`` on ``loader`` and pickle it to disk.

    Args:
        epochs: Number of full passes over ``loader``.
        lr: Learning rate for the Adam optimizer.
        log_every: Print epoch, step, loss and batch accuracy every this many steps.
        save_path: File the trained model object is written to.

    The defaults reproduce the original hard-coded run.
    """
    # Create the checkpoint directory up front so a missing folder fails (or
    # is fixed) before training starts rather than after the last epoch.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fun = torch.nn.CrossEntropyLoss()
    model.train()

    for epoch in range(epochs):
        for i, (x, y) in enumerate(loader):
            x, y = x.to(device), y.to(device)
            out = model(x)
            loss = loss_fun(out, y)

            # Clear gradients before backward so anything left over from an
            # earlier call cannot leak into this step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if i % log_every == 0:
                # Accuracy of the current batch only, not of the whole dataset
                acc = (out.argmax(dim=1) == y).sum().item() / len(y)
                print(epoch, i, loss.item(), acc)

    # Saves the whole module object (not a state_dict); test() loads it back
    # the same way
    torch.save(model, save_path)


train()

0 0 0.6796548366546631 0.5
0 2000 0.7838699221611023 0.5
0 4000 0.1984056681394577 1.0
0 6000 0.22403311729431152 0.875
0 8000 0.0984111875295639 1.0
1 0 0.1287842094898224 0.875
1 2000 0.27253472805023193 0.75
1 4000 0.017153438180685043 1.0
1 6000 0.16958963871002197 0.875
1 8000 0.7404332160949707 0.875


In [16]:
# Evaluation
def test(num_batches=100, model_path='model/7.model'):
    """Load the saved model and print its accuracy over ``num_batches`` batches.

    Args:
        num_batches: Maximum number of batches drawn from ``loader``.
        model_path: File written by ``train()``.

    NOTE: ``loader`` is the same loader ``train()`` iterates, so the printed
    number is accuracy on training data, not on a held-out split.
    """
    # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
    # NOTE: this unpickles a full module object, so only load trusted files.
    # PyTorch >= 2.6 defaults torch.load to weights_only=True; on those
    # versions pass weights_only=False for this locally written checkpoint.
    model = torch.load(model_path, map_location=device)
    model.eval()
    model.to(device)
    correct = 0
    total = 0
    # A bare `torch.no_grad()` statement does nothing; it has to be entered
    # as a context manager to actually disable gradient tracking.
    with torch.no_grad():
        # One pass over a single iterator yields distinct batches; zip stops
        # after num_batches without fetching an extra one.
        for _, (x, y) in zip(range(num_batches), loader):
            x, y = x.to(device), y.to(device)
            pred = model(x).argmax(dim=1)
            correct += (pred == y).sum().item()
            total += len(y)
    print(correct / total)


test()

0.93625
