In [29]:
import numpy as np
import pandas as pd 
import torch
import torch.nn as nn 
import torch.nn.functional as F
from torch.utils.data import DataLoader,Dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import accuracy_score
#tokenizer setting 
NUM_WORD = 10000
SENTENCE_LEN = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
EMBED_DIM = 1000
batch_size = 128
import random
random.seed(3344)
np.random.seed(3344)
torch.manual_seed(3344)
torch.cuda.manual_seed(3344)

In [30]:
data_path ='reviews_with_splits_lite.csv'
data = pd.read_csv(data_path,delimiter=',')
#positive reviews 1 negative reviews 0
data['rating']=data['rating'].apply(lambda x: 0 if x=='negative' else 1)
data.head()

Unnamed: 0,rating,review,split
0,0,terrible place to work for i just heard a stor...,train
1,0,"hours , minutes total time for an extremely s...",train
2,0,my less than stellar review is for service . w...,train
3,0,i m granting one star because there s no way t...,train
4,0,the food here is mediocre at best . i went aft...,train


In [31]:
train_data = data[data.split=='train']
val_data = data[data.split=='val']
test_data = data[data.split=='test']

In [32]:
#make tokenizer
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(data.review)
#transform to numerical value
train_seq = tokenizer.texts_to_sequences(train_data.review)
val_seq = tokenizer.texts_to_sequences(val_data.review)
test_seq = tokenizer.texts_to_sequences(test_data.review)

In [33]:
# print(train_seq[0])
x_train = pad_sequences(train_seq,maxlen=SENTENCE_LEN)
print(len(x_train))
x_val = pad_sequences(val_seq,maxlen=SENTENCE_LEN)
x_test = pad_sequences(test_seq,maxlen=SENTENCE_LEN)
#make dataloader 
class text_Dataset(Dataset):
    def __init__(self,data,label):
        self.data = torch.tensor(data).to(torch.int64)
        self.label = label
    def __getitem__(self,index):
        data = self.data[index]
        label = self.label[index]
        return data,label 

    def __len__(self):
        return len(self.data)

#print(len(train_data['rating']),train_data['rating'][39200])
train_dataset = text_Dataset(x_train,train_data.rating.to_numpy())
#print(len(train_dataset),len(train_dataset[0]),train_dataset[19600])
val_dataset = text_Dataset(x_val,val_data.rating.to_numpy())
test_dataset = text_Dataset(x_test,test_data.rating.to_numpy())
train_dataloader = DataLoader(train_dataset,batch_size=batch_size,shuffle=True)
val_dataloader = DataLoader(val_dataset,batch_size=batch_size,shuffle=True)
test_dataloader = DataLoader(test_dataset,batch_size=batch_size,shuffle=False)

39200


In [34]:
#build model
#Text CNN
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim,output_dim,filter_sizes, num_filter=1,
                  dropout=0.2, pad_idx=0):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filter,
                      kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        # in_channels：输入的channel，文字都是1
        # out_channels：输出的channel维度
        # fs：每次滑动窗口计算用到几个单词,相当于n-gram中的n
        # for fs in filter_sizes用好几个卷积模型最后concate起来看效果。

        self.fc = nn.Linear(len(filter_sizes) * num_filter, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.dropout(self.embedding(text))  # [batch size, sent len, emb dim]
        embedded = embedded.unsqueeze(1)  # [batch size, 1, sent len, emb dim]
        #print(embedded.shape)
        # 升维是为了和nn.Conv2d的输入维度吻合，把channel列升维。
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        print(conved[0].shape,conved[1].shape,conved[2].shape)
        # conved = [batch size, num_filter, sent len - filter_sizes+1]
        # 有几个filter_sizes就有几个conved

        pooled = [F.max_pool1d(conv,conv.shape[2]).squeeze(2) for conv in conved]  # [batch,num_filter]
        print(pooled[0].shape,pooled[1].shape,pooled[2].shape)
        x_cat=torch.cat(pooled, dim=1)
        print(x_cat.shape)
        cat = self.dropout(x_cat)
        # cat = [batch size, num_filter * len(filter_sizes)]
        # 把 len(filter_sizes)个卷积模型concate起来传到全连接层。
        return self.fc(cat)
        
# model = CNN(SENTENCE_LEN,EMBED_DIM,output_dim=2,filter_sizes=[2,3,4]).to(device)
# model.eval()
class TCNN(nn.Module):
    def __init__(self,embed_dim):
        super(TCNN,self).__init__()
        self.embed = nn.Embedding(NUM_WORD,embed_dim)
        self.conv1 = nn.Conv2d(1,1,3)
        self.conv2 = nn.Conv2d(1,1,3)
        self.conv3 = nn.Conv2d(1,1,3)
        self.fc = nn.Linear(93436,2)
    def forward(self,x):
        out = self.embed(x)
        out = out.unsqueeze(1)
        #print(out.shape)
        out = F.relu(self.conv1(out))
        #print(out.shape)
        out = F.relu(self.conv2(out))
        #print(out.shape)
        out = F.relu(self.conv3(out))
        #print(out.shape)
        out = out.view(out.size()[0],-1)
        #print(out.shape)
        out = self.fc(out)
        return out 
model = TCNN(EMBED_DIM).to(device)
# train setting 
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
criterion = nn.CrossEntropyLoss()
epochs = 10

In [35]:
#training loop
step =0
import time
for epoch in range(epochs):
    epoch_start_time = time.time()
    model.train()
    train_acc =0.0
    val_acc =0.0
    train_loss = 0.0
    val_loss =0.0
    for i,data in enumerate(train_dataloader):
        step+=1
        #print(step)
        x = data[0].to(device)
        y = data[1].to(device)
        optimizer.zero_grad()
        out = model(x)
        # print(out)
        # print(y)
        # break
        loss = criterion(out,y.long())
        loss.backward()
        #losses.append(loss.item())
        optimizer.step()
        train_acc +=np.sum(np.argmax(out.cpu().data.numpy(),axis=1)== y.cpu().numpy())
        train_loss +=loss.item()
        #y_pred.append(torch.argmax(out,dim=1))
    #validation
    model.eval()
    with torch.no_grad():
        for i,data in enumerate(val_dataloader):
            valx ,valy = data[0].to(device),data[1].to(device)
            val_pred = model(valx)
            batch_loss = criterion(val_pred,valy.long())
            val_acc +=np.sum(np.argmax(val_pred.cpu().data.numpy(),axis=1)== valy.cpu().numpy())
            val_loss +=batch_loss.item()

        print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f | Val Acc: %3.6f loss: %3.6f' % \
            (epoch + 1, epochs, time.time()-epoch_start_time, \
             train_acc/train_dataset.__len__(), train_loss/train_dataset.__len__(), val_acc/val_dataset.__len__(), val_loss/val_dataset.__len__()))
    model.train()

[001/010] 68.92 sec(s) Train Acc: 0.726122 Loss: 0.004106 | Val Acc: 0.819881 loss: 0.003265
[002/010] 71.74 sec(s) Train Acc: 0.935332 Loss: 0.001360 | Val Acc: 0.842262 loss: 0.003070
[003/010] 73.48 sec(s) Train Acc: 0.987934 Loss: 0.000296 | Val Acc: 0.845119 loss: 0.004115
[004/010] 73.10 sec(s) Train Acc: 0.996888 Loss: 0.000091 | Val Acc: 0.844048 loss: 0.005253
[005/010] 73.47 sec(s) Train Acc: 0.998367 Loss: 0.000043 | Val Acc: 0.848452 loss: 0.005844
[006/010] 73.47 sec(s) Train Acc: 0.998776 Loss: 0.000027 | Val Acc: 0.846310 loss: 0.006478
[007/010] 73.86 sec(s) Train Acc: 0.999260 Loss: 0.000020 | Val Acc: 0.847619 loss: 0.006858
[008/010] 74.25 sec(s) Train Acc: 0.999337 Loss: 0.000017 | Val Acc: 0.847262 loss: 0.007282
[009/010] 74.46 sec(s) Train Acc: 0.999260 Loss: 0.000014 | Val Acc: 0.846310 loss: 0.007669
[010/010] 76.56 sec(s) Train Acc: 0.999388 Loss: 0.000012 | Val Acc: 0.847619 loss: 0.007909


In [38]:
#test
model.eval()
prediction = []
with torch.no_grad():
    for i,data in enumerate(test_dataloader):
        testx = data[0].to(device)
        test_pred = model(testx)
        test_label = np.argmax(test_pred.cpu().data.numpy(),axis=1)
        for y in test_label:
            prediction.append(y)

acc = accuracy_score(test_data.rating.to_numpy(),prediction)
print('test acc:',acc)

test acc: 0.8505952380952381
