In [None]:
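# LSTM (Long Short-Term Memory) works much like an ordinary RNN, but its gating
# mechanism sets it apart; that mechanism addresses the short-term memory problem of RNNs.
# Filters (gates) decide whether or not to pass along the result from the immediately
# preceding step, and the results from several steps earlier.
# Vanilla RNN: the plain (simple) RNN, i.e. one without such gates.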

In [None]:
import bz2
from collections import Counter
import re
import nltk
import numpy as np
# Tokenizer data used by nltk.word_tokenize.
nltk.download('punkt')
# NOTE(review): newer NLTK releases look up the 'punkt_tab' resource instead of
# 'punkt'; fetching both keeps word_tokenize usable on either version.
nltk.download('punkt_tab')

from google.colab import drive
drive.mount('/content/gdrive')

# readlines() returns a list with one bytes element per line of the file.
# The context managers close the compressed files as soon as the lines are
# in memory (the original left both handles open).
with bz2.BZ2File('/content/gdrive/My Drive/Test/amazone_reviews/train.ft.txt.bz2') as train_stream:
  train_file = train_stream.readlines()
with bz2.BZ2File('/content/gdrive/My Drive/Test/amazone_reviews/test.ft.txt.bz2') as test_stream:
  test_file = test_stream.readlines()

print("Number of training reviews: " + str(len(train_file)))
# Per the original note: about 3,600,000 training lines.
print("Number of test reviews: " + str(len(test_file)))
# Per the original note: about 400,000 test lines.

# Only the first num_train / num_test lines are kept and decoded
# (subset sizes taken from the reference environment).
num_train = 800000
num_test = 200000
train_file = [x.decode('utf-8') for x in train_file[:num_train]]
test_file = [x.decode('utf-8') for x in test_file[:num_test]]

print(train_file[0]) # show the first training line




In [None]:
def _label_of(line):
  """Return 0 when the first space-separated token is '__label__1', otherwise 1."""
  tag = line.split(' ')[0]
  if tag == '__label__1':
    return 0
  return 1

def _text_of(line):
  """Return everything after the first space, minus its final character, lower-cased."""
  remainder = line.split(' ',1)[1]
  # The dropped final character is presumably the trailing newline.
  return remainder[:-1].lower()

train_labels = list(map(_label_of, train_file))
train_sentences = list(map(_text_of, train_file))

test_labels = list(map(_label_of, test_file))
test_sentences = list(map(_text_of, test_file))

In [None]:
# Replace every digit character with '0'.
# re.sub(pattern, replacement, string[, count]) performs the regex substitution.
# The pattern is written as a raw string: '\d' inside a plain literal is an
# invalid escape sequence, which recent Python versions warn about.
_digit_pattern = re.compile(r'\d')

# Slice assignment keeps the same list objects, as the original index loop did.
train_sentences[:] = [_digit_pattern.sub('0', text) for text in train_sentences]
test_sentences[:] = [_digit_pattern.sub('0', text) for text in test_sentences]

In [None]:
def _mask_urls(sentences):
  """Replace URL-like tokens with "<url>", in place.

  Only sentences containing one of the URL hints are touched; in those, every
  run of non-space characters that ends in a dot followed by three lowercase
  letters is replaced.
  """
  url_hints = ('www.', 'http:', 'https:', '.com')
  for idx, text in enumerate(sentences):
    if not any(hint in text for hint in url_hints):
      continue
    sentences[idx] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", text)

_mask_urls(train_sentences)
_mask_urls(test_sentences)

# The decoded raw lines are no longer needed once labels and sentences exist.
del train_file, test_file


In [None]:
# Tokenize every training sentence (replacing the string with its token list)
# and count how often each lower-cased token occurs.
words = Counter()
for idx, raw_text in enumerate(train_sentences):
  tokens = list(nltk.word_tokenize(raw_text))
  words.update(token.lower() for token in tokens)
  train_sentences[idx] = tokens
  # Progress report every 20000 sentences.
  if idx % 20000 == 0:
    print(str((idx*100)/num_train)+"% done")
print("100% done")

In [None]:
# Keep only words seen more than once, ordered from most to least frequent.
words = {k:v for k,v in words.items() if v>1}
words = sorted(words, key=words.get, reverse = True)

# Index 0 is the padding token and index 1 the unknown-word token.
words = ['_PAD', '_UNK'] + words
word2idx = {o:i for i,o in enumerate(words)}
idx2word = {i:o for i,o in enumerate(words)}

unk_idx = word2idx['_UNK']

# Training sentences are already token lists at this point.
for i, sentence in enumerate(train_sentences):
  train_sentences[i] = [word2idx.get(word, unk_idx) for word in sentence]

# Test sentences are still raw strings here, so they must be tokenized first;
# iterating the string directly (as the original did) walks over single
# characters and encodes the test set character by character.
for i, sentence in enumerate(test_sentences):
  test_sentences[i] = [word2idx.get(word.lower(), unk_idx) for word in nltk.word_tokenize(sentence)]

def pad_input(sentences, seq_len):
  """Turn variable-length index sequences into a fixed-width integer matrix.

  Args:
    sentences: sequence of index sequences (one per review).
    seq_len: number of columns of the result.

  Returns:
    An int array of shape (len(sentences), seq_len). Shorter sequences are
    right-aligned with zeros on the left; longer ones keep only their first
    seq_len entries; empty sequences yield an all-zero row.
  """
  padded = np.zeros((len(sentences), seq_len), dtype = int)
  for row, tokens in enumerate(sentences):
    if len(tokens) == 0:
      # Nothing to copy; the row stays all zeros.
      continue
    # A negative start beyond the row width clamps to column 0, so over-long
    # sequences fill the whole row with their first seq_len entries.
    padded[row, -len(tokens):] = np.array(tokens)[:seq_len]
  return padded

In [None]:
seq_len = 200  # every review is padded/truncated to this many token indices

train_sentences, test_sentences = (
  pad_input(encoded, seq_len) for encoded in (train_sentences, test_sentences)
)

train_labels, test_labels = np.array(train_labels), np.array(test_labels)

# Quick peek at the first padded test row; as a mid-cell expression it is
# evaluated but, by default, not displayed.
test_sentences[0]

# The first split_frac of the held-out data becomes the validation set,
# the remainder stays as the test set.
split_frac = 0.5
split_id = int(split_frac * len(test_sentences))
val_sentences = test_sentences[:split_id]
val_labels = test_labels[:split_id]
test_sentences = test_sentences[split_id:]
test_labels = test_labels[split_id:]

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Wrap the padded index matrices and label vectors as tensor datasets.
train_data = TensorDataset(torch.from_numpy(train_sentences), torch.from_numpy(train_labels))
val_data = TensorDataset(torch.from_numpy(val_sentences), torch.from_numpy(val_labels))
test_data = TensorDataset(torch.from_numpy(test_sentences), torch.from_numpy(test_labels))

batch_size = 400

train_loader = DataLoader(train_data, shuffle = True, batch_size = batch_size)
val_loader = DataLoader(val_data, shuffle = True, batch_size = batch_size)
test_loader = DataLoader(test_data, shuffle = True, batch_size = batch_size)

# Use the GPU when one is available, otherwise fall back to the CPU.
is_cuda = torch.cuda.is_available()
if is_cuda:
  device = torch.device("cuda")
  print("GPU is available")
else:
  device = torch.device("cpu")
  print("GPU not available, CPU used")

# Sanity check: fetch one batch and show its shapes.
# The built-in next() is used because DataLoader iterators no longer expose a
# .next() method in current PyTorch releases.
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)
print(sample_x.shape, sample_y.shape)



In [None]:
import torch.nn as nn
class SentimentNet(nn.Module):
  """Embedding -> stacked LSTM -> dropout -> linear -> sigmoid classifier.

  Args:
    vocab_size: number of rows of the embedding table.
    output_size: number of output units of the final linear layer.
    embedding_dim: size of each embedding vector.
    hidden_dim: size of the LSTM hidden state.
    n_layers: number of stacked LSTM layers.
    drop_prob: dropout probability applied between LSTM layers.
  """
  def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob = 0.5):
    super(SentimentNet, self).__init__()
    self.output_size = output_size
    self.n_layers = n_layers
    self.hidden_dim = hidden_dim
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout = drop_prob, batch_first = True)
    self.dropout = nn.Dropout(0.2)
    self.fc = nn.Linear(hidden_dim, output_size)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x, hidden=None):
    """Score a batch of index sequences.

    Args:
      x: integer tensor indexed as (batch, sequence); cast to long internally.
      hidden: optional (h, c) tuple for the LSTM; when None the LSTM starts
        from its default zero state.

    Returns:
      (out, hidden): out has shape (batch,) and holds the sigmoid output of
      the last output unit at the last time step; hidden is the LSTM's final
      (h, c) state.
    """
    x = x.long()
    embeds = self.embedding(x)
    lstm_out, hidden = self.lstm(embeds, hidden)

    # Only the last time step is ever returned, so slice it out before the
    # classifier head instead of running dropout/linear/sigmoid over every
    # time step and discarding all but one of the results.
    last_step = lstm_out[:, -1, :]
    out = self.dropout(last_step)
    out = self.fc(out)
    out = self.sigmoid(out)
    # Keep the original contract: one value per sample (the last output unit).
    out = out[:, -1]
    return out, hidden

  def init_hidden(self, batch_size):
    """Return a zero (h, c) tuple shaped (n_layers, batch_size, hidden_dim).

    The tensors take their dtype and device from the model's own parameters,
    so the method does not depend on any module-level `device` variable.
    """
    weight = next(self.parameters())
    h0 = weight.new_zeros(self.n_layers, batch_size, self.hidden_dim)
    c0 = weight.new_zeros(self.n_layers, batch_size, self.hidden_dim)
    return (h0, c0)


In [None]:
# Model hyper-parameters.
vocab_size = len(word2idx) +1
output_size = 1
embedding_dim = 400
hidden_dim = 512
n_layers = 2

model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
model.to(device)
print(model)
lr = 0.005
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = lr)

# Training-loop settings.
epochs = 2
counter = 0
print_every = 1000   # validate every this many training steps
clip = 5             # gradient-norm clipping threshold
valid_loss_min = np.inf   # np.Inf was removed in NumPy 2.0
# The drive is mounted at /content/gdrive; the original '/content.gdrive'
# pointed at a directory that does not exist.
checkpoint_path = '/content/gdrive/My Drive/Test/amazone_reviews/state_dict.pt'

model.train()
for i in range(epochs):
  for inputs, labels in train_loader:
    counter +=1
    inputs, labels = inputs.to(device), labels.to(device)
    # Batches are shuffled, independent reviews, so each one starts from a
    # fresh zero state sized to the actual batch (also handles a short final
    # batch) instead of inheriting the previous batch's hidden state.
    h = model.init_hidden(inputs.size(0))
    model.zero_grad()
    output, h = model(inputs, h)
    # The model already returns shape (batch,), matching the labels.
    loss = criterion(output, labels.float())
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()

    if counter%print_every == 0:
      val_losses = []
      model.eval()
      # No gradients are needed while validating.
      with torch.no_grad():
        for inp, lab in val_loader:
          inp, lab = inp.to(device), lab.to(device)
          val_h = model.init_hidden(inp.size(0))
          out, val_h = model(inp,val_h)
          val_loss = criterion(out, lab.float())
          val_losses.append(val_loss.item())
      mean_val_loss = np.mean(val_losses)

      model.train()
      # "Loss" reports the current training loss (the original printed the
      # validation mean in both fields).
      print("Epoch : {}/{}...".format(i+1,epochs),
            "Step : {}...".format(counter),
            "Loss: {:.6f}...".format(loss.item()),
            "Val Loss: {:.6f}" .format(mean_val_loss))
      # Checkpoint whenever the validation loss is at least as good as the best so far.
      if mean_val_loss <= valid_loss_min:
        torch.save(model.state_dict(), checkpoint_path)
        print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model...'.format(valid_loss_min,mean_val_loss))
        valid_loss_min = mean_val_loss



In [None]:
# Restore the saved weights from the mounted drive ('/content/gdrive', not
# '/content.gdrive'); map_location lets a GPU-saved checkpoint load on whichever
# device is in use now.
model.load_state_dict(torch.load('/content/gdrive/My Drive/Test/amazone_reviews/state_dict.pt', map_location=device))

test_losses = []
num_correct = 0
model.eval()
# No gradients are needed while evaluating.
with torch.no_grad():
  for inputs, labels in test_loader:
    inputs, labels = inputs.to(device), labels.to(device)
    # Fresh zero state per batch, sized to the actual batch.
    h = model.init_hidden(inputs.size(0))
    output, h = model(inputs, h)
    test_loss = criterion(output, labels.float())
    test_losses.append(test_loss.item())
    # Round the sigmoid output to a 0/1 prediction and count matches with the labels.
    pred = torch.round(output)
    correct_tensor = pred.eq(labels.float().view_as(pred))
    num_correct += int(correct_tensor.sum().item())
print("Test loss: {:.3f}".format(np.mean(test_losses)))
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}%".format(test_acc*100))