In [1]:
# !pip install torchtext==0.6.0

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import time

import torchtext

In [3]:
# Label field: a single non-sequential value per example.
LABEL = torchtext.data.Field(sequential=False)

# Text field: lower-cased tokens, fixed length of 200 per example, and
# batches laid out sequence-first (batch_first=False).
TEXT = torchtext.data.Field(
    batch_first=False,
    fix_length=200,
    lower=True,
)

In [4]:
from torchtext.datasets import IMDB

# Load the IMDB train/test splits, processed with the fields defined above.
train_data, test_data = IMDB.splits(text_field=TEXT, label_field=LABEL)

In [5]:
# Show the attribute dict of the first training example as a sanity check.
print(vars(train_data[0]))

{'text': ['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life,', 'such', 'as', '"teachers".', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', "high's", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"teachers".', 'the', 'scramble', 'to', 'survive', 'financially,', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', "teachers'", 'pomp,', 'the', 'pettiness', 'of', 'the', 'whole', 'situation,', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students.', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school,', 'i', 'immediately', 'recalled', '.........', 'at', '..........', 'high.', 'a', 'classic', 'line:', 'inspector:', "i'm", 'here', 'to', 'sack', 'one', 'of', '

## 전처리

In [6]:
import string

# Translation table that deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def clean_tokens(tokens):
    """Normalize the token list of one review.

    Each token is lower-cased, has the HTML line-break fragment "<br"
    removed, and is stripped of ASCII punctuation. Tokens that end up
    empty are dropped.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list with the cleaned, non-empty tokens in original order.
    """
    cleaned = []
    for token in tokens:
        # "<br" must go before punctuation stripping, otherwise a bare "br"
        # would survive once the "<" is removed.
        token = token.lower().replace("<br", "").translate(_PUNCT_TABLE)
        if token:  # drop tokens that became empty
            cleaned.append(token)
    return cleaned


# Apply the same cleaning to both splits so train and test tokens agree.
for dataset in (train_data, test_data):
    for example in dataset.examples:
        vars(example)["text"] = clean_tokens(vars(example)["text"])

In [7]:
import random

# random.seed() returns None, so its return value cannot be used as
# random_state; seed first and pass the resulting RNG state instead.
random.seed(42)

# Rebind train_data to the 80% portion (the original bound the result to a
# misspelled name, `train_Data`), so the vocab and training iterator built
# from train_data no longer include the validation examples.
# NOTE: re-running this cell splits the already reduced train_data again;
# re-run the data loading cell first.
train_data, valid_data = train_data.split(
    split_ratio=0.8,
    random_state=random.getstate(),
)

In [8]:
# Label vocabulary (the prediction targets).
LABEL.build_vocab(train_data)

# Token vocabulary built from train_data, capped with max_size / min_freq;
# no pretrained vectors are attached.
TEXT.build_vocab(
    train_data,
    vectors=None,
    min_freq=10,
    max_size=10000,
)
# print(vars(TEXT.vocab))  # inspect the vocab (comparable to a bag-of-words table in classic ML)

In [9]:
# Hyperparameters shared by the iterators and the model cells.
BATCH_SIZE = 64
hidden_size = 300
embeding_dim = 100

# One bucketing iterator per split, all using the same batch size.
train_iterator, valid_iterator, test_iterator = torchtext.data.BucketIterator.splits(
    datasets=(train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
)

In [10]:
class RNNCell_Encoder(nn.Module):
    """Encode a sequence by stepping an ``nn.RNNCell`` over it.

    Args:
        input_dim: Size of each input vector.
        hidden_size: Size of the hidden state.
    """

    def __init__(self, input_dim, hidden_size):
        super(RNNCell_Encoder, self).__init__()
        # Keep the size on the instance so forward() does not depend on a
        # notebook-level global of the same name.
        self.hidden_size = hidden_size
        self.rnn = nn.RNNCell(input_dim, hidden_size)

    def forward(self, inputs):
        """Return the hidden state after the last time step.

        Args:
            inputs: Tensor iterated along dim 0 (time steps); dim 1 is the
                batch dimension, i.e. (seq_len, batch, input_dim).

        Returns:
            Tensor of shape (batch, hidden_size).
        """
        bz = inputs.shape[1]
        # new_zeros allocates on the same device/dtype as the inputs.
        ht = inputs.new_zeros((bz, self.hidden_size))

        for word in inputs:  # update the hidden state one time step at a time
            ht = self.rnn(word, ht)
        return ht

class LSTMCell_Encoder(nn.Module):
    """Encode a sequence by stepping an ``nn.LSTMCell`` over it.

    Args:
        input_dim: Size of each input vector.
        hidden_size: Size of the hidden and cell states.
    """

    def __init__(self, input_dim, hidden_size):
        super(LSTMCell_Encoder, self).__init__()
        # Keep the size on the instance so forward() does not depend on a
        # notebook-level global of the same name.
        self.hidden_size = hidden_size
        self.lstm = nn.LSTMCell(input_dim, hidden_size)

    def forward(self, inputs):
        """Return the hidden state after the last time step.

        Args:
            inputs: Tensor iterated along dim 0 (time steps); dim 1 is the
                batch dimension, i.e. (seq_len, batch, input_dim).

        Returns:
            Tensor of shape (batch, hidden_size); the cell state is discarded.
        """
        bz = inputs.shape[1]
        # new_zeros allocates on the same device/dtype as the inputs.
        ht = inputs.new_zeros((bz, self.hidden_size))
        ct = inputs.new_zeros((bz, self.hidden_size))
        for word in inputs:  # update (hidden, cell) one time step at a time
            # The cell is registered as self.lstm; self.rnn does not exist here.
            ht, ct = self.lstm(word, (ht, ct))
        return ht

class Net(nn.Module):
    """Text classifier: embedding -> LSTM-cell encoder -> two linear layers.

    Layer sizes come from the notebook-level ``TEXT`` vocabulary and the
    ``embeding_dim`` / ``hidden_size`` settings.
    """

    def __init__(self):
        super().__init__()
        vocab_size = len(TEXT.vocab.stoi)
        # token index -> embeding_dim vector
        self.em = nn.Embedding(vocab_size, embeding_dim)
        # embeding_dim -> hidden_size (final hidden state of the sequence)
        self.rnn = LSTMCell_Encoder(embeding_dim, hidden_size)
        # hidden_size -> 256
        self.fc1 = nn.Linear(hidden_size, 256)
        # 256 -> 3 logits.
        # NOTE(review): presumably 3 matches the size of LABEL.vocab — confirm.
        self.fc2 = nn.Linear(256, 3)

    def forward(self, x):
        """Map a batch of token indices to class logits."""
        embedded = self.em(x)
        encoded = self.rnn(embedded)
        hidden = F.relu(self.fc1(encoded))
        return self.fc2(hidden)

In [11]:
# Instantiate the classifier defined above.
model = Net()
# Adam over all model parameters.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)
# Cross-entropy on the raw logits returned by the model.
loss_fn = nn.CrossEntropyLoss()

## 문제 : 네이버리뷰 감정분석 (0,1 분류)
- 단, 머신러닝X
- 한국어처리가능
- LSTM
- colab (GPU)