<a href="https://colab.research.google.com/github/DonghaeSuh/NLP_Pytorch/blob/main/Preprocessing/TorchText_English.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# torchtext 설치 ( 버전을 맞춰야 torchtext.data의 Field 사용 가능 )

In [1]:
pip install torchtext==0.6

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import urllib.request
import pandas as pd

In [3]:
# Download the IMDb review CSV into the working directory. The cell displays
# the (local filename, response headers) tuple returned by urlretrieve.
urllib.request.urlretrieve(
    url="https://raw.githubusercontent.com/LawrenceDuan/IMDb-Review-Analysis/master/IMDb_Reviews.csv",
    filename="IMDb_Reviews.csv",
)

('IMDb_Reviews.csv', <http.client.HTTPMessage at 0x7f24835114e0>)

In [4]:
# Load the downloaded reviews into a DataFrame and preview the first five rows.
df = pd.read_csv("IMDb_Reviews.csv")
df.head(n=5)

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0
3,One of the most unheralded great works of anim...,1
4,"It was the Sixties, and anyone with long hair ...",0


In [5]:
# Number of rows (reviews) in the DataFrame.
df.shape[0]

50000

## 데이터 훈련, 테스트 분리

In [6]:
from torchtext import data  # import the torchtext.data module

## 필드 정의 ( 해당 값대로 데이터가 생성될 예정 )

In [7]:
# Field describing the review text: the raw string is lower-cased, split on
# whitespace with str.split, mapped through a vocabulary, and fixed to 20
# tokens per example, with the batch dimension first.
TEXT = data.Field(
    sequential=True,
    use_vocab=True,
    tokenize=str.split,
    lower=True,
    batch_first=True,
    fix_length=20,
)

# Field describing the sentiment label: a single non-sequential value that is
# used as-is (no vocabulary) and marked as the prediction target.
LABEL = data.Field(
    sequential=False,
    use_vocab=False,
    batch_first=False,
    is_target=True,
)

In [8]:
from torchtext.data import TabularDataset

In [9]:
# Parse the downloaded CSV into a torchtext dataset. Each (name, Field) pair
# binds a CSV column to an example attribute; the header row is skipped.
dataset = TabularDataset(
    path='IMDb_Reviews.csv',
    format='CSV',
    fields=[
        ('text', TEXT),
        ('label', LABEL),
    ],
    skip_header=True,
)

In [10]:
import random
# Seed used to make the train/test split reproducible.
SEED = 123

In [11]:
# Stratified 80/20 train/test split on the 'label' attribute.
#
# random.seed() returns None, so passing its result as `random_state` hands
# None to split(); the split was only reproducible because the global RNG had
# been seeded as a side effect. Seed first, then pass the seeded state
# explicitly so the intent is visible and does not depend on that fallback.
random.seed(SEED)
train_data, test_data = dataset.split(
    split_ratio=0.8,
    stratified=True,
    strata_field='label',
    random_state=random.getstate(),
)

In [12]:
# Report how many examples ended up in each split (one line per split).
print(
    '훈련 샘플의 개수 : {}'.format(len(train_data)),
    '테스트 샘플의 개수 : {}'.format(len(test_data)),
    sep='\n',
)

훈련 샘플의 개수 : 40000
테스트 샘플의 개수 : 10000


## 값 확인

In [13]:
# Show the attributes of the first training example as a dict.
print(train_data[0].__dict__)

{'text': ['if', 'you', 'only', 'see', 'one', 'ernest', 'movie', 'in', 'your', 'life,', 'make', 'it', 'this', 'one!', 'this', 'is', 'by', 'far', 'the', 'best', 'in', 'the', 'series,', 'with', 'its', 'nonstop', 'laughs', 'and', 'clever', 'humor', 'that', 'is', 'suitable', 'for', 'all', 'ages.', 'the', 'other', '"ernest"', 'flicks', 'were', 'good', 'too,', 'but', 'most', 'people', 'tend', 'to', 'get', 'tired', 'of', 'him', 'quickly', '(not', 'me,', 'however.).<br', '/><br', '/>in', 'this', 'movie,', 'ernest', 'p.', 'whorrel', 'is', 'assigned', 'jury', 'duty', 'for', 'a', 'murder', 'case.', 'the', 'murderer,', 'nash,', 'just', 'happens', 'to', 'look', 'exactly', 'like', 'our', 'bumbling', 'hero', 'ernest.', 'mr.', 'nash', 'finds', 'this', 'a', 'good', 'opportunity', 'to', 'escape', 'from', 'jail', 'by', 'knocking', 'him', 'out', 'switching', 'identities', 'with', 'him,', 'and', 'so', 'we', 'get', 'to', 'see', 'how', 'ernest', 'reacts', 'in', 'the', 'slammer.<br', '/><br', '/>a', 'great', '

In [14]:
# List the (attribute name, Field object) pairs attached to the training split.
print(train_data.fields.items())

dict_items([('text', <torchtext.data.field.Field object at 0x7f240b7d8b20>), ('label', <torchtext.data.field.Field object at 0x7f240b7da080>)])


# 단어 사전(Vocabulary) 만들기

In [15]:
# Build the TEXT vocabulary from the training split only.
TEXT.build_vocab(
    train_data,
    max_size=10000,  # maximum size of the vocabulary
    min_freq=10,     # minimum frequency a token needs to be included
)

In [16]:
# Vocabulary size. The recorded run reports 10002: max_size=10000 plus the
# <unk> and <pad> tokens that are added to the vocabulary.
print('단어 집합의 크기 : {}'.format(len(TEXT.vocab)))

단어 집합의 크기 : 10002


In [17]:
# stoi maps token string -> integer index. <unk> and <pad> are added to the
# vocabulary, which is why it holds 10002 entries. Only the first few pairs are
# printed: dumping the whole mapping floods the notebook output.
print(list(TEXT.vocab.stoi.items())[:10])



In [18]:
# itos is the inverse mapping: integer index -> token string. Preview the first
# few entries instead of printing the whole list, then look up three indices.
print(TEXT.vocab.itos[:10])
print(TEXT.vocab.itos[14])
print(TEXT.vocab.itos[194])
print(TEXT.vocab.itos[237])

was
things
world


# DataLoader 만들기

In [19]:
from torchtext.data import Iterator

In [20]:
# Number of examples per mini-batch for both loaders.
batch_size: int = 5

In [21]:
# Mini-batch iterators over the two splits.
# The training iterator keeps the Iterator defaults. The test iterator sets
# shuffle=False explicitly: Iterator defaults to train=True, which would
# otherwise shuffle the evaluation data as well and make its order
# non-deterministic between runs.
train_loader = Iterator(dataset=train_data, batch_size=batch_size)
test_loader = Iterator(dataset=test_data, batch_size=batch_size, shuffle=False)

In [22]:
# Disabled alternative kept as a bare string literal: nothing in it is executed,
# the cell only echoes the string as its output. It sketches building both
# iterators at once with data.BucketIterator.splits on an explicit device.
# NOTE(review): the snippet references `torch`, which none of the visible cells
# import — add the import before enabling it.
'''
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 32

train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),     # dataset
    sort=False,
    repeat=False,
    batch_size=BATCH_SIZE,       # 배치사이즈
    device=device)               # device 지정
'''

"\ndevice = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')\n\nBATCH_SIZE = 32\n\ntrain_iterator, test_iterator = data.BucketIterator.splits(\n    (train_data, test_data),     # dataset\n    sort=False,\n    repeat=False,\n    batch_size=BATCH_SIZE,       # 배치사이즈\n    device=device)               # device 지정\n"

In [23]:
# Report how many mini-batches each loader yields per pass (one line each).
print(
    '훈련 데이터의 미니 배치 수 : {}'.format(len(train_loader)),
    '테스트 데이터의 미니 배치 수 : {}'.format(len(test_loader)),
    sep='\n',
)

훈련 데이터의 미니 배치 수 : 8000
테스트 데이터의 미니 배치 수 : 2000


In [25]:
# Pull the first mini-batch out of the training loader for inspection.
train_batches = iter(train_loader)
batch = next(train_batches)

In [26]:
# Show which class the loader yields for a mini-batch.
print(batch.__class__)

<class 'torchtext.data.batch.Batch'>


In [27]:
# Numericalized text of the mini-batch. In the recorded output it is a tensor
# with one row per example (batch_size=5) and 20 token indices per row
# (fix_length=20, batch_first=True).
# NOTE(review): index 0 is presumably the <unk> token — confirm via TEXT.vocab.itos[0].
print(batch.text)

tensor([[  10,   24,   14,   30,   11,    9,   26, 5031,    6,   66,   17,   46,
          291,    9,   14, 1163,    6,  152,   12,   41],
        [ 618,  192,  309,   36,    5,    2,  857, 6241,  206,    4, 1064,   16,
            2,   99, 1307, 4736,   18,  395, 3919,   16],
        [   9,   61,  195,   33,  113,   32,    0, 2294,   51,   14,  166,  526,
           16,   12,   29,  374,    9,   81,  117,   78],
        [ 245,  321,    4, 1031,   65,  145,  169, 4090,   13, 1788,   25,   66,
            2,  148,  283,   25,   73,  228,   15,    0],
        [   9,  867,   10,   94,   39,  295,    2, 2332,  476, 4694,    8,    2,
            0,    4,    6,   88,   12,  199,   28,    3]])
