In [1]:
from Korpora import Korpora
import spacy

from torch.utils.data import Dataset

import pandas as pd

In [2]:
nsmc = Korpora.load('nsmc')


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at C:\Users\KDP-2\Korpora\nsmc\ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at C:\Users\KDP

In [3]:
nsmcDF = pd.DataFrame(nsmc.test)

nsmcDF.head()

Unnamed: 0,text,label
0,굳 ㅋ,1
1,GDNTOPCLASSINTHECLUB,0
2,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0


In [4]:
class TextDataset(Dataset):
    def __init__(self, feature, label):
        self.feature = feature
        self.label = label
        self.n_rows = feature.shape[0]

    def __len__(self):
        return self.n_rows
    
    def __getitem__(self, index):
        return self.feature.iloc[index], self.label.iloc[index]
    

In [5]:
nsmcDS = TextDataset(nsmcDF['text'], nsmcDF['label'])

In [6]:
for feature, label in nsmcDS:
    print(feature, label)
    break

굳 ㅋ 1


In [7]:
LANG_MODEL = 'ko_core_news_lg'

nlp = spacy.load(LANG_MODEL)

In [8]:
def generateToken(dataset):
    for text, label in dataset:
        token_list = []
        doc = nlp(text)
        
        for token in doc:
            if (not token.is_punct) and (not token.is_stop):
                token_list.append(str(token))
        yield token_list

In [19]:
token_generator = generateToken(nsmcDS)

i = 0
for token_list in token_generator:
    i += 1
    for token in token_list:
        print(repr(token))
    if i == 5:
        break

'굳'
'ㅋ'
'GDNTOPCLASSINTHECLUB'
'뭐야'
'평점들은'
'나쁘진'
'않지만'
'10점'
'짜리는'
'더더욱'
'아니잖아'
'지루하지는'
'않은데'
'완전'
'막장임'
'돈주고'
'보기에는'
'3D만'
'아니었어도'
'별'
'다섯'
'개'
'줬을텐데'
'왜'
'3D로'
'나와서'
'제'
'심기를'
'불편하게'
'하죠'


In [10]:
token_freqs = {}

for token_list in token_generator:
    for token in token_list:
        if token not in token_freqs:
            token_freqs[token] = 1
        else:
            token_freqs[token] += 1

In [11]:
i = 0
for _ in token_freqs:
    print(_)
    i += 1
    if i == 5:
        break

음악이
주가
된
최고의
음악영화


In [12]:
sorted_tokens = sorted(token_freqs.items(), key = lambda x: x[1], reverse = True)


In [13]:
sorted_tokens[4]

('그냥', 1182)

In [14]:
PAD_TOKEN, OOV_TOKEN = 'PAD', 'OOV'

vocab = {PAD_TOKEN : 0, OOV_TOKEN : 1}

In [15]:
for index, token in enumerate(sorted_tokens, 2):
    vocab[token[0]] = index

In [16]:
i = 0

for key in vocab:
    print(repr(key))
    i += 1
    if i == 5:
        break

'PAD'
'OOV'
'영화'
'너무'
'정말'


In [20]:
encoding_data = []

for token_list in token_generator:
    encoded = []
    for token in token_list:
        encoded.append(vocab[token])
    encoding_data.append(encoded)

encoding_data[:5]

[[800, 29207, 223, 17, 5625],
 [353, 48],
 [573, 29208, 29209, 29210, 29211, 7568, 29212],
 [233,
  29213,
  9247,
  511,
  16912,
  29214,
  3358,
  1662,
  29215,
  29216,
  26,
  1533,
  29217,
  13,
  18,
  29218],
 [11960, 29219, 11961, 426, 16913, 16914, 9248, 143, 254, 11962, 3359]]

In [21]:
data_length = [len(data) for data in encoding_data]

data_length
MAX_LENGTH = max(data_length)

MAX_LENGTH

38

In [22]:
for index, data in enumerate(encoding_data):
    current_length = len(data)
    if current_length < MAX_LENGTH:
        encoding_data[index] = data + ([0] * (MAX_LENGTH - current_length))

In [23]:
for data in encoding_data[:5]:
    print(len(data), data)

38 [800, 29207, 223, 17, 5625, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
38 [353, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
38 [573, 29208, 29209, 29210, 29211, 7568, 29212, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
38 [233, 29213, 9247, 511, 16912, 29214, 3358, 1662, 29215, 29216, 26, 1533, 29217, 13, 18, 29218, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
38 [11960, 29219, 11961, 426, 16913, 16914, 9248, 143, 254, 11962, 3359, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
