**문장 토큰화**

In [12]:
from nltk import sent_tokenize

# Three-sentence English sample; adjacent literals are concatenated into
# one string at compile time.
text_sample = (
    "Select your preferences and run the install command. "
    "Stable represents the most currently tested and supported version of PyTorch. "
    "This should be suitable for many users."
)

# Split the sample into sentences and show the resulting list.
tokenized_sentences = sent_tokenize(text_sample)
print(tokenized_sentences)

['Select your preferences and run the install command.', 'Stable represents the most currently tested and supported version of PyTorch.', 'This should be suitable for many users.']


**단어 토큰화**

In [13]:
from nltk import word_tokenize

# Break one plain English sentence into word tokens and display them.
sentence = "This book is for deep learning learners"

words = word_tokenize(sentence)
print(words)

['This', 'book', 'is', 'for', 'deep', 'learning', 'learners']


**아포스트로피가 포함된 문장에서 단어 토큰화**

In [14]:
from nltk.tokenize import WordPunctTokenizer

# Sentence with several contractions, used to show how this tokenizer
# treats apostrophes.
sentence = "it's nothing that you don't already know except most people aren't aware of how their inner world works."

punct_tokenizer = WordPunctTokenizer()
words = punct_tokenizer.tokenize(sentence)
print(words)

['it', "'", 's', 'nothing', 'that', 'you', 'don', "'", 't', 'already', 'know', 'except', 'most', 'people', 'aren', "'", 't', 'aware', 'of', 'how', 'their', 'inner', 'world', 'works', '.']


**라이브러리 호출 및 데이터셋 준비**

In [15]:
import csv
from konlpy.tag import Okt
from gensim.models import word2vec

# Read the tab-separated ratings file fully into memory as a list of rows.
# The `with` block closes the handle even when parsing raises part-way
# through, and newline="" lets the csv module do its own line-ending
# handling, as its documentation recommends.
with open(r"../pytorch/data/ratings_train.txt", "r", encoding="utf-8", newline="") as f:
    rdr = csv.reader(f, delimiter="\t")
    rdw = list(rdr)

**오픈 소스 한글 형태소 분석기 호출**

In [16]:
twitter = Okt()

# Tokens carrying any of these part-of-speech tags are left out.
excluded_tags = {"Josa", "Eomi", "Punctuation"}

# Only the first few processed rows are echoed, so the cell output stays
# readable on a large input.
PREVIEW_ROWS = 5

result = []

for line in rdw:
    # A blank or short row has no second column; indexing it would raise
    # IndexError and abort the whole pass, so skip it instead.
    if len(line) < 2:
        continue
    # Each entry of malist is indexed as (token, tag) below.
    malist = twitter.pos(line[1], norm=True, stem=True)
    r = [word[0] for word in malist if word[1] not in excluded_tags]
    rl = (" ".join(r)).strip()
    result.append(rl)
    if len(result) <= PREVIEW_ROWS:
        print(rl)

JVMNotFoundException: No JVM shared library file (jvm.dll) found. Try setting up the JAVA_HOME environment variable properly.

**형태소 저장**

In [None]:
# Write the processed lines to disk, separated by newlines (no trailing
# newline after the last entry).
with open("NaverMovie.nlp", mode="w", encoding="utf-8") as fp:
    joined_lines = "\n".join(result)
    fp.write(joined_lines)

**Word2Vec 모델 생성**

In [None]:
# Wrap the saved corpus file as gensim LineSentence input, train a
# Word2Vec model on it with the hyperparameters below, and save the model.
mData = word2vec.LineSentence("NaverMovie.nlp")
mModel = word2vec.Word2Vec(
    mData,
    vector_size=200,
    window=10,
    hs=1,
    min_count=2,
    sg=1,
)
mModel.save("NaverMovie.model")

**불용어 제거**

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the resources used below: the stop-word corpus and the punkt
# tokenizer data.
nltk.download("stopwords")
nltk.download("punkt")

sample_text = "Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should be suitable for many users."
text_tokens = word_tokenize(sample_text)

# Build the stop-word collection once, outside the comprehension; the
# original re-evaluated stopwords.words("english") for every token.
# A set also makes each membership check a hash lookup.
english_stopwords = set(stopwords.words("english"))

# Membership is an exact string comparison, so tokens are matched with
# their original casing.
tokens_without_sw = [word for word in text_tokens if word not in english_stopwords]
print("불용어 제거 미적용: ", text_tokens, "\n")
print("불용어 제거 적용: ", tokens_without_sw)

**포터 알고리즘**

In [None]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Related word pairs; each pair is stemmed and printed on its own line so
# the two stems can be compared side by side.
word_pairs = [
    ("obesses", "obssesed"),
    ("standardizes", "standardization"),
    ("national", "nation"),
    ("absentness", "absently"),
    ("tribalical", "tribalicalized"),
]

for first, second in word_pairs:
    print(stemmer.stem(first), stemmer.stem(second))

**랭커스터 알고리즘**

In [None]:
from nltk.stem import LancasterStemmer

stemmer = LancasterStemmer()

# Same word pairs as input; each line of output shows the two stems
# produced for one pair.
word_pairs = [
    ("obesses", "obssesed"),
    ("standardizes", "standardization"),
    ("national", "nation"),
    ("absentness", "absently"),
    ("tribalical", "tribalicalized"),
]

for first, second in word_pairs:
    print(stemmer.stem(first), stemmer.stem(second))

**표제어 추출**

In [None]:
import nltk
nltk.download("wordnet")

from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

# Every pair goes through the lemmatizer. The first line previously called
# stemmer.stem(), which printed stems instead of lemmas and depended on a
# `stemmer` object left over from an earlier cell.
print(lemma.lemmatize("obesses"), lemma.lemmatize("obssesed"))
print(lemma.lemmatize("standardizes"), lemma.lemmatize("standardization"))
print(lemma.lemmatize("national"), lemma.lemmatize("nation"))
print(lemma.lemmatize("absentness"), lemma.lemmatize("absently"))
print(lemma.lemmatize("tribalical"), lemma.lemmatize("tribalicalized"))

**품사 정보가 추가된 표제어 추출**

In [None]:
# Each entry holds two (word, part-of-speech tag) arguments; both are
# lemmatized with their tag and printed together on one line.
pos_tagged_pairs = [
    (("obesses", "v"), ("obssesed", "a")),
    (("standardizes", "v"), ("standardization", "n")),
    (("national", "a"), ("nation", "n")),
    (("absentness", "n"), ("absently", "r")),
    (("tribalical", "a"), ("tribalicalized", "v")),
]

for left, right in pos_tagged_pairs:
    print(lemma.lemmatize(*left), lemma.lemmatize(*right))