In [6]:
""" 단어를 특성 벡터로 변환하기 """
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Three toy documents; the third one repeats several words.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two',
])

# Learn the vocabulary and build the document-term count matrix in one step.
count = CountVectorizer()
bag = count.fit_transform(docs)

print(count.vocabulary_)  # mapping: token -> column index
print(bag.toarray())      # one row per document, one column per token

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}
[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [8]:
""" tf-idf로 단어 적합성 평가 """
from sklearn.feature_extraction.text import TfidfTransformer

# Settings spelled out explicitly: idf weighting, idf smoothing, L2 normalisation.
tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)

# Print arrays with two decimals (this alters NumPy's global print options).
np.set_printoptions(precision=2)

# Re-fit the count vectorizer on the documents, then re-weight the raw counts.
print(
    tfidf.fit_transform(count.fit_transform(docs)).toarray()
)

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [10]:
""" 리뷰 데이터 가져오기 """
import pandas as pd

# Load the review data set from the working directory.
df = pd.read_csv('movie_data.csv', encoding='utf-8')

# Peek at the last 50 characters of the first review.
df.loc[0, 'review'][-50:]

's film was made.<br /><br />A total waste of time.'

In [11]:
""" 텍스트 데이터 정제 """
import re
# 파이썬 정규 표현식 라이브러리

def preprocessor(text):
    """Clean one raw review string for bag-of-words processing.

    Steps:
      1. Strip HTML-like tags (a regex is a crude HTML parser; it is used
         here only for simplicity).
      2. Collect emoticons such as ``:)``, ``;-(`` or ``=D`` before the next
         step destroys them.
      3. Lower-case the text (capitalisation is assumed to carry no meaning)
         and collapse every run of non-word characters into a single space.
      4. Append the emoticons, with any ``-`` nose removed, after the text.

    Parameters
    ----------
    text : str
        Raw text, possibly containing markup.

    Returns
    -------
    str
        The cleaned text followed by the space-separated emoticons.
    """
    # Raw strings keep the backslashes for the regex engine and avoid the
    # "invalid escape sequence" warning that '\)' and '\W' trigger otherwise.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    words = re.sub(r'[\W]+', ' ', text.lower())
    faces = ' '.join(emoticons).replace('-', '')
    # Without a separator the first emoticon fuses with the last word whenever
    # the cleaned text does not already end in a space (':) good' -> ' good:)').
    if faces and not words.endswith(' '):
        words += ' '
    return words + faces
# HTML 파싱에 정규식은 비추천이나, 여기서는 그냥 사용
# 대문자 단어가 의미를 갖지 않는다고 가정

# Sanity checks: the tail of the first review, then a synthetic string that
# mixes markup and emoticons. Each result is printed on its own line.
print(
    preprocessor(df.loc[0, 'review'][-50:]),
    preprocessor('</a>This :) is :( a test :-)!'),
    sep='\n',
)

s film was made a total waste of time 
this is a test :) :( :)


In [12]:
""" 공백을 기준으로 문서를 토큰화 """

def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    tokens = text.split()
    return tokens


tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [3]:
""" 포터 어간 추출법으로 어간 추출하기 """
from nltk.stem.porter import PorterStemmer

# One shared stemmer instance, reused by every call to tokenizer_porter.
porter = PorterStemmer()


def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the stem of each token."""
    return list(map(porter.stem, text.split()))


tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [4]:
""" 불용어 제거하기 """
import nltk
from nltk.corpus import stopwords

# Make sure the stop-word corpus is available locally, then load the English list.
nltk.download('stopwords')
stop = stopwords.words('english')

# Filter stop words BEFORE stemming. The list holds unstemmed words, while the
# stemmer clips endings (e.g. 'thus' -> 'thu'), so a stemmed stop word may no
# longer match the list and would slip through. The comparison is lower-cased
# because the original filtered on the stemmer's output rather than the raw token.
[porter.stem(word)
 for word in 'a runner likes running and runs a lot'.split()
 if word.lower() not in stop]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Artyrie\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


['runner', 'like', 'run', 'run', 'lot']