In [4]:
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
import numpy as np

In [5]:
# Load the IMDb training split; the returned bunch exposes the raw documents
# as `.data` and the integer labels as `.target`.
reviews_train = load_files("./aclImdb/train")
text_train = reviews_train.data
y_train = reviews_train.target

# Same for the held-out test split.
reviews_test = load_files("./aclImdb/test")
text_test = reviews_test.data
y_test = reviews_test.target

In [11]:
# The documents are bytes objects, so the HTML line-break marker is replaced
# with a single space using bytes literals.
text_train = [review.replace(b"<br />", b" ") for review in text_train]
text_test = [review.replace(b"<br />", b" ") for review in text_test]

In [21]:
# Bag-of-words features. fit_transform learns the vocabulary and vectorizes the
# training reviews in a single pass, instead of fit() followed by a second
# tokenization pass over the same corpus in transform().
vect = CountVectorizer()
X_train = vect.fit_transform(text_train)
# The test reviews are encoded with the vocabulary learned from the training set.
X_test = vect.transform(text_test)

In [69]:
# Multinomial naive Bayes on the count features, with smoothing alpha=15.
# fit() returns the estimator itself, so construction and fitting can be chained.
nb = MultinomialNB(alpha=15).fit(X_train, y_train)
pre = nb.predict(X_test)

In [70]:
# Fraction of test reviews whose predicted label matches the true label.
ac_score = metrics.accuracy_score(y_true=y_test, y_pred=pre)
print(ac_score)

0.8


In [119]:
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.datasets import load_files
import numpy as np

In [120]:
# TF-IDF features. fit_transform learns the vocabulary/IDF weights and
# vectorizes the training reviews in a single pass, instead of fit() followed
# by a second tokenization pass over the same corpus in transform().
vect = TfidfVectorizer()
X_train = vect.fit_transform(text_train)
# The test reviews are encoded with the statistics learned from the training set.
X_test = vect.transform(text_test)

In [121]:
# Multinomial naive Bayes on the TF-IDF features, with smoothing alpha=0.5.
# fit() returns the estimator itself, so construction and fitting can be chained.
nb = MultinomialNB(alpha=0.5).fit(X_train, y_train)
pre = nb.predict(X_test)

In [123]:
# Fraction of test reviews whose predicted label matches the true label.
ac_score = metrics.accuracy_score(y_true=y_test, y_pred=pre)
print(ac_score)

0.807


In [5]:
import konlpy
import pandas as pd
import numpy as np

In [6]:
# The ratings files are tab-separated. keep_default_na=False stops pandas from
# converting strings such as "NA" or empty fields into NaN.
df_train = pd.read_csv(
    "./sample_data/ratings_train.txt",
    sep="\t",
    keep_default_na=False,
)
df_test = pd.read_csv(
    "./sample_data/ratings_test.txt",
    sep="\t",
    keep_default_na=False,
)

In [7]:
# Peek at the first five rows of each split (head() defaults to n=5).
print(df_train.head())
print(df_test.head())


         id                                           document  label
0   9976970                                아 더빙.. 진짜 짜증나네요 목소리      0
1   3819312                  흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나      1
2  10265843                                  너무재밓었다그래서보는것을추천한다      0
3   9045019                      교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정      0
4   6483659  사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...      1
        id                                           document  label
0  6270596                                                굳 ㅋ      1
1  9274899                               GDNTOPCLASSINTHECLUB      0
2  8544678             뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아      0
3  6825595                   지루하지는 않은데 완전 막장임... 돈주고 보기에는....      0
4  6723715  3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??      0


In [10]:
# Separate the review text from its label for each split.
text_train, y_train = df_train['document'], df_train['label']
text_test, y_test = df_test['document'], df_test['label']

# Report the number of reviews and the per-label counts of each split.
print(len(text_train), np.bincount(y_train))
print(len(text_test), np.bincount(y_test))


150000 [75173 74827]
50000 [24827 25173]


In [11]:
from konlpy.tag import Okt
# Create the Okt tagger instance that the tokenizer cells below call through
# the module-level name `twitter`.
# NOTE(review): the name `twitter` presumably reflects Okt's former name — confirm.
twitter = Okt()

-------------------------------------------------------------------------------
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False.  The legacy value of True was assumed for
please file a ticket with the developer.
-------------------------------------------------------------------------------

  """)


In [25]:
# Basic Okt calls: morpheme split, noun extraction, phrase extraction.
print(twitter.morphs(u"단독입찰보다 복수입찰의 경우"))
print(twitter.nouns("유일하게 항공기 체계 종합개발 경험을 갖고 있는 회사는"))
print(twitter.phrases("날카로운 분석과 신뢰감 있는 진행으로"))

# POS-tag the same sentence with progressively more options switched on.
sample = "이것도 되나욬ㅋㅋ?"
for pos_options in (
    {},
    {"norm": True},
    {"norm": True, "stem": True},
    {"norm": True, "stem": True, "join": True},
):
    print(twitter.pos(sample, **pos_options))



['단독', '입찰', '보다', '복수', '입찰', '의', '경우']
['항공기', '체계', '종합', '개발', '경험', '회사']
['날카로운 분석', '날카로운 분석과 신뢰감', '날카로운 분석과 신뢰감 있는 진행', '분석', '신뢰', '진행']
[('이', 'Determiner'), ('것', 'Noun'), ('도', 'Josa'), ('되나욬', 'Noun'), ('ㅋㅋ', 'KoreanParticle'), ('?', 'Punctuation')]
[('이', 'Determiner'), ('것', 'Noun'), ('도', 'Josa'), ('되나요', 'Verb'), ('ㅋㅋ', 'KoreanParticle'), ('?', 'Punctuation')]
[('이', 'Determiner'), ('것', 'Noun'), ('도', 'Josa'), ('되다', 'Verb'), ('ㅋㅋ', 'KoreanParticle'), ('?', 'Punctuation')]
['이/Determiner', '것/Noun', '도/Josa', '되다/Verb', 'ㅋㅋ/KoreanParticle', '?/Punctuation']


In [26]:
# Simple tokenizer function built on Okt's morphs()
def twitter_tokenizer(text):
    """Return the tokens produced by `twitter.morphs` for `text`."""
    tokens = twitter.morphs(text)
    return tokens

def twitter_tokenizer_filter(text):
    """Tokenize `text` with `twitter.pos(norm=True, stem=True)` and drop filtered tags.

    Returns the token strings, in order, of every (token, tag) pair whose tag is
    not one of "Josa", "Eomi", "Punctuation" or "KoreanParticle".
    """
    skipped_tags = ("Josa", "Eomi", "Punctuation", "KoreanParticle")
    tagged = twitter.pos(text, norm=True, stem=True)
    return [pair[0] for pair in tagged if pair[1] not in skipped_tags]

In [27]:
print(twitter_tokenizer("유일하게 항공기 체계 종합개발 경험을 갖고 있는 회사는"))

['유일하게', '항공기', '체계', '종합', '개발', '경험', '을', '갖고', '있는', '회사', '는']


In [29]:
print(twitter_tokenizer_filter("유일하게 항공기 체계 종합개발 경험을 갖고 있는 회사는"))

['유일하다', '항공기', '체계', '종합', '개발', '경험', '갖다', '있다', '회사']


In [30]:
import konlpy
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

from konlpy.tag import Okt
# Create the Okt tagger instance used by twitter_tokenizer_filter below via the
# module-level name `twitter`.
twitter = Okt()

def twitter_tokenizer_filter(text):
    """Tokenize `text` with `twitter.pos(norm=True, stem=True)` and drop filtered tags.

    Returns the token strings, in order, of every (token, tag) pair whose tag is
    not one of "Josa", "Eomi", "Punctuation" or "KoreanParticle".
    """
    skipped_tags = ("Josa", "Eomi", "Punctuation", "KoreanParticle")
    tagged = twitter.pos(text, norm=True, stem=True)
    return [pair[0] for pair in tagged if pair[1] not in skipped_tags]

In [33]:
# The ratings files are tab-separated. keep_default_na=False stops pandas from
# converting strings such as "NA" or empty fields into NaN.
df_train = pd.read_csv(
    "./sample_data/ratings_train.txt",
    sep="\t",
    keep_default_na=False,
)
df_test = pd.read_csv(
    "./sample_data/ratings_test.txt",
    sep="\t",
    keep_default_na=False,
)


In [34]:
# Separate the review text from its label for each split.
text_train, y_train = df_train['document'], df_train['label']
text_test, y_test = df_test['document'], df_test['label']

In [36]:
# Learn a bag-of-words vocabulary over the training reviews, tokenizing each
# review with twitter_tokenizer_filter instead of the default token pattern.
vect = CountVectorizer(tokenizer=twitter_tokenizer_filter)
vect = vect.fit(text_train)  # fit() returns the same vectorizer

In [50]:
# Encode the training reviews with the fitted vocabulary, then train a
# multinomial naive Bayes classifier with its default settings.
X_train = vect.transform(text_train)
clf_mult = MultinomialNB().fit(X=X_train, y=y_train)

In [55]:
# Encode the test reviews with the vocabulary fitted on the training reviews.
X_test = vect.transform(text_test)

In [56]:
# Predict a label for every test review with the trained classifier.
pre = clf_mult.predict(X_test)

In [57]:
# Fraction of test reviews whose predicted label matches the true label.
ac_score = metrics.accuracy_score(y_true=y_test, y_pred=pre)
print(ac_score)

0.8402


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Read the whole speech; the context manager closes the file handle once read.
with open("./sample_data/speech.txt", encoding = "ISO8859") as speech_file:
    text = speech_file.read()

# Build a word cloud from the raw text using the default settings.
wordcloud = WordCloud().generate(text)

print(type(wordcloud))

# The word frequencies are exposed as `words_`; the attribute `word_` does not
# exist and raised AttributeError.
print(type(wordcloud.words_))

In [None]:
# Show the word -> frequency mapping; the attribute is `words_`
# (`word_` does not exist and raised AttributeError).
print(wordcloud.words_)

In [None]:
# Rebuild the cloud with the largest font capped at 70. The keyword is
# `max_font_size`; `max_fontsize` is not an accepted argument and raised TypeError.
wordcloud = WordCloud(max_font_size = 70).generate(text)

In [None]:
# Open a new figure sized 16 x 9 inches.
# NOTE(review): with the inline backend, a figure created in its own cell may not
# carry over to the imshow cells below — confirm, and consider creating it in the
# same cell as the plot.
plt.figure(figsize = (16, 9))

In [None]:
# Draw the word cloud as an image on the current axes (default interpolation).
plt.imshow(wordcloud)

In [None]:
# Draw the word cloud with bilinear interpolation and hide the axes.
plt.imshow(wordcloud, interpolation = "bilinear")
plt.axis("off")
# Call show(); the bare attribute reference `plt.show` never invoked the
# function and only echoed its repr as the cell output.
plt.show()

In [None]:
# Image example
