## 형태소 분석과 표제어 추출 

In [None]:
# 아나콘다로 spacy 설치
# !conda install -y -c conda-forge spacy

In [None]:
# pip 으로 spacy 설치 
# !pip install -U spacy

In [1]:
import pandas as pd

In [2]:
import spacy
# 영어 자연어처리 라이브러리 

In [3]:
# !python -m spacy download en_core_web_sm
# 영어 모형 다운로드 (spacy로 영어 형태소 분석을 진행하기위함)

In [4]:
nlp = spacy.load('en_core_web_sm')
# 언어별 모델을 불러옴 (영어 자연어처리 모델을 불러옴)

In [5]:
text = 'Wikipedia is maintained by volunteers'

In [6]:
doc = nlp(text)

In [7]:
for token in doc:
    print(token.text,
          token.lemma_, # 표제어
          token.pos_,  # 품사를 간략하게 보여줌
          token.tag_,  # 품사를 자세하게 보여줌 
          token.dep_,  # 문법적 의존 관게 주어냐 목적어냐
          token.is_stop # 불용어 처리되는 단어냐 일반적인 단어냐
         )

Wikipedia Wikipedia PROPN NNP nsubjpass False
is be AUX VBZ auxpass True
maintained maintain VERB VBN ROOT False
by by ADP IN agent True
volunteers volunteer NOUN NNS pobj False


In [8]:
spacy.explain('PROPN')
# 고유명사 

'proper noun'

품사: https://universaldependencies.org/u/pos/all.html
자세한 영어 품사: https://www.clips.uantwerpen.be/pages/mbsp-tags 
의존관계: https://universaldependencies.org/u/dep/index.html

## 특정 품사로 단어 문서 행렬 만들기 

In [19]:
# 명사와 동사만을 추출 하는 함수 구현
def extract_nv(text):
    words = []
    doc = nlp(text)
    for token in doc:
        if token.tag_[0] in 'NV':
            words.append(token.lemma_.lower())
    return words

In [20]:
extract_nv('Apple is a company.')

['apple', 'be', 'company']

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
cv = CountVectorizer(max_features=500, tokenizer=extract_nv)
# 우리가 만든 함수대로 추출, stop words는 안함 

In [24]:
df = pd.read_excel('imdb.xlsx', index_col=0)

In [29]:
df.head(5)

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [30]:
df

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
...,...,...
743,I just got bored watching Jessice Lange take h...,0
744,"Unfortunately, any virtue in this film's produ...",0
745,"In a word, it is embarrassing.",0
746,Exceptionally bad!,0


In [25]:
tdm = cv.fit_transform(df['review'])

In [26]:
wc = pd.DataFrame({
    '단어': cv.get_feature_names(), 
    '빈도': tdm.sum(axis=0).flat
})

In [28]:
wc.sort_values('빈도', ascending=False)
# stop_words를 안했기 때문에 be동사가 있음  

Unnamed: 0,단어,빈도
37,be,845
295,movie,211
158,film,189
5,0,137
193,have,133
...,...,...
207,home,2
206,hollywood,2
395,sentiment,2
396,sequel,2
