# # 1안
# 영문: 토큰화, 모든 단어를 기본형으로 변환, 유의미 품사 추출
# 한글: 형태소 분석, 유의미 형태소 추출
# 각 단어별로 장소 임베딩
# # 2안
# word2vec으로 모든 단어를 임베딩한 후,
# k-means clustering으로 군집화하여 군집별로 장소 임베딩

In [6]:
import numpy as np
import pandas as pd
import re
import os
import pickle
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import konlpy
from konlpy.tag import Mecab

nltk.data.path.append('F:\\소공전프로젝트\\sofcon\\recomm\\nltk_data')

list_csv = ['data/kor/attraction_review_tag.csv',
            'data/kor/hotel_review_tag.csv',
            'data/kor/restaurant_review_tag.csv',
            'data/eng/eng_attraction_review_tag.csv',
            'data/eng/eng_hotel_review_tag.csv',
            'data/eng/eng_restaurant_review_tag.csv']
list_corpus = ['corpus/attraction_tag.list',
               'corpus/hotel_tag.list',
               'corpus/restaurant_tag.list']
try:
    os.stat('corpus')
except:
    os.mkdir('corpus')

In [2]:
df = pd.read_csv(list_csv[0])
# filter charset exception
df['review'] = df['review'].apply(lambda x: re.sub(r'[^ 가-힣0-9.!?\n]',' ',x))
# make sentence list
array = df['review'].tolist()
array
df

Unnamed: 0,placeId,score,review
0,320359,5,가장 아름다운 궁이 아닐까 역사는 둘째하고라도 오랜 기와지붕과 아름드리 나무들속의...
1,320359,4,좋아요! 단풍시즌 한국 궁의 미를 충분히 느낄 수 있는 곳. 데이트하기 좋아요....
2,320359,4,아기자기항 멋이 있는 창덕궁 정말 오랜만에 창덕궁에 다녀왔습니다. 지난달에 경복궁...
3,320359,4,고즈넉한 궁궐 날씨만 좀 따뜻하다면 아기와 산책하기 좋은 곳입니다. 사진찍고 둘러...
4,320359,4,사진촬영 산수유는 이미 꽃이 피었고 목련은 다음주면 피겠더군요. 중국관광객이 없어...
5,320359,5,환상적인 야간의 고궁 서울 도심 야간에 고궁을 갈 수 있는 곳이다 특히 요즘 같...
6,320359,5,좋아요 날이 풀려서 그런지 사람이많지만 곳곳에 숨겨져 조용하게 앉아있을수있는곳도 ...
7,320359,5,달아래 거닐다. 달빛기행! 창덕궁 달빛기행은... 많은 말이 필요없을 정도이므로....
8,320359,5,너무나도 아름다운 우리의 궁 꽃들이 만개하는 4월이 창덕궁을 방문하는 가장 좋은 ...
9,320359,5,세계문화유산인 창덕궁 세계문화유산으로 내국인 뿐 아니라 외국인 방문도 많은 곳입니...


In [20]:
# 한글형태소 분리
mecab = Mecab()
list_pos = [mecab.pos(sentence) for sentence in array]
# 형태소 리스트화
morpheme = [mecab.morphs(sentence) for sentence in array]

In [8]:
# save
with open(list_morpheme[list_csv.index(list_csv[1])],'wb') as f:
    pickle.dump(morpheme, f)

[['가장',
  '아름다운',
  '궁',
  '이',
  '아닐까',
  '역사',
  '는',
  '둘째',
  '하',
  '고라도',
  '오랜',
  '기와지붕',
  '과',
  '아름드리',
  '나무',
  '들',
  '속',
  '의',
  '궁궐',
  '안',
  '을',
  '걷',
  '다',
  '보',
  '면',
  '과거',
  '와',
  '지금',
  '이',
  '하나',
  '로',
  '겹쳐지',
  '는',
  '것',
  '같',
  '아요',
  '설명',
  '도',
  '같이',
  '꼭',
  '들',
  '어',
  '보',
  '시',
  '고',
  '비오',
  '는',
  '날',
  '걸',
  '어',
  '보',
  '시',
  '길'],
 ['좋',
  '아요',
  '!',
  '단풍',
  '시즌',
  '한국',
  '궁',
  '의',
  '미',
  '를',
  '충분히',
  '느낄',
  '수',
  '있',
  '는',
  '곳',
  '.',
  '데이트',
  '하',
  '기',
  '좋',
  '아요',
  '.',
  '비원',
  '은',
  '단풍',
  '시즌',
  '에',
  '예약',
  '하',
  '기',
  '어려우',
  '니',
  '미리',
  '하',
  '는',
  '것',
  '이',
  '좋',
  '아요',
  '.'],
 ['아기자기',
  '항',
  '멋',
  '이',
  '있',
  '는',
  '창덕궁',
  '정말',
  '오랜만',
  '에',
  '창덕궁',
  '에',
  '다녀왔',
  '습니다',
  '.',
  '지난달',
  '에',
  '경복궁',
  '에',
  '갔',
  '을',
  '때',
  '는',
  '너무',
  '넓',
  '고',
  '공사',
  '중',
  '인',
  '데',
  '가',
  '많',
  '아서',
  '별로',
  '였',
  '는데',
  '창덕궁',
  '은',
  '

In [25]:
list_pos

[[('가장', 'MAG'),
  ('아름다운', 'VA+ETM'),
  ('궁', 'NNG'),
  ('이', 'JKC'),
  ('아닐까', 'VCN+EC'),
  ('역사', 'NNG'),
  ('는', 'JX'),
  ('둘째', 'NR'),
  ('하', 'VV'),
  ('고라도', 'EC'),
  ('오랜', 'MM'),
  ('기와지붕', 'NNG'),
  ('과', 'JC'),
  ('아름드리', 'NNG'),
  ('나무', 'NNG'),
  ('들', 'XSN'),
  ('속', 'NNG'),
  ('의', 'JKG'),
  ('궁궐', 'NNG'),
  ('안', 'NNG'),
  ('을', 'JKO'),
  ('걷', 'VV'),
  ('다', 'EC'),
  ('보', 'VX'),
  ('면', 'EC'),
  ('과거', 'NNG'),
  ('와', 'JC'),
  ('지금', 'NNG'),
  ('이', 'JKS'),
  ('하나', 'NR'),
  ('로', 'JKB'),
  ('겹쳐지', 'VV'),
  ('는', 'ETM'),
  ('것', 'NNB'),
  ('같', 'VA'),
  ('아요', 'EF'),
  ('설명', 'NNG'),
  ('도', 'JX'),
  ('같이', 'MAG'),
  ('꼭', 'MAG'),
  ('들', 'VV'),
  ('어', 'EC'),
  ('보', 'VX'),
  ('시', 'EP'),
  ('고', 'EC'),
  ('비오', 'VV'),
  ('는', 'ETM'),
  ('날', 'NNG'),
  ('걸', 'VV'),
  ('어', 'EC'),
  ('보', 'VX'),
  ('시', 'EP'),
  ('길', 'ETN+JKO')],
 [('좋', 'VA'),
  ('아요', 'EF'),
  ('!', 'SF'),
  ('단풍', 'NNG'),
  ('시즌', 'NNG'),
  ('한국', 'NNP'),
  ('궁', 'NNG'),
  ('의', 'JKG'),
  ('미', 'N

In [24]:
# 의미를 가지는 형태소만 추출
pattern = re.compile('MM|NNG|VA[+].*|VV[+].*|XR')
df_morpheme = pd.DataFrame(columns = ['placeId','tags'], dtype = 'int64')
taglist = []
for place in list_pos:
    tag = np.array(place)
    npbool = []
    for t in tag:
        npbool.append(re.fullmatch(pattern,t[1])!=None)
    tag = tag[npbool].tolist()
    taglist.append(tag)
df_morpheme['tags'] = taglist
df_morpheme['placeId'] = df['placeId'].astype('int64')
print(df_morpheme['tags'][0])

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

In [36]:
# 의미를 가지는 형태소만 추출
pattern = re.compile('MM|NNG|VA[+].*|VV[+].*|XR')
df_morpheme = pd.DataFrame(columns = ['placeId','tags'], dtype = 'int64')
taglist = []
for i in range(len(list_pos)):
    pairs = np.array(list_pos[i])
    tags = np.array(morpheme[i])
    npbool = []
    for pair in pairs:
        npbool.append(re.fullmatch(pattern,pair[1])!=None)
    tags = tags[npbool]
    taglist.append(tags)
df_morpheme['tags'] = taglist
df_morpheme['placeId'] = df['placeId'].astype('int64')
#print(df_morpheme['tags'][0])

In [37]:
df_morpheme

Unnamed: 0,placeId,tags
0,320359,"[아름다운, 궁, 역사, 오랜, 기와지붕, 아름드리, 나무, 속, 궁궐, 안, 과거..."
1,320359,"[단풍, 시즌, 궁, 미, 느낄, 데이트, 비원, 단풍, 시즌, 예약]"
2,320359,"[아기자기, 멋, 창덕궁, 오랜만, 창덕궁, 다녀왔, 지난달, 경복궁, 갔, 때, ..."
3,320359,"[고즈넉, 궁궐, 날씨, 따뜻, 아기, 산책, 사진, 한복, 무료입장, 행사, 했]"
4,320359,"[사진, 촬영, 산수유, 꽃, 목련, 다음, 관광객, 쾌적, 분위기, 즐길, 한복,..."
5,320359,"[환상, 야간, 고궁, 도심, 야간, 고궁, 갈, 곳, 날씨, 사람, 곳, 야간, ..."
6,320359,"[날, 풀려서, 그런지, 사람, 곳곳, 숨겨져, 조용, 곳, 시간, 맞춰, 무료, ..."
7,320359,"[달, 아래, 달빛, 기행, 창덕궁, 달빛, 기행, 말, 필요, 정도, 이런, 획득..."
8,320359,"[아름다운, 궁, 꽃, 만개, 창덕궁, 방문, 시기, 꽃길, 사색, 즐겨, 창경궁,..."
9,320359,"[세계, 문화유산, 창덕궁, 세계, 문화유산, 내국, 외국인, 방문, 별도, 예약,..."


In [None]:
for csv in list_csv:
    df = pd.read_csv(csv)
    #리뷰단위 분할
    text = ''
    text = text.join([review+'\n' for review in df['review']])
    text = re.sub(r'[^ 가-힣0-9\n]',' ',text)
    text = re.sub(r' +',' ',text)
    array = text.split('\n')[:]
    #형태소 분리
    from konlpy.tag import Mecab
    mecab = Mecab()
    tagged = [mecab.pos(sentence) for sentence in array]
    #corpus 저장
    import pickle
    corpus = [mecab.morphs(sentence) for sentence in array]
    with open(list_corpus[list_csv.index(list_csv[1])],'wb') as f:
        pickle.dump(corpus, f)

In [2]:
# pos태깅은 문장이 문맥을 구성해야 정확한 결과가 나옴
s = 'the quick brown fox jumps over the lazy dog'
token = nltk.word_tokenize(s)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  Searched in:
    - '/home/jahn/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/home/jahn/anaconda3/nltk_data'
    - '/home/jahn/anaconda3/lib/nltk_data'
    - 'F:\\소공전프로젝트\\sofcon\\recomm\\nltk_data'
    - ''
**********************************************************************


In [90]:
st = PorterStemmer()
lm = WordNetLemmatizer()
l = ['eat', 'ate', 'eating', 'eaten']
l2 = [st.stem(w) for w in l]
print(l2)
l3 = [lm.lemmatize(w, pos='v') for w in l2]
print(l3)
l4 = ['i', 'ate', 'a', 'grass','eating', 'chicken']
nltk.pos_tag(l4)

['eat', 'ate', 'eat', 'eaten']
['eat', 'eat', 'eat', 'eat']


[('i', 'NN'), ('ate', 'VBP'), ('a', 'DT'), ('eating', 'NN'), ('cow', 'NN')]

In [51]:
df = pd.read_csv(list_csv[3])
# filter charset exception
df['review'] = df['review'].apply(lambda x: re.sub(r'[^ a-zA-Z0-9.!?\n]',' ',x))
# make sentence list
array = df['review'].tolist()
array

[' The Secret Garden within Changdeokgung I have been to Changdeokgung and the Secret Garden twice now. The best times to go would be in the springtime when all the flowers are in full bloom and the garden is alive with bird song. Autumn is also amazing as the fall foliage looks splendid as it complements the palace buildings. It is one of the five grand palaces within Seoul and is conveniently located nearby other popular tourist attractions. There is a separate entrance fee for Changdeokgung palace and the Secret Garden which boasts pavilions  lily ponds  a stream as well as luscious vegetation. You may only enter the Secret Garden as part of an organised tour. The tours are in Korean  English  Japanese  Chinese. For more detailed information and for many photographs please have a look at my blog post                                           ',
 ' Secret Garden is worth a visit We visited the Secret Garden  and we had the most loveliest guide  Ms Yoon! We chatted a lots after the to