# # 1안
# 영문: 토큰화, 모든 단어를 기본형으로 변환, 유의미 품사 추출
# 한글: 형태소 분석, 유의미 형태소 추출
# 각 단어별로 장소 임베딩
# # 2안
# word2vec으로 모든 단어를 임베딩한 후,
# k-means clustering으로 군집화하여 군집별로 장소 임베딩

In [3]:
import numpy as np
import pandas as pd
import re
import os
import platform
import sys
import time
import pickle
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import konlpy
from konlpy.tag import Mecab

# When true, mkcorpus prints one progress line per processed word.
logging = True

# Extra search location for NLTK resources (machine-specific path).
nltk.data.path.append('F:\\소공전프로젝트\\sofcon\\recomm\\nltk_data')
# Review CSV inputs: indices 0-2 are the Korean files, 3-5 the English ones,
# each group ordered attraction, hotel, restaurant.
list_csv = ['data/kor/attraction_review_tag.csv',
            'data/kor/hotel_review_tag.csv',
            'data/kor/restaurant_review_tag.csv',
            'data/eng/eng_attraction_review_tag.csv',
            'data/eng/eng_hotel_review_tag.csv',
            'data/eng/eng_restaurant_review_tag.csv']
# Pickle output path per category, in the same attraction/hotel/restaurant order.
list_corpus = ['corpus/attraction_tag.list',
               'corpus/hotel_tag.list',
               'corpus/restaurant_tag.list']
# Create the output directory if it is missing. Unlike a bare `except:` around
# os.stat, this does not hide unrelated failures such as permission errors.
os.makedirs('corpus', exist_ok=True)
def orderset(seq):
    """Return the items of seq as a list without duplicates.

    Each item is kept at the position of its first occurrence, so the
    relative order of the input is preserved. Items must be hashable.
    """
    already_seen = set()
    unique_items = []
    for item in seq:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_items.append(item)
    return unique_items

# 한글

In [4]:
# Windows needs an explicit dictionary path; every other platform (Linux,
# and previously-unhandled ones such as macOS) uses the default location, so
# `mecab` is always defined for the cells below.
if platform.system() == 'Windows':
    mecab = Mecab(dicpath="C:\\mecab\\mecab-ko-dic")
else:
    mecab = Mecab()

In [5]:
# Category selector, used as the index into list_csv and list_corpus:
# 0 = attraction, 1 = hotel, 2 = restaurant.
test = 0

In [6]:
df = pd.read_csv(list_csv[test])
# Replace every character other than Hangul syllables, digits, space,
# '.', '!', '?' and newline with a space. Missing reviews are read as NaN,
# which re.sub cannot handle, so they are treated as empty text first.
df['review'] = df['review'].fillna('').apply(lambda x: re.sub(r'[^ 가-힣0-9.!?\n]',' ',x))
# One string per review.
array = df['review'].tolist()
# Analyse each review once: a list of (morpheme, POS tag) pairs per review.
list_pos = [mecab.pos(sentence) for sentence in array]
# Morpheme-only view, derived from the tagged pairs instead of running the
# analyser a second time.
morpheme = [[surface for surface, _ in pairs] for pairs in list_pos]

# Keep only morphemes whose POS tag is exactly MM, NNG or XR, or starts with
# "VA+" / "VV+" (presumably determiners, common nouns, roots and inflected
# adjectives/verbs in the mecab-ko tag set -- confirm against the tag table).
pattern = re.compile('MM|NNG|VA[+].*|VV[+].*|XR')
taglist = [
    [surface for surface, pos_tag in pairs if pattern.fullmatch(pos_tag) is not None]
    for pairs in list_pos
]
# One row per review: the place it belongs to and its filtered morphemes.
df_morpheme = pd.DataFrame(
    {'placeId': df['placeId'].astype('int64'), 'tags': taglist},
    columns=['placeId', 'tags'],
)

# Flatten all tags, then reduce to the vocabulary in first-seen order.
wordlist = [word for tags in df_morpheme['tags'] for word in tags]
wordset = orderset(wordlist)
print('전체', len(wordlist))
print('집합', len(wordset))

전체 246208
집합 13226


In [12]:
# 병렬처리를 위한 데이터 분할 
import multiprocessing as mp
from multiprocessing import Pool
core_count = mp.cpu_count()
wordsubset = np.array_split(wordset, core_count)

# corpus 생성함수
def mkcorpus(ws):
    for word in ws :
        places = []
        for i in range(len(df_morpheme)):
            if word in df_morpheme['tags'][i]:
                places.append(df_morpheme['placeId'][i])
        corpus.append(places)
        if logging == True:
            print('['+word+']: ',len(places),' places appended to the corpus')
        sys.stdout.flush()

In [None]:
start = time.time()

corpus = []
pool = Pool(core_count)
pool.map(mkcorpus, wordsubset)
pool.close()
pool.join()

print('Elapsed time: ', str(time.time() - start))
# save
with open(list_corpus[test],'wb') as f:
    pickle.dump(corpus, f)

# 영문

In [2]:
# POS tagging is only accurate when the input forms a sentence with context.
# NOTE: word_tokenize needs the NLTK 'punkt' resource; when it is not
# installed the call raises LookupError (fix: nltk.download('punkt')).
s = 'the quick brown fox jumps over the lazy dog'
token = nltk.word_tokenize(s)

LookupError: 
**********************************************************************
  Resource punkt not found.
  Please use the NLTK Downloader to obtain the resource:

  >>> import nltk
  >>> nltk.download('punkt')

  Searched in:
    - '/home/jahn/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/home/jahn/anaconda3/nltk_data'
    - '/home/jahn/anaconda3/lib/nltk_data'
    - 'F:\\소공전프로젝트\\sofcon\\recomm\\nltk_data'
    - ''
**********************************************************************


In [90]:
# Compare stemming and lemmatization on inflected forms of a single verb.
st = PorterStemmer()
lm = WordNetLemmatizer()
l = ['eat', 'ate', 'eating', 'eaten']
# Stem every form.
l2 = list(map(st.stem, l))
print(l2)
# Lemmatize the stems, treating each one as a verb.
l3 = list(map(lambda w: lm.lemmatize(w, pos='v'), l2))
print(l3)
# POS-tag an already tokenized sample; the result is the cell's display value.
l4 = ['i', 'ate', 'a', 'grass','eating', 'chicken']
nltk.pos_tag(l4)

['eat', 'ate', 'eat', 'eaten']
['eat', 'eat', 'eat', 'eat']


[('i', 'NN'), ('ate', 'VBP'), ('a', 'DT'), ('eating', 'NN'), ('cow', 'NN')]

In [51]:
# Load the English attraction reviews (index 3 of list_csv).
df = pd.read_csv(list_csv[3])
# Replace every character other than ASCII letters, digits, space, '.', '!',
# '?' and newline with a space. Missing reviews are read as NaN, which
# re.sub cannot handle, so they are treated as empty text first.
df['review'] = df['review'].fillna('').apply(lambda x: re.sub(r'[^ a-zA-Z0-9.!?\n]',' ',x))
# One string per review; the bare name below is the cell's display value.
array = df['review'].tolist()
array

[' The Secret Garden within Changdeokgung I have been to Changdeokgung and the Secret Garden twice now. The best times to go would be in the springtime when all the flowers are in full bloom and the garden is alive with bird song. Autumn is also amazing as the fall foliage looks splendid as it complements the palace buildings. It is one of the five grand palaces within Seoul and is conveniently located nearby other popular tourist attractions. There is a separate entrance fee for Changdeokgung palace and the Secret Garden which boasts pavilions  lily ponds  a stream as well as luscious vegetation. You may only enter the Secret Garden as part of an organised tour. The tours are in Korean  English  Japanese  Chinese. For more detailed information and for many photographs please have a look at my blog post                                           ',
 ' Secret Garden is worth a visit We visited the Secret Garden  and we had the most loveliest guide  Ms Yoon! We chatted a lots after the to