# word_embedding

In [3]:
import urllib.request
from gensim.models.word2vec import Word2Vec

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
train_data = pd.read_csv('./data/ratings_train.txt' ,sep='\t')
test_data = pd.read_csv('./data/ratings_test.txt', sep='\t')

In [14]:
train_data.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


## 전처리
* NaN값 제거
* 중복제거  
* 한글공백남기기 + 빈값제거
* 토큰화 + 불용어제거 + 빈값제거

### Nan값 제거

In [11]:
train_data.isnull().sum()

id          0
document    5
label       0
dtype: int64

In [13]:
train_data.dropna(inplace=True)
test_data.dropna(inplace=True)

### 중복값 제거

In [16]:
# 중복값 확인
train_data.duplicated(subset=['document']).sum()

3813

In [17]:
train_data.drop_duplicates(subset='document', inplace=True)
test_data.drop_duplicates(subset='document', inplace=True)

In [18]:
# 중복값 제거확인
train_data.duplicated(subset=['document']).sum()

0

### 한글 공백 남기기

In [36]:
import re
def clean_text(docs):
    text = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣ ]','',docs)
    text = re.sub('\s+',' ',text)
    return text

In [37]:
train_data['doc_clean'] = train_data['document'].apply(clean_text)
test_data['doc_clean'] = test_data['document'].apply(clean_text)

In [38]:
train_data.sample()

Unnamed: 0,id,document,label,doc_clean
36861,4962111,알파치노 연기 빼고는 볼 게 없다.,0,알파치노 연기 빼고는 볼 게 없다


In [43]:
# 빈값 확인 '', ' '
train_data[train_data['doc_clean']== '']

Unnamed: 0,id,document,label,doc_clean
584,7117896,1,0,
593,6478189,4,0,
638,9364602,4.444.444.444.444.44,0,
668,1600635,General,0,
1559,6918082,dsa,1,
...,...,...,...,...
146607,4602734,2,0,
147059,7089618,......,1,
148550,8588827,asdasdasdasd,0,
149358,6780491,wtf,0,


In [56]:
# 빈값제거
train_data = train_data[train_data['doc_clean']!= '']
train_data = train_data[train_data['doc_clean']!= ' ']

test_data = test_data[test_data['doc_clean']!= '']
test_data = test_data[test_data['doc_clean']!= ' ']

### 토큰화 + 불용어제거 

In [57]:
from konlpy.tag import Komoran
kom = Komoran()

In [58]:
stopword_df = pd.read_csv('./data/stopwords-ko.txt',header = None)
st_list = stopword_df[0].tolist()
st_list

['가',
 '가까스로',
 '가령',
 '각',
 '각각',
 '각자',
 '각종',
 '갖고말하자면',
 '같다',
 '같이',
 '개의치않고',
 '거니와',
 '거바',
 '거의',
 '것',
 '것과 같이',
 '것들',
 '게다가',
 '게우다',
 '겨우',
 '견지에서',
 '결과에 이르다',
 '결국',
 '결론을 낼 수 있다',
 '겸사겸사',
 '고려하면',
 '고로',
 '곧',
 '공동으로',
 '과',
 '과연',
 '관계가 있다',
 '관계없이',
 '관련이 있다',
 '관하여',
 '관한',
 '관해서는',
 '구',
 '구체적으로',
 '구토하다',
 '그',
 '그들',
 '그때',
 '그래',
 '그래도',
 '그래서',
 '그러나',
 '그러니',
 '그러니까',
 '그러면',
 '그러므로',
 '그러한즉',
 '그런 까닭에',
 '그런데',
 '그런즉',
 '그럼',
 '그럼에도 불구하고',
 '그렇게 함으로써',
 '그렇지',
 '그렇지 않다면',
 '그렇지 않으면',
 '그렇지만',
 '그렇지않으면',
 '그리고',
 '그리하여',
 '그만이다',
 '그에 따르는',
 '그위에',
 '그저',
 '그중에서',
 '그치지 않다',
 '근거로',
 '근거하여',
 '기대여',
 '기점으로',
 '기준으로',
 '기타',
 '까닭으로',
 '까악',
 '까지',
 '까지 미치다',
 '까지도',
 '꽈당',
 '끙끙',
 '끼익',
 '나',
 '나머지는',
 '남들',
 '남짓',
 '너',
 '너희',
 '너희들',
 '네',
 '넷',
 '년',
 '논하지 않다',
 '놀라다',
 '누가 알겠는가',
 '누구',
 '다른',
 '다른 방면으로',
 '다만',
 '다섯',
 '다소',
 '다수',
 '다시 말하자면',
 '다시말하면',
 '다음',
 '다음에',
 '다음으로',
 '단지',
 '답다',
 '당신',
 '당장',
 '대로 하다',
 '대하면',
 '대하여',
 '대해 말하자면',
 '대해서',
 '댕그',


In [59]:
kom.morphs(train_data['doc_clean'][0])

['아', '더빙', '진짜', '짜증', '나', '네요', '목소리']

In [60]:
def token_remove_st(docs):
    text = kom.morphs(docs)
    text = [w for w in text if w not in st_list]
    return text

In [61]:
# 2분 걸림 
train_data['doc_clean_remove'] = train_data['doc_clean'].apply(token_remove_st)
test_data['doc_clean_remove'] = test_data['doc_clean'].apply(token_remove_st)

In [62]:
train_data['doc_clean_remove']

0                                     [더빙, 진짜, 짜증, 네요, 목소리]
1             [흠, 포스터, 보고, 초딩, 영화, 줄, 오버, 연기, 가볍, 지, 않, 구나]
2                                       [너무재밓었다그래서보는것을추천한다]
3                  [교도소, 이야기, 구먼, 솔직히, 재미, 는, 없, 다, 평점, 조정]
4         [사이몬페그의, 익살, 스럽, ㄴ, 연기, 돋보이, 었, 던, 영화, 스파이더맨, ...
                                ...                        
149995                          [인간, 문제, 지, 소, 는, 뭔, 죄, ㄴ가]
149996                                      [평점, 너무, 낮, 아서]
149997       [게, 뭐, 요, 한국인, 은, 거들먹거리, 고, 필리핀, 혼혈, 은, 착하, 다]
149998                   [청춘 영화, 최고봉, 방황, 우울, 았, 던, 날, 자화상]
149999                 [한국, 영화, 최초, 수, 간, 는, 내용, 담기, ㄴ, 영화]
Name: doc_clean_remove, Length: 145393, dtype: object

In [63]:
tokenized_list = []
for token in train_data['doc_clean_remove']:
    tokenized_list.append(token)
    
tokenized_list

[['더빙', '진짜', '짜증', '네요', '목소리'],
 ['흠', '포스터', '보고', '초딩', '영화', '줄', '오버', '연기', '가볍', '지', '않', '구나'],
 ['너무재밓었다그래서보는것을추천한다'],
 ['교도소', '이야기', '구먼', '솔직히', '재미', '는', '없', '다', '평점', '조정'],
 ['사이몬페그의',
  '익살',
  '스럽',
  'ㄴ',
  '연기',
  '돋보이',
  '었',
  '던',
  '영화',
  '스파이더맨',
  '늙',
  '보이',
  '기',
  '만',
  '았',
  '던',
  '커스틴 던스트',
  '너무나',
  '도',
  '이뻐보였다'],
 ['막', '걸음마', '떼', 'ㄴ', '세', '초등학교', '학년', '생', 'ㄴ', '살용영홬ㅋㅋ별반개도', '아깝', 'ㅁ'],
 ['원작', '긴장감', '제대로', '살리', '내', '지', '못하', '았', '다'],
 ['별',
  '반개',
  '도',
  '아깝',
  '다',
  '욕',
  '나오',
  'ㄴ다',
  '이응경',
  '길용우',
  '연기',
  '생활',
  '인지',
  '정말',
  '발로',
  '아도',
  '그것',
  '보다',
  'ㄴ',
  '낫겟다',
  '납치',
  '감금',
  '만',
  '반복',
  '반복',
  '드라마',
  '는',
  '가족',
  '도',
  '없',
  '다',
  '연기',
  '못하',
  '는',
  '사람',
  '만',
  '모이',
  '엇'],
 ['액션', '없', '는데', '도', '재미', '있', '는', '안', '되', '는', '영화'],
 ['게',
  '평점',
  '낮',
  '은',
  '건',
  '데',
  '꽤',
  '보',
  'ㄹ',
  '만',
  'ㄴ데',
  '헐리우드',
  '식',
  '화려',
  'ㅁ',
  '만',
  '너무',
  '길들이',
  '지',
  '있

## gensim.models 의 Word2Vec 사용해보기

In [69]:
from gensim.models import Word2Vec

model = Word2Vec(sentences = tokenized_list,window = 5, min_count = 5, workers = 4, sg = 0)

In [70]:
model.wv.vectors.shape

(12862, 100)

In [71]:
print(model.wv.most_similar("최민식"))

[('최민수', 0.8777204155921936), ('공리', 0.8711440563201904), ('한석규', 0.8608067631721497), ('메릴', 0.8525327444076538), ('최지우', 0.8486995100975037), ('박신양', 0.8448445796966553), ('안성기', 0.8441044688224792), ('이정재', 0.8430457711219788), ('김창완', 0.8429348468780518), ('클루니', 0.842752993106842)]


# 사전 훈련된 word2vec사용 해보기

### 직접 다운

In [73]:
import gensim
import urllib.request

# 구글의 사전 훈련된 Word2Vec 모델을 로드. 페이지 사라짐 직접다운 받아야함
# urllib.request.urlretrieve("https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz", \
#                            filename="GoogleNews-vectors-negative300.bin.gz")

# 다운로드 경로 : https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [79]:
print(word2vec_model.vectors.shape)

(3000000, 300)


In [80]:
word2vec_model.similarity('this', 'is')

0.40797037

In [81]:
word2vec_model['book']

array([ 0.11279297, -0.02612305, -0.04492188,  0.06982422,  0.140625  ,
        0.03039551, -0.04370117,  0.24511719,  0.08740234, -0.05053711,
        0.23144531, -0.07470703,  0.21875   ,  0.03466797, -0.14550781,
        0.05761719,  0.00671387, -0.00701904,  0.13183594, -0.25390625,
        0.14355469, -0.140625  , -0.03564453, -0.21289062, -0.24804688,
        0.04980469, -0.09082031,  0.14453125,  0.05712891, -0.10400391,
       -0.19628906, -0.20507812, -0.27539062,  0.03063965,  0.20117188,
        0.17382812,  0.09130859, -0.10107422,  0.22851562, -0.04077148,
        0.02709961, -0.00106049,  0.02709961,  0.34179688, -0.13183594,
       -0.078125  ,  0.02197266, -0.18847656, -0.17480469, -0.05566406,
       -0.20898438,  0.04858398, -0.07617188, -0.15625   , -0.05419922,
        0.01672363, -0.02722168, -0.11132812, -0.03588867, -0.18359375,
        0.28710938,  0.01757812,  0.02185059, -0.05664062, -0.01251221,
        0.01708984, -0.21777344, -0.06787109,  0.04711914, -0.00

In [84]:
# 16 * 300

vocab_size = 16
embedding_matrix = np.zeros((vocab_size, 300)) # 0 으로 채우는 함수
embedding_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [85]:
# google w2v에서 해당 토큰의 vector값을 추출

def get_vector(word):
    if word in word2vec_model:
        return word2vec_model[word]
    else:
        return None

In [87]:
t.word_index

NameError: name 't' is not defined

In [None]:
for word, i in t.word_index.items():
    temp = get_vector(word)
    if temp is not None:
        embedding_matrix[i] = temp

In [None]:
embedding_matrix.shape

In [None]:
from tensorflow.keras.layers import Input

In [None]:
model = Sequential()
model.add(Input(shape=(4,), dtype='int32'))
e = Embedding(16,300, weights=[embedding_matrix], trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1,activation='sigmoid'))
model.summary()

In [None]:
model.compile(optimizer = 'adam', loss='binary_crossentropy', metrics= ['accuracy'])

In [None]:
model.fit

### import 방식

In [93]:
# pip install tensorflow_hub

In [92]:
# pip install tensorflow_datasets

In [102]:
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow as tf
import keras

In [96]:
# (train_data, validation_data, test_data).info =tfds.load('imdb_reviews', split= ['train[:80%]', 'train[80%:90%]', 'train[90%:]'],\
#                                                         with_info=True, as_supervised=True)

[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\bitcamp\tensorflow_datasets\imdb_reviews\plain_text\1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling C:\Users\bitcamp\tensorflow_datasets\imdb_reviews\plain_text\1.0.0.incompleteFHIXL4\imdb_reviews-tra…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling C:\Users\bitcamp\tensorflow_datasets\imdb_reviews\plain_text\1.0.0.incompleteFHIXL4\imdb_reviews-tes…

Generating unsupervised examples...: 0 examples [00:00, ? examples/s]

Shuffling C:\Users\bitcamp\tensorflow_datasets\imdb_reviews\plain_text\1.0.0.incompleteFHIXL4\imdb_reviews-uns…

[1mDataset imdb_reviews downloaded and prepared to C:\Users\bitcamp\tensorflow_datasets\imdb_reviews\plain_text\1.0.0. Subsequent calls will reuse this data.[0m


NameError: name 'validation_data' is not defined

In [97]:
train_data, validation_data, test_data = tfds.load(
    name="imdb_reviews", 
    split=('train[:60%]', 'train[60%:]', 'test'),
    as_supervised=True)

In [98]:
train_example_batch, train_labels_batch = next(iter(train_data.batch(10)))
train_example_batch

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.",
       b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell 

In [99]:
train_labels_batch

<tf.Tensor: shape=(10,), dtype=int64, numpy=array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0], dtype=int64)>

In [103]:
hub_layer = hub.KerasLayer("https://tfhub.dev/google/nnlm-en-dim50/2",
                           input_shape=[], dtype=tf.string)

model = keras.Sequential()
model.add(hub_layer)
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer_1 (KerasLayer)  (None, 50)                48190600  
                                                                 
 dense (Dense)               (None, 16)                816       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 48,191,433
Trainable params: 833
Non-trainable params: 48,190,600
_________________________________________________________________
