# 영화 리뷰 워드 임베딩 (Word2Vec, FastText)
- gensim 라이브러리 사용 : pip install gensim
    - Word2Vec : models.Word2Vec
    - FastText : models.FastText

## 0. 데이터 준비
* 토큰화가 잘 되어 있는 filtered 데이터 사용

In [1]:
import pandas as pd

# Load the pre-tokenized ("filtered") movie review CSV and preview the first rows
data_filename = '../data/Korean_movie_reviews_2016_filtered.csv'
review_df = pd.read_csv(data_filename)
review_df.head()

Unnamed: 0,review,rate
0,아니 딴 그렇 비 비탄 총 대체 왜 들 온겨,7
1,진심 쓰레기 영화 만들 무서 알 쫄아 틀었 이건 뭐 웃 거리 없는 쓰레기 영화 임,1
2,역대 좀비 영화 가장 최고다 원작 만화 읽어 보려 영화 보고 결정 하려 감독 간츠 ...,10
3,온종일 불편한 피 범벅 일,6
4,답답함 극치 움직일 잇으 좀 움직여 어지간히 좀비 봣으 얼 타고 때려 잡 때 되 않냐,1


In [2]:
# Summarize the frame: row count, per-column non-null counts, and dtypes
review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 788189 entries, 0 to 788188
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   review  785448 non-null  object
 1   rate    788189 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 12.0+ MB


In [3]:
# Build one token list per review by splitting on whitespace.
# The `review` column is object-typed and has missing values (fewer non-null
# entries than rows), so drop them first: str(NaN) would otherwise turn every
# missing review into the bogus one-token sentence ['nan'] and add 'nan' to
# the embedding vocabulary.
review_list = review_df.review.dropna().astype(str).tolist()
corpus = [review.split() for review in review_list]
corpus[:5]

[['아니', '딴', '그렇', '비', '비탄', '총', '대체', '왜', '들', '온겨'],
 ['진심',
  '쓰레기',
  '영화',
  '만들',
  '무서',
  '알',
  '쫄아',
  '틀었',
  '이건',
  '뭐',
  '웃',
  '거리',
  '없는',
  '쓰레기',
  '영화',
  '임'],
 ['역대',
  '좀비',
  '영화',
  '가장',
  '최고다',
  '원작',
  '만화',
  '읽어',
  '보려',
  '영화',
  '보고',
  '결정',
  '하려',
  '감독',
  '간츠',
  '실사',
  '했',
  '사람',
  '거르려',
  '그냥',
  '봤',
  '정말',
  '흠잡',
  '없는',
  '최고',
  '좀비',
  '영화',
  '잔인',
  '거',
  '싫어하지',
  '참고',
  '볼',
  '만하',
  '로미',
  '인물',
  '왜',
  '그런',
  '모르'],
 ['온종일', '불편한', '피', '범벅', '일'],
 ['답답함',
  '극치',
  '움직일',
  '잇으',
  '좀',
  '움직여',
  '어지간히',
  '좀비',
  '봣으',
  '얼',
  '타고',
  '때려',
  '잡',
  '때',
  '되',
  '않냐']]

## 1. Word2Vec 활용 영화 리뷰 워드 임베딩
* https://radimrehurek.com/gensim/models/word2vec.html

### Skipgram, negative=10 인 경우

In [4]:
from gensim.models import Word2Vec

# Create and train a skip-gram (sg=1) Word2Vec model with 10 negative samples
model_sg_n10 = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=3,
    min_count=3,
    sg=1,
    negative=10,
)

In [5]:
# Look up the embedding vector learned for a single word
model_sg_n10.wv['이정재']

array([-0.2857495 ,  0.2041555 ,  0.00964935, -0.3466797 , -0.12995438,
       -0.16521463,  0.08838039,  0.32900947,  0.8577018 ,  0.40441948,
       -0.11653413, -0.22829176, -0.3088963 ,  0.3411743 , -0.12888728,
       -0.03756718, -0.1297913 , -0.40257752,  0.23977976, -0.3157538 ,
        0.19983093,  0.1197575 ,  0.37147328,  0.05561617, -0.3811317 ,
       -0.21775424,  0.39606118, -0.07680409,  0.15875994,  0.17253746,
       -0.15413189, -0.03328072,  0.0358777 , -0.2429904 ,  0.11532903,
       -0.63700235, -0.06543942,  0.3125222 , -0.3854036 , -0.17876723,
       -0.65203285, -0.07850089, -0.01915997, -0.04932186, -0.06416125,
        0.43478614,  0.12140304,  0.43182498, -0.05035996,  0.34718275,
       -0.4561823 , -0.39909023, -0.03146241, -0.22521956,  0.39830002,
        0.1359353 , -0.07153057, -0.49451658,  0.42327765,  0.23935264,
       -0.86361855, -0.25209746,  0.05516429, -0.24055849, -0.7592467 ,
       -0.04165914,  0.01918496, -0.5590329 , -0.18699469, -0.13

In [6]:
# Check the dimensionality of a word's embedding vector
len(model_sg_n10.wv['이정재'])

100

In [7]:
# Check the similarity score between two words
model_sg_n10.wv.similarity('이정재', '정우성')

0.7543214

In [8]:
# Retrieve the 20 words most similar to a given word
model_sg_n10.wv.most_similar('이정재', topn=20)

[('송강호', 0.8218687772750854),
 ('이범수', 0.8198887705802917),
 ('공유', 0.8027775883674622),
 ('김범수', 0.7615217566490173),
 ('정우성', 0.7543214559555054),
 ('이병헌', 0.7448803186416626),
 ('주지훈', 0.7351187467575073),
 ('이성민', 0.7343079447746277),
 ('리암', 0.7237441539764404),
 ('김윤석', 0.7197758555412292),
 ('이진욱', 0.7184373140335083),
 ('황정민', 0.7164900898933411),
 ('조재현', 0.715961754322052),
 ('박해일', 0.7154378890991211),
 ('김남길', 0.7152894139289856),
 ('김명민', 0.7142781615257263),
 ('요한', 0.7118542194366455),
 ('유준상', 0.710614800453186),
 ('윤제문', 0.7049500942230225),
 ('슨', 0.6999393105506897)]

In [9]:
# Retrieve the 20 words most similar to '재밌'
model_sg_n10.wv.most_similar('재밌', topn=20)

[('재미있', 0.9083648324012756),
 ('재밋음', 0.8342860341072083),
 ('재밌네', 0.8229107856750488),
 ('재밌었', 0.8205946683883667),
 ('재밌어', 0.809160590171814),
 ('잼남', 0.804125189781189),
 ('재밋엇음', 0.793635368347168),
 ('잼슴', 0.7764171957969666),
 ('재밋었음', 0.7763556838035583),
 ('재미있었', 0.7761563658714294),
 ('재밌아', 0.7741618752479553),
 ('쟈밋', 0.7715330719947815),
 ('재밋어용', 0.7704077959060669),
 ('재밋네', 0.7691944241523743),
 ('재밋엇', 0.7673618197441101),
 ('재밋엇어용', 0.7659081816673279),
 ('엇', 0.7608101963996887),
 ('재밋어', 0.758557915687561),
 ('재미나다', 0.7583403587341309),
 ('재밋네용', 0.7502185702323914)]

### Skipgram, negative=5 인 경우

In [10]:
# Create and train a skip-gram (sg=1) Word2Vec model with 5 negative samples
model_sg_n5 = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=3,
    min_count=3,
    sg=1,
    negative=5,
)

In [11]:
# Retrieve the 20 words most similar to a given word: '이정재'
model_sg_n5.wv.most_similar('이정재', topn=20)

[('이범수', 0.8193183541297913),
 ('공유', 0.813062310218811),
 ('송강호', 0.8126512765884399),
 ('김범수', 0.7787740230560303),
 ('이병헌', 0.7603065371513367),
 ('정우성', 0.7425631880760193),
 ('이성민', 0.7301666140556335),
 ('곽도원', 0.7247475981712341),
 ('리암', 0.7206172943115234),
 ('김윤석', 0.7179773449897766),
 ('조재현', 0.7171250581741333),
 ('김남길', 0.7130842804908752),
 ('마동석', 0.7120956182479858),
 ('박해일', 0.7115474343299866),
 ('요한', 0.7110087275505066),
 ('김성균', 0.7021018266677856),
 ('정진영', 0.7001343965530396),
 ('김명민', 0.6957204341888428),
 ('송광호', 0.6954960227012634),
 ('이진욱', 0.6942128539085388)]

In [12]:
# Retrieve the 20 words most similar to a given word: '재밌'
model_sg_n5.wv.most_similar('재밌', topn=20)

[('재미있', 0.8888339996337891),
 ('재밋음', 0.8320459127426147),
 ('재밌네', 0.8228422999382019),
 ('재밌어', 0.8108429312705994),
 ('잼남', 0.8098973035812378),
 ('재밌었', 0.8012561798095703),
 ('재밋엇음', 0.7839922904968262),
 ('재밋네용', 0.7740107178688049),
 ('재밋어용', 0.7734435796737671),
 ('쟈밋', 0.7696585655212402),
 ('잼슴', 0.7677738070487976),
 ('재밋네', 0.7668101191520691),
 ('재밋엇어용', 0.7590724229812622),
 ('재미있었', 0.7587430477142334),
 ('재밋엇', 0.7573145627975464),
 ('재밋었음', 0.7566503286361694),
 ('재밋게봣습니', 0.755557656288147),
 ('재밋었어', 0.752659022808075),
 ('재밋어', 0.7525746822357178),
 ('재밌슴', 0.752352774143219)]

### CBOW, negative=10 인 경우

In [13]:
# Create and train a CBOW (sg=0) Word2Vec model with 10 negative samples
model_cbow_n10 = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=3,
    min_count=3,
    sg=0,
    negative=10,
)

In [14]:
# Retrieve the 20 words most similar to '이정재'
model_cbow_n10.wv.most_similar('이정재', topn=20)

[('이범수', 0.792064905166626),
 ('김윤석', 0.7613535523414612),
 ('공유', 0.7481589317321777),
 ('조재현', 0.7372958064079285),
 ('이성민', 0.7174745202064514),
 ('송강호', 0.706345796585083),
 ('김범수', 0.7003628611564636),
 ('이진욱', 0.6988153457641602),
 ('주지훈', 0.687874436378479),
 ('김남길', 0.6671895384788513),
 ('박해일', 0.6645047068595886),
 ('김성균', 0.6630656719207764),
 ('김성오', 0.6574780344963074),
 ('남자배우', 0.6424132585525513),
 ('정우성', 0.6420602798461914),
 ('민호', 0.637634813785553),
 ('윌스미스', 0.6373593211174011),
 ('곽도원', 0.6369593739509583),
 ('마동석', 0.6354605555534363),
 ('하정우', 0.6341755390167236)]

In [15]:
# Retrieve the 20 words most similar to '재밌'
model_cbow_n10.wv.most_similar('재밌', topn=20)

[('재미있', 0.903179407119751),
 ('재밌네', 0.8119001984596252),
 ('재밋음', 0.7982114553451538),
 ('재밌어', 0.7977094054222107),
 ('재밌었', 0.7758068442344666),
 ('잼남', 0.7126374840736389),
 ('재밌는', 0.7050560712814331),
 ('재미있었', 0.7048016786575317),
 ('재밋어', 0.6967854499816895),
 ('재밋엇어', 0.6954159140586853),
 ('재미있네', 0.6935418844223022),
 ('재밌던', 0.6866356730461121),
 ('재밌더', 0.6818627119064331),
 ('재미있어', 0.6699695587158203),
 ('재밋네', 0.667828381061554),
 ('재밋었', 0.6546658873558044),
 ('재밋엇', 0.6468985080718994),
 ('재밌다', 0.6430769562721252),
 ('재밋', 0.6418989300727844),
 ('재밌고', 0.6347743272781372)]

### CBOW, negative=5 인 경우

In [16]:
# Create and train a CBOW (sg=0) Word2Vec model with 5 negative samples
model_cbow_n5 = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=3,
    min_count=3,
    sg=0,
    negative=5,
)

In [17]:
# Retrieve the 20 words most similar to '이정재'
model_cbow_n5.wv.most_similar('이정재', topn=20)

[('이범수', 0.795637309551239),
 ('공유', 0.7630905508995056),
 ('김윤석', 0.755340039730072),
 ('송강호', 0.7312877774238586),
 ('조재현', 0.7231386303901672),
 ('이성민', 0.7068467736244202),
 ('주지훈', 0.6940118074417114),
 ('김범수', 0.6875907778739929),
 ('김남길', 0.6687005162239075),
 ('박해일', 0.6651220321655273),
 ('황정민', 0.6613084673881531),
 ('이진욱', 0.6607698202133179),
 ('곽도원', 0.6556199789047241),
 ('정우성', 0.6545529961585999),
 ('김성오', 0.6545476913452148),
 ('마동석', 0.6520127654075623),
 ('이병헌', 0.6480023860931396),
 ('엄지원', 0.6367975473403931),
 ('송광호', 0.6345449686050415),
 ('강예원', 0.6295062899589539)]

In [18]:
# Retrieve the 20 words most similar to '재밌'
model_cbow_n5.wv.most_similar('재밌', topn=20)

[('재미있', 0.9073307514190674),
 ('재밌네', 0.8120021224021912),
 ('재밌어', 0.8061206936836243),
 ('재밋음', 0.8056749701499939),
 ('재밌었', 0.7961089611053467),
 ('재밋어', 0.7521906495094299),
 ('재밌는', 0.724742591381073),
 ('재미있었', 0.7100330591201782),
 ('재미있네', 0.707988977432251),
 ('잼남', 0.7016727924346924),
 ('재미있어', 0.6956740021705627),
 ('재밌더', 0.6938090920448303),
 ('재밋엇어', 0.6850131750106812),
 ('재밌던', 0.6773744821548462),
 ('재밋네', 0.676638126373291),
 ('재밋', 0.673794150352478),
 ('꿀잼', 0.6584763526916504),
 ('재밌다', 0.6577560901641846),
 ('재밌고', 0.6382424831390381),
 ('재미있는', 0.6287941336631775)]

### OOV(Out of Vocabulary) 문제

In [19]:
# Check whether a word absent from the corpus is in the model vocabulary: '우주평화'
'우주평화' in model_sg_n10.wv.key_to_index

False

In [20]:
# Request the embedding vector of a word absent from the corpus.
# This intentionally demonstrates the OOV problem: the lookup raises KeyError.
model_sg_n10.wv['우주평화']

KeyError: "Key '우주평화' not present"

## 2. FastText 활용 영화 리뷰 워드 임베딩
* https://radimrehurek.com/gensim/models/fasttext.html

In [21]:
from gensim.models import FastText

# Create and train a skip-gram FastText model.
# min_n=2 and max_n=2 set both the minimum and maximum character n-gram length to 2.
ft_model = FastText(
    sentences=corpus,
    vector_size=100,
    window=3,
    min_count=3,
    min_n=2,
    max_n=2,
    sg=1,
    negative=10,
)

In [22]:
# Look up the FastText embedding vector for a word: '이정재'
ft_model.wv['이정재']

array([ 0.13708378,  0.30764937, -0.15965608, -0.2698747 , -0.04226875,
       -0.25096738, -0.04463465,  0.47608852,  0.34574524,  0.19094113,
        0.04722432, -0.17278592, -0.2767665 , -0.22276655, -0.15523256,
       -0.2834874 ,  0.16963762, -0.16402194,  0.41629967, -0.31753278,
        0.30333862, -0.28285235,  0.29092914, -0.2030386 , -0.01484248,
        0.0473625 , -0.01590259, -0.16221857, -0.4384505 , -0.39747345,
        0.07903232, -0.79252404, -0.09785138,  0.13841936, -0.24198692,
       -0.3872281 ,  0.54353267, -0.09739904, -0.16731016, -0.20034511,
       -0.4805368 ,  0.21866839, -0.3711434 , -0.14939706, -0.47583047,
       -0.0424248 , -0.2685072 , -0.47348064,  0.1360595 , -0.17123343,
        0.06419163,  0.02568406, -0.04599131,  0.14118192, -0.01177628,
        0.0976951 ,  0.2456667 ,  0.14290363, -0.23380432,  0.28686205,
        0.20939605, -0.14580765, -0.02359019,  0.47408342, -0.4562382 ,
       -0.45987687,  0.32690346,  0.05728964,  0.21540272, -0.07

In [23]:
# Check that '우주평화' is absent from the FastText model's own vocabulary.
# Query ft_model (not the earlier Word2Vec model) so the cells that follow
# demonstrate FastText handling a word that is OOV for this very model.
# key_to_index holds only full words seen in training, which is what we want here.
'우주평화' in ft_model.wv.key_to_index

False

In [24]:
# Request the embedding vector of a word absent from the corpus.
# FastText still returns a vector here instead of raising KeyError.
ft_model.wv['우주평화']

array([ 0.32174474,  0.45174137,  0.08929022,  0.25667778,  0.00226156,
        0.04569783, -0.13544318,  0.7795619 ,  0.22506213,  0.34754452,
       -0.10063794,  0.24980664, -0.00204272,  0.33194882, -0.13981727,
       -0.32554266, -0.1674215 , -0.2175651 ,  0.02366865, -0.11116955,
        0.11157451, -0.13412687, -0.01910002, -0.0707339 , -0.11726172,
        0.07448151, -0.49615225, -0.2757396 , -0.3868943 , -0.35709667,
       -0.01217   , -0.21547265,  0.07053582, -0.32219264, -0.23515525,
        0.17398258,  0.20779948,  0.36468774, -0.23442431, -0.09363831,
       -0.29907554, -0.02749879, -0.15076372,  0.14306039, -0.465671  ,
        0.04325005, -0.05301975,  0.1010979 , -0.06514391, -0.10004131,
        0.07391336,  0.28662544, -0.09332794, -0.13324013, -0.22315197,
       -0.15534155,  0.24439009,  0.4097786 , -0.12707497,  0.26891786,
        0.241804  ,  0.27323794, -0.40596947, -0.10155137, -0.26076978,
       -0.31891817,  0.13783519, -0.15455651,  0.21030259,  0.14

In [25]:
# Retrieve the words most similar to a word absent from the corpus
ft_model.wv.most_similar('우주평화')

[('우주', 0.8226885795593262),
 ('평화', 0.8175219297409058),
 ('우주비행사', 0.7991693615913391),
 ('우주인', 0.7968124747276306),
 ('회색곰', 0.7867504954338074),
 ('우주선', 0.7830933928489685),
 ('전투복', 0.7828541994094849),
 ('븨', 0.7821066975593567),
 ('우방', 0.7814499735832214),
 ('켤', 0.775827944278717)]