# FastText

### gensim FastText

In [1]:
from gensim.models import FastText
from lxml import etree
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

In [None]:
# 데이터 오픈 및 전처리
f = open('ted_en.xml', 'r', encoding='utf-8')
xml = etree.parse(f)    # xml 파싱

contents = xml.xpath('//content/text()')
corpus = '\n'.join(contents)
corpus = re.sub(r'\([^)]*\)', '', corpus)

sentences = sent_tokenize(corpus)

preprocessed_sentences = []
en_stopwords = stopwords.words('english')

for sentence in sentences:
    sentence = sentence.lower()
    sentence = re.sub(r'[^a-z0-9]', ' ', sentence)
    tokens = word_tokenize(sentence)
    tokens = [token for token in tokens if token not in en_stopwords]
    preprocessed_sentences.append(tokens)

In [3]:
from gensim.models import Word2Vec

w2v_model = Word2Vec(
    sentences=preprocessed_sentences,
    vector_size=100,
    window=5,
    min_count=5,
    sg=0
)

In [4]:
w2v_model.wv.vectors.shape

(21462, 100)

In [7]:
w2v_model.wv.most_similar('father')
# w2v_model.wv.most_similar('luckfather')    
# KeyError: "Key 'luckyfather' not present in vocabulary" == OOV 이슈

[('mother', 0.933303952217102),
 ('son', 0.9331583380699158),
 ('daughter', 0.9148882031440735),
 ('husband', 0.9036911725997925),
 ('sister', 0.8772664666175842),
 ('grandmother', 0.8598803877830505),
 ('grandfather', 0.8554475903511047),
 ('brother', 0.8532534837722778),
 ('woman', 0.8451687097549438),
 ('uncle', 0.8434945344924927)]

In [8]:
fasttext_model = FastText(
    sentences=preprocessed_sentences,
    vector_size=100,
    window=5,
    min_count=5,
    sg=0
)

In [9]:
fasttext_model.wv.vectors.shape

(21462, 100)

In [None]:
fasttext_model.wv.most_similar('father')
fasttext_model.wv.most_similar('luckyfather')   # OOV 이슈 발생하지 않음

[('father', 0.9756500124931335),
 ('godfather', 0.953387439250946),
 ('grandfather', 0.9303913712501526),
 ('mother', 0.9258390069007874),
 ('grandmother', 0.9222261905670166),
 ('brother', 0.9178158044815063),
 ('luther', 0.9113094806671143),
 ('bother', 0.8874163031578064),
 ('slaughter', 0.8810012936592102),
 ('daughter', 0.8673279285430908)]

In [18]:
# OOV 단어도 subword 기반으로 검색해 vector 반환
fasttext_model.wv['luckyfather']
# fasttext_model.wv['father']

array([ 7.42126182e-02, -6.19854152e-01, -4.47496414e-01,  6.21159256e-01,
        2.65680254e-01,  7.94290662e-01, -7.26973712e-01, -4.75499213e-01,
        2.04840705e-01, -9.01607201e-02,  5.56952842e-02,  3.09769243e-01,
       -3.71722758e-01,  5.53401470e-01, -2.95425683e-01, -2.59065539e-01,
        6.91350773e-02,  2.96072483e-01, -2.65974998e-01, -2.54933648e-02,
       -7.96571016e-01, -7.48857036e-02,  1.63403079e-01, -7.81512141e-01,
       -4.23338324e-01,  4.07506339e-02, -2.90984452e-01, -2.59336859e-01,
        1.44652069e-01, -5.05605221e-01,  2.41197087e-03, -5.74520528e-01,
        5.92765808e-01, -6.34025596e-03,  3.23237896e-01,  6.04807854e-01,
       -1.71439275e-02,  3.02604437e-01,  4.18252081e-01,  2.02477261e-01,
       -1.60819232e-01,  9.57527012e-02,  2.41008863e-01,  4.25204495e-03,
        1.14833534e-01,  8.42388272e-01,  1.40291885e-01, -6.61330402e-01,
        3.73102039e-01,  6.12764001e-01,  3.81614966e-03,  1.89044178e-01,
        4.05599743e-01, -

### fasttext 패키지 활용

In [13]:
!pip install fasttext-wheel

Collecting fasttext-wheel
  Downloading fasttext_wheel-0.9.2-cp312-cp312-win_amd64.whl.metadata (16 kB)
Collecting pybind11>=2.2 (from fasttext-wheel)
  Downloading pybind11-3.0.0-py3-none-any.whl.metadata (10.0 kB)
Downloading fasttext_wheel-0.9.2-cp312-cp312-win_amd64.whl (234 kB)
Downloading pybind11-3.0.0-py3-none-any.whl (292 kB)
Installing collected packages: pybind11, fasttext-wheel

   ---------------------------------------- 0/2 [pybind11]
   ---------------------------------------- 0/2 [pybind11]
   -------------------- ------------------- 1/2 [fasttext-wheel]
   ---------------------------------------- 2/2 [fasttext-wheel]

Successfully installed fasttext-wheel-0.9.2 pybind11-3.0.0


In [14]:
import fasttext

model = fasttext.train_unsupervised(
    'naver_movie_ratings.txt',
    model='skipgram',
    minCount=1,
    dim=100,
    minn=3,     # subword 최소 ngram
    maxn=5      # subword 최대 ngram
)

In [15]:
model.get_word_vector('극장')

array([ 0.51624244, -0.4024299 , -0.5206563 ,  0.8314084 , -0.25989515,
        0.03441481, -0.1615962 , -0.01060329,  0.12076107,  0.5808947 ,
        0.04603357,  0.46977884,  0.6068551 , -0.38944808, -0.7193219 ,
       -1.017021  ,  0.04511445, -1.4451909 , -0.44483677, -0.22945254,
        0.48974344,  0.35077262,  0.2873878 , -0.06778642,  0.6031203 ,
       -0.9017893 , -0.20970915,  0.5355912 , -0.8157709 ,  0.35741293,
       -0.21840473,  0.41561413, -0.02705621,  0.32620203, -0.5997496 ,
        0.24734351,  0.03415428,  0.31736535, -0.39581585, -0.26159528,
        0.08088437,  0.31370527,  0.37601194, -0.27724096,  1.1015292 ,
       -0.1814245 ,  0.1792432 , -0.47298533, -0.0979843 ,  0.2804461 ,
        0.25746885, -0.8269444 , -0.09075226,  0.4636799 , -0.6466367 ,
        0.15288693,  0.69125354,  0.31309482,  1.0832394 , -0.09035446,
        0.4584078 ,  0.00728013,  0.2042484 ,  0.40845534, -0.04848186,
       -0.08818679, -0.08709148,  0.00172707,  0.3914883 , -0.01

In [16]:
model.get_subwords('영화관')

(['영화관', '<영화', '<영화관', '<영화관>', '영화관', '영화관>', '화관>'],
 array([   2062, 1921845, 1442415, 1378913, 2245977, 1515139, 1352938]))