In [10]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz

--2025-04-13 08:32:34--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.239.50.18, 18.239.50.9, 18.239.50.120, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.239.50.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: ‘cc.en.300.bin.gz’


2025-04-13 08:32:55 (210 MB/s) - ‘cc.en.300.bin.gz’ saved [4503593528/4503593528]



In [11]:
import re
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models import FastText
from gensim.models.fasttext import load_facebook_model
import pandas as pd

# Fetch the NLTK resources used below: the stop-word lists and the WordNet data.
for _resource in ('stopwords', 'wordnet'):
    nltk.download(_resource)

# Module-level helpers shared by the text-cleaning function.
lemmatizer = WordNetLemmatizer()
en_stop = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
def process_text(document):
    """Turn one raw text into a space-joined string of cleaned lemmas.

    Pipeline: coerce to ``str``; replace every non-word character with a space;
    lower-case; split on whitespace; keep tokens longer than three characters;
    lemmatize with the module-level ``lemmatizer``; drop anything found in the
    module-level ``en_stop`` set.

    Args:
        document: Raw text. ``None`` and float NaN are treated as empty text; any
            other non-string value is converted with ``str()``.

    Returns:
        str: The surviving lemmas joined by single spaces, or ``''`` when nothing
        survives.
    """
    # Missing values would otherwise reach re.sub as non-strings and raise TypeError
    # (NaN is the only float that is not equal to itself).
    if document is None or (isinstance(document, float) and document != document):
        return ''

    # Coerce first, then blank out punctuation. Every whitespace character is also a
    # non-word character, and split() below collapses runs of spaces, so a separate
    # whitespace-collapsing pass is not needed. Single-letter tokens are removed by the
    # length filter, so they need no dedicated regex either.
    text = re.sub(r'\W', ' ', str(document)).lower()

    kept = []
    for token in text.split():
        # Length is checked on the surface form, before lemmatization.
        if len(token) <= 3 or token in en_stop:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Check again after lemmatizing: the lemma itself may be a stop word, and
        # checking only here would let through stop words whose lemma differs from
        # the form stored in the stop list.
        if lemma not in en_stop:
            kept.append(lemma)
    return ' '.join(kept)

In [13]:
def load_yelp_data(filepath):
    """Read a JSON Lines file (one JSON object per line) into a pandas DataFrame.

    Args:
        filepath: Location of the file, passed straight to ``pd.read_json``.

    Returns:
        pandas.DataFrame: One row per line of the file.
    """
    frame = pd.read_json(filepath, lines=True)
    return frame

In [14]:
def prepare_data_for_training(df):
    """Clean the ``text`` column and return the cleaned strings as a list.

    Side effect: the cleaned strings are also written back to ``df`` as a new
    ``clean_text`` column, so the caller's frame is modified in place.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        list: The values of ``df['clean_text']``, one entry per row.
    """
    cleaned_column = df['text'].apply(process_text)
    df['clean_text'] = cleaned_column
    return df['clean_text'].tolist()

In [15]:
def train_Fasttext(word_tokens, embedding_size, window_size, min_word, down_sampling,
                   save_model_filename, workers=4, sg=1, epochs=10):
    """Train a gensim FastText model on tokenised text and save it to disk.

    Args:
        word_tokens: Training corpus as a list of token lists.
        embedding_size: Passed to FastText as ``vector_size``.
        window_size: Passed to FastText as ``window``.
        min_word: Passed to FastText as ``min_count``.
        down_sampling: Passed to FastText as ``sample``.
        save_model_filename: Path handed to ``model.save``.
        workers: Number of worker threads (default 4, the value that used to be
            hard-coded).
        sg: Training algorithm switch forwarded to FastText; in gensim 1 selects
            skip-gram and 0 CBOW (default 1, as before).
        epochs: Number of passes over the corpus (default 10, as before).

    Returns:
        The trained FastText model (already saved to ``save_model_filename``).
    """
    model = FastText(
        sentences=word_tokens,
        vector_size=embedding_size,
        window=window_size,
        min_count=min_word,
        sample=down_sampling,
        workers=workers,
        sg=sg,
        epochs=epochs,
    )
    model.save(save_model_filename)
    return model

In [16]:
def test_model(model, test_word):
    print(f"\nEmbedding for '{test_word}':\n", model.wv[test_word])
    print(f"\nTop 10 similar to '{test_word}':")
    print(model.wv.most_similar(test_word, topn=10))
    print(f"\nTop 10 opposite to '{test_word}':")
    print(model.wv.most_similar(negative=[test_word], topn=10))


In [17]:
def update_fasttext_model(model, new_data, epochs=10):
    """Continue training an existing FastText model on additional sentences.

    The model is modified in place: its vocabulary is updated from ``new_data`` and
    it is then trained on ``new_data``.

    Args:
        model: A model exposing ``build_vocab``, ``train`` and ``corpus_count``.
        new_data: Additional corpus as a list of token lists. It is iterated more
            than once, so it must be re-iterable.
        epochs: Number of training passes over ``new_data`` (default 10, the value
            that used to be hard-coded).

    Returns:
        None.
    """
    model.build_vocab(new_data, update=True)
    # corpus_count is read after build_vocab on purpose; gensim is expected to have
    # refreshed it to the number of sentences in new_data at that point.
    model.train(new_data, total_examples=model.corpus_count, epochs=epochs)

In [18]:
def load_pretrained_fasttext(model_path):
    """Load a pre-trained FastText model via gensim's ``load_facebook_model``.

    Args:
        model_path: Path to the model file, forwarded unchanged.

    Returns:
        Whatever ``load_facebook_model`` returns for ``model_path``.
    """
    pretrained = load_facebook_model(model_path)
    return pretrained

In [19]:
# Driver cell: train a custom FastText model on the Yelp tips, inspect it, update it with
# extra sentences, then repeat the inspect/update/inspect cycle on the pre-trained model.

# Load the tips; prepare_data_for_training adds a 'clean_text' column to yelp_data and
# returns that column as a list of cleaned strings.
data_file_path = "/kaggle/input/yelp-dataset/yelp_academic_dataset_tip.json"
yelp_data = load_yelp_data(data_file_path)
prepared_data = prepare_data_for_training(yelp_data)
# The trainer is given token lists, so split every cleaned string on whitespace.
word_tokens = [sentence.split() for sentence in prepared_data]

# Hyper-parameters forwarded to train_Fasttext (as vector_size, window, min_count and
# sample respectively) plus the path the trained model is saved to.
embedding_size = 300
window_size = 5
min_word = 5
down_sampling = 1e-2
save_model_filename = "custom_fasttext_model.model"

# Trains the model and, as a side effect, saves it to save_model_filename.
fasttext_model = train_Fasttext(word_tokens, embedding_size, window_size, min_word, down_sampling, save_model_filename)

# Print the embedding and the most_similar results for one probe word.
test_word = "good"
test_model(fasttext_model, test_word)

# Two toy sentences used to exercise incremental training; fasttext_model is updated
# in place.
# NOTE(review): every toy token occurs only once, so a min_count of 5 may keep them out
# of the updated vocabulary - confirm this is the intended demonstration.
new_data = [['aaa', 'bbbb', 'cccc', 'dddd', 'eeee', 'ffff'],
            ['wwww', "xxxx", "yyyy", "zzzz", "vvvv", "mmmm"]]
update_fasttext_model(fasttext_model, new_data)

# Same probe on the pre-trained model, before and after updating it with the toy sentences.
# The path is relative, so the downloaded archive must be in the working directory.
pretrained_model_path = "cc.en.300.bin.gz"
pretrained_fasttext_en = load_pretrained_fasttext(pretrained_model_path)
test_model(pretrained_fasttext_en, test_word)
update_fasttext_model(pretrained_fasttext_en, new_data)
test_model(pretrained_fasttext_en, test_word)


Embedding for 'good':
 [-0.13433287  0.1345744   0.10928126  0.03601372 -0.22981864 -0.1878953
  0.00411495  0.15122561  0.08133134 -0.04683917  0.07295131 -0.08175544
 -0.13863546  0.09970253 -0.05060301 -0.02424406  0.00797014 -0.03449253
 -0.03939734  0.04164088 -0.05816166  0.14999034  0.01899347 -0.40774056
  0.08540488  0.0052816   0.11912549  0.01046726 -0.14788279 -0.02006248
 -0.01815259 -0.1711259  -0.06239028  0.07748819  0.1306847  -0.16589648
  0.070183   -0.16845682 -0.05021591 -0.05104046  0.23071332  0.00093354
  0.01798169 -0.12545395  0.04640459 -0.06585651 -0.08490772  0.02529917
 -0.05633808 -0.17074384  0.04213131  0.12303939  0.09960306  0.1748239
 -0.10082248 -0.00106067 -0.13588957 -0.20499851 -0.20082498  0.08354304
  0.07337298  0.24567266 -0.10857672 -0.01721917  0.18567075 -0.0218837
 -0.06416624 -0.06317465 -0.16118535 -0.1670803   0.07015943 -0.22043289
 -0.12041414 -0.08078477 -0.06983704 -0.08346073  0.0340683   0.08976562
 -0.07670803  0.32961068 -0.16