In [3]:
!pip install fasttext



# libraries

In [4]:
import pandas as pd
from gensim.models.fasttext import FastText
import fasttext.util
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Load Data

In [5]:
tips = pd.read_json("/kaggle/input/yelp-dataset/yelp_academic_dataset_tip.json", lines=True)
tips_text = tips['text'].tolist()
subset_tips = tips_text[:1000]

# Preprocessing

In [6]:
def preprocess(text):
    # Tokenization
    tokens = word_tokenize(text)

    # Lowercasing
    tokens = [token.lower() for token in tokens]

    # Remove punctuation
    tokens = [re.sub(r'[^a-zA-Z\s]', ' ', token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and len(token) > 3]

    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Reconstruct the text
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

In [8]:
import nltk
import subprocess
try:
    nltk.data.find('wordnet.zip')
except:
    nltk.download('wordnet', download_dir='/kaggle/working/')
    command = "unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
    subprocess.run(command.split())
    nltk.data.path.append('/kaggle/working/')
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /kaggle/working/...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /kaggle/working/corpora/wordnet.zip


replace /kaggle/working/corpora/wordnet/lexnames? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


In [9]:
tips_preprocessed = []

for text in subset_tips:
    preprocessed_text = preprocess(text)
    tokens = word_tokenize(preprocessed_text)
    tips_preprocessed.append(tokens)

In [164]:
# model = FastText(vector_size=100, window=5, min_count=5)
# model.build_vocab(corpus_iterable=tips_preprocessed)
# model.train(corpus_iterable=tips_preprocessed, total_examples=len(tips_preprocessed), epochs=10)
# def test(embedding, word):
#     try:
#         similar_words = embedding.wv.most_similar(word)
#         negative_words = embedding.wv.most_similar(negative=[word],topn=10)
#         return similar_words, negative_words
#     except KeyError:
#         return None, None

# Custom_Fasttext

In [10]:
def train_Fasttext(sentences,embedding_size,window_size,min_word,down_sampling,Save_model_filename):
    fast_Text_model = FastText(sentences,
    vector_size=embedding_size, 
    window=window_size,
    min_count=min_word, 
    sample=down_sampling, 
    workers = 4, 
    sg=1, 
    epochs=100) 

    fast_Text_model.save(Save_model_filename)

In [11]:

embedding_size = 300
window_size = 5
min_word = 5
down_sampling = 1e-2

train_Fasttext(tips_preprocessed,embedding_size,window_size,min_word,down_sampling,"Custom_FastText")

In [12]:
from gensim.models import Word2Vec

In [13]:
fast_Text_model = Word2Vec.load("/kaggle/working/Custom_FastText") 

In [169]:
! wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
! gunzip "cc.en.300.bin.gz"

--2024-04-21 07:22:09--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.35.7.128, 13.35.7.38, 13.35.7.82, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.35.7.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: 'cc.en.300.bin.gz'

cc.en.300.bin.gz      9%[>                   ] 393.96M  88.0MB/s    eta 50s    ^C
gzip: cc.en.300.bin already exists; do you wish to overwrite (y or n)? ^C


In [14]:
from gensim.models.fasttext import load_facebook_model

ft=load_facebook_model('/kaggle/working/cc.en.300.bin')

In [16]:
from tabulate import tabulate
words = list(fast_Text_model.wv.key_to_index)  

for i in range(len(words)):
    if i % 10 == 0: 
        print(f"Analyzing word: {words[i]}\n")
        
        
        similar_words_custom = fast_Text_model.wv.most_similar(words[i], topn=10)
        opposite_words_custom = fast_Text_model.wv.most_similar(negative=[words[i]], topn=10)
        similar_words_pretrained = ft.wv.most_similar(words[i], topn=10)
        opposite_words_pretrained = ft.wv.most_similar(negative=[words[i]], topn=10)
        print("Custom-trained FastText model results:")
        table_custom_similar = tabulate(similar_words_custom, headers=['Similar Word', 'Similarity'], tablefmt='github')
        table_custom_opposite = tabulate(opposite_words_custom, headers=['Opposite Word', 'Similarity'], tablefmt='github')
        table_pretrained_similar = tabulate(similar_words_pretrained, headers=['Similar Word', 'Similarity'], tablefmt='github')
        table_pretrained_opposite = tabulate(opposite_words_pretrained, headers=['Opposite Word', 'Similarity'], tablefmt='github')
        
        # Printing the tables
        print("Top 10 similar words (custom model):")
        print(table_custom_similar)
        print("\nTop 10 opposite words (custom model):")
        print(table_custom_opposite)
        print("\nTop 10 similar words (pre-trained model):")
        print(table_pretrained_similar)
        print("\nTop 10 opposite words (pre-trained model):")
        print(table_pretrained_opposite)
        print("\n" + "-"*40 + "\n")  # Separator for readability

Analyzing word: great

Custom-trained FastText model results:
Top 10 similar words (custom model):
| Similar Word   |   Similarity |
|----------------|--------------|
| eat            |     0.469364 |
| lot            |     0.45095  |
| highli         |     0.44481  |
| favorit        |     0.441522 |
| choic          |     0.433028 |
| pickl          |     0.415312 |
| fantast        |     0.410896 |
| server         |     0.390318 |
| burrito        |     0.387188 |
| breakfast      |     0.385408 |

Top 10 opposite words (custom model):
| Opposite Word   |   Similarity |
|-----------------|--------------|
| would           |   0.0539012  |
| check           |   0.0418079  |
| hand            |   0.0330255  |
| store           |   0.0326116  |
| yelp            |  -0.00583709 |
| sign            |  -0.0205808  |
| keep            |  -0.0224004  |
| next            |  -0.0244066  |
| go              |  -0.0322912  |
| late            |  -0.0329636  |

Top 10 similar words (pre-trained

In [None]:
# import random
# tokens = [re.sub(r'[^a-zA-Z\s]', ' ', token) for token in subset_tips]
# tokenized_tips = [nltk.word_tokenize(sentence) for sentence in tokens]
# tokenized_tips = [[token.lower() for token in sentence if token.isalpha() and len(token) > 3] for sentence in tokenized_tips]

# # Remove punctuation


# # Now you can randomly sample sentences from tokenized_tips
# sentences_to_sample = 10  # Number of sentences you want to sample
# random_sentences = random.sample(tokenized_tips, sentences_to_sample)


# fasttext_results = {}
# for sentence in random_sentences:
#     for word in sentence:
#         similar_words, negative_words = test(model, word)
#         fasttext_results[word] = (similar_words, negative_words)

In [None]:
# pretrained_results = {}
# for sentence in random_sentences:
#     for word in sentence:
#         similar_words = model.wv.most_similar_cosmul(word, topn=10)
#         negative_words = model.wv.most_similar_cosmul(positive=[word], negative=[word], topn=10)  
#         pretrained_results[word] = (similar_words, negative_words)

In [None]:
# pretrained_results = {}
# for sentence in random_sentences:
#     for word in sentence:
#         similar_words = model.wv.most_similar_cosmul(word, topn=10)
#         negative_words = model.wv.most_similar_cosmul(positive=['random'], negative=[word], topn=10)  # Change 'random' to any word not related to the context
#         pretrained_results[word] = (similar_words, negative_words)