# Libraries

In [1]:
import pandas as pd 
# preprocessing
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Data

In [2]:
file_path = r"/kaggle/input/yelp-dataset/yelp_academic_dataset_tip.json"
df = pd.read_json(file_path, lines=True)
df = df[:1000]

text_column = df['text']
text_column.head()

0                       Avengers time with the ladies.
1    They have lots of good deserts and tasty cuban...
2               It's open even when you think it isn't
3                            Very decent fried chicken
4               Appetizers.. platter special for lunch
Name: text, dtype: object

In [3]:
df.shape

(1000, 5)

# Preprocessing

In [4]:
def preprocess(text: str) -> list :
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)       # Remove all non-alphabetic characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)   # Remove all single characters

    tokens = text.split()
    tokens = [t for t in tokens if len(t)>3]      # Keep words with length >= 3

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer() 

    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words]
    return tokens if tokens else []               # Return an empty list if nothing remains

sentences = [preprocess(t) for t in text_column] # List[List[str]]
print(sentences[:3])
len(sentences)

[['avenger', 'time', 'lady'], ['lot', 'good', 'desert', 'tasty', 'cuban', 'sandwich'], ['open', 'even', 'think', 'isnt']]


1000

# Official FastText

In [5]:
from gensim.models import FastText

In [6]:
# Train FastText model
FastText_model = FastText(
    sentences=sentences,
    vector_size=100,
    window=3,
    min_count=1,
    epochs=500
)
print(FastText_model)
# Save the model
FastText_model.save("fasttext_model.model")
print("model saved.")

FastText<vocab=2121, vector_size=100, alpha=0.025>
model saved.


In [7]:
vocab_size = len(FastText_model.wv)
embedding_size = FastText_model.vector_size

# Print vocabulary and embedding size
print(f"Vocabulary Size: {vocab_size}")
print(f"Embedding Size: {embedding_size}")

Vocabulary Size: 2121
Embedding Size: 100


In [8]:
similar_words = FastText_model.wv.most_similar('good', topn=10)
print("\nSimilar")
print(similar_words)
print("-"*30) 
opposite_words = FastText_model.wv.most_similar(negative= 'good', topn=10)
print("\n", opposite_words)


Similar
[('goodi', 0.8452088832855225), ('food', 0.5561390519142151), ('deliciously', 0.5237860083580017), ('neighborhood', 0.4961780607700348), ('deliciousness', 0.4901498854160309), ('deliciousthen', 0.4899592697620392), ('ipod', 0.4890640377998352), ('foodgreat', 0.47072115540504456), ('delicious', 0.46029138565063477), ('bollywood', 0.4556906819343567)]
------------------------------

 [('postage', 0.3698478043079376), ('hermitage', 0.3362504243850708), ('cinco', 0.3282019793987274), ('lurk', 0.3278462290763855), ('professionalism', 0.31859180331230164), ('postal', 0.31132882833480835), ('prepaid', 0.30549004673957825), ('trip', 0.30125343799591064), ('trap', 0.2973553240299225), ('ease', 0.2962052822113037)]


# pretrained FastText model

In [9]:
# Download Model
import urllib.request
import gzip
import os
import shutil

In [10]:
# Download pretrained FastText model
url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz"
output_gz = "cc.en.300.bin.gz"
output_bin = "cc.en.300.bin"

# Download the .gz file
print("Downloading pretrained FastText model...")
urllib.request.urlretrieve(url, output_gz)

# Unzip the .gz file
print("Unzipping the model...")
with gzip.open(output_gz, 'rb') as f_in:
    with open(output_bin, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
print("model saved")
# Remove the .gz file to save space
os.remove(output_gz)

Downloading pretrained FastText model...
Unzipping the model...
model saved


In [11]:
import fasttext

In [12]:
print("Loading the model...")
pretrained = fasttext.load_model(output_bin)
print(pretrained)
vocab_size = len(pretrained.words)
embedding_size = pretrained.get_dimension()
print(f"Vocabulary Size: {vocab_size}")
print(f"Embedding Size: {embedding_size}")
# --------------------
similar_words = pretrained.get_nearest_neighbors("good", k=10)
print("similar words",similar_words)
opposite_words = pretrained.get_nearest_neighbors(negative=["learning"], k=10)
print("opposite words", opposite_words)

Loading the model...
<fasttext.FastText._FastText object at 0x7c722f235f90>
Vocabulary Size: 2000000
Embedding Size: 300
similar words [(0.7517593502998352, 'bad'), (0.7426098585128784, 'great'), (0.7299689054489136, 'decent'), (0.7123614549636841, 'nice'), (0.6796907186508179, 'Good'), (0.6737031936645508, 'excellent'), (0.669592022895813, 'goood'), (0.6602178812026978, 'ggod'), (0.6479219794273376, 'semi-good'), (0.6417751908302307, 'good.Good')]


TypeError: _FastText.get_nearest_neighbors() got an unexpected keyword argument 'negative'

- import fasttext
    - Facebook's original FastText package.
    - Faster and more memory efficient
    - Limited API (e.g., doesn't support negative sampling like Gensim does).
    - get_nearest_neighbors(negative) doesn’t exist in official fasttext
    - Used in real Prijects (Production)

- Gensim FastText
    - You can use: (positive, negative, most_similar, similarity, .....)
    - Slightly slower
    - For production embedding lookup, not as efficient as the original FastText.

In [13]:
from gensim.models.fasttext import load_facebook_model

pretrained = load_facebook_model("cc.en.300.bin")
# model = load_facebook_model(output_bin)

In [14]:
vocab_size = len(pretrained.wv)
embedding_size = pretrained.wv.vector_size
print(f"Vocabulary Size: {vocab_size}")
print(f"Embedding Size: {embedding_size}")
similar = pretrained.wv.most_similar("learning", topn=10)
print("similar words :", similar)

opposite_words = pretrained.wv.most_similar(negative=["learning"],topn=10)
print("\n\nopposite words :", opposite_words)

Vocabulary Size: 2000000
Embedding Size: 300
similar words : [('learing', 0.7456762194633484), ('Learning', 0.6895480751991272), ('learning.This', 0.687819242477417), ('learning.The', 0.6796228289604187), ('learning.It', 0.6753032207489014), ('learning.So', 0.6706693768501282), ('learning.What', 0.6673311591148376), ('learning.But', 0.6648256778717041), ('learning-', 0.6643092036247253), ('learning.As', 0.6633589267730713)]


opposite words : [('19555', 0.2533474564552307), ('12291', 0.23999808728694916), ('10264', 0.2394980639219284), ('13107', 0.23354505002498627), ('8504', 0.23330195248126984), ('13223', 0.23251304030418396), ('7242', 0.23047803342342377), ('13466', 0.2299567013978958), ('10494', 0.22803275287151337), ('14138', 0.2278987020254135)]


In [15]:
pretrained.build_vocab(sentences, update=True)
pretrained.train(
    sentences,
    total_examples=len(sentences),
    epochs=10
)
# Print vocabulary and embedding size
vocab_size = len(pretrained.wv)
embedding_size = pretrained.vector_size
print(f"Vocabulary Size: {vocab_size}")
print(f"Embedding Size: {embedding_size}")

Vocabulary Size: 2000000
Embedding Size: 300


In [16]:
similar_words = pretrained.wv.most_similar("learn", topn=10)
opposite_words = pretrained.wv.most_similar(negative="learn", topn=10)
print(similar_words, "\n\n", opposite_words)

[('teach', 0.716772198677063), ('Learn', 0.7041028738021851), ('learned', 0.6968039274215698), ('learm', 0.6521831750869751), ('re-learn', 0.6518067717552185), ('discover', 0.6409897208213806), ('learn.If', 0.6341798901557922), ('relearn', 0.6159347295761108), ('leanr', 0.6142886877059937), ('understand', 0.6114104390144348)] 

 [('.Rear', 0.22274798154830933), ('3.825', 0.20031915605068207), ('1.638', 0.19616979360580444), ('W52', 0.19612562656402588), ('3.725', 0.19571073353290558), ('9,677', 0.1925133764743805), ('2.101', 0.19243070483207703), ('2.675', 0.1889045089483261), ('3.425', 0.1883799433708191), ('2.76m', 0.1873113363981247)]
