# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from gensim.models.fasttext import FastText # build and train Fast Text model
from gensim.models import Word2Vec # to Load the saved model
from gensim.models.fasttext import load_facebook_model
from tabulate import tabulate

# Downloading pre-trained FastText model

In [None]:
# Fetch Facebook's pre-trained English fastText model (300-dimensional vectors) as a gzip archive.
# The leading "!" is an IPython shell escape, so the file lands in the notebook's current working directory.
! wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz

# Decompress the archive in place; gunzip replaces cc.en.300.bin.gz with cc.en.300.bin,
# which is the file the pre-trained model is later loaded from
! gunzip "cc.en.300.bin.gz"

# Downloading needed packages for Text Preprocessing

In [None]:
# Download the WordNet corpus from NLTK, saving it to the specified directory
nltk.download('wordnet', "/kaggle/working/nltk_data/")

# Download the OMW-1.4 corpus from NLTK, saving it to the specified directory
nltk.download('omw-1.4', "/kaggle/working/nltk_data/")

# Unzip the WordNet corpus file, extracting it to the corpora directory
! unzip /kaggle/working/nltk_data/corpora/wordnet.zip -d /kaggle/working/nltk_data/corpora

# Unzip the OMW-1.4 corpus file, extracting it to the corpora directory
! unzip /kaggle/working/nltk_data/corpora/omw-1.4.zip -d /kaggle/working/nltk_data/corpora

# Add the custom NLTK data directory to the list of paths that NLTK searches for data
nltk.data.path.append("/kaggle/working/nltk_data/")

# Building the set of English stop words

In [None]:
# Fetch NLTK's English stop-word list, then turn it into a set so that
# membership tests during preprocessing are constant-time
english_stopword_list = stopwords.words('english')
en_stop = set(english_stopword_list)

# Extracting Text data from Yelp dataset

In [None]:
# Load the Yelp tips file; lines=True tells pandas that every line is its own JSON object
yelp_datafile = pd.read_json("/kaggle/input/yelp-dataset/yelp_academic_dataset_tip.json", lines=True)

# Show which columns the dataset provides
print('List of all columns')
print(yelp_datafile.columns.tolist())

# Only the free-text "text" column is needed to train the gensim fastText model
all_sentences = yelp_datafile['text'].tolist()

# Keep the first 1000 entries as a small training subset
part_of_sentences = all_sentences[:1000]

# Display the first ten raw sentences to get a feel for the data
print(f"\nSamples of Sentences\n {part_of_sentences[:10]}")

# Defining a lemmatizer object to be used later

In [None]:
# Create one shared WordNet lemmatizer instance; process_text reuses it for every token
lemmatizer = WordNetLemmatizer()

# Function for Text Preprocessing

In [None]:
def process_text(review):
    """Clean one review and return its unique, lemmatized content words.

    The text is stripped of non-alphabetic characters, lower-cased, split on
    whitespace, lemmatized, filtered against the English stop-word set and
    reduced to words longer than three characters. Duplicates are removed
    while keeping the order of first appearance.

    Relies on the module-level ``lemmatizer`` and ``en_stop`` objects.

    Args:
        review: The raw review text. Non-string values are converted with
            ``str()`` before any processing.

    Returns:
        A list of unique cleaned tokens in order of first occurrence; an
        empty list when nothing survives the filters.
    """
    # Coerce up front so that a non-string value cannot crash the first regex
    review = str(review)

    review = re.sub(r'\s+', ' ', review)  # Collapse runs of white space

    review = re.sub(r'\W', ' ', review)  # Replace special characters with spaces

    review = re.sub(r'\s+[a-zA-Z]\s+', ' ', review)  # Remove stand-alone single letters

    review = re.sub(r'[^a-zA-Z\s]', '', review)  # Remove anything that is not a letter or white space

    # Lower-case and tokenize on white space
    tokens = review.lower().split()

    # Applying lemmatization
    lemma_txt = [lemmatizer.lemmatize(word) for word in tokens]

    # Removing stop words
    lemma_no_stop_txt = [word for word in lemma_txt if word not in en_stop]

    # Keep only words longer than 3 characters. The filter must run on the
    # lemmatized, stop-word-free list; filtering the raw tokens instead would
    # throw away the two previous steps.
    long_words = [word for word in lemma_no_stop_txt if len(word) > 3]

    # dict keys are unique and keep insertion order, so this drops duplicates
    # while preserving the position of each word's first occurrence
    return list(dict.fromkeys(long_words))

In [None]:
# Run the preprocessing function over every sentence of the training subset
cleaned_reviews = list(map(process_text, part_of_sentences))

# Part of training data

In [None]:
# Show the first 10 cleaned reviews (each one is a list of tokens) as a sanity check
print(cleaned_reviews[:10])

# Function to train the custom FastText model and save it to disk

In [None]:
def train_Fasttext(sentences, embedding_size, window_size, min_word, down_sampling, Save_model_filename,
                   workers=4, sg=1, epochs=100):
    """Train a gensim FastText model on tokenized sentences and save it to disk.

    Args:
        sentences: Iterable of token lists used as the training corpus.
        embedding_size: Dimensionality of the word vectors.
        window_size: Context window size passed to gensim as ``window``.
        min_word: Words with a total frequency below this are ignored.
        down_sampling: Threshold above which higher-frequency words are
            randomly down-sampled.
        Save_model_filename: Path the trained model is saved to.
        workers: Number of training threads (default 4, as before).
        sg: Training algorithm, skip-gram if 1, otherwise CBOW (default 1).
        epochs: Number of passes over the corpus (default 100).

    Returns:
        None. The trained model is only written to ``Save_model_filename``.
    """
    fast_Text_model = FastText(
        sentences,
        vector_size=embedding_size,
        window=window_size,
        min_count=min_word,
        sample=down_sampling,
        workers=workers,
        sg=sg,
        epochs=epochs,
    )

    # Persist the full gensim model so it can be reloaded later
    fast_Text_model.save(Save_model_filename)

In [None]:
# Hyper-parameters for the custom model
embedding_size = 300  # Length of each word vector
window_size = 5  # Size of the context window
min_word = 1  # Minimum frequency a word needs to be kept
down_sampling = 1e-2  # Down-sampling threshold for frequent words

# Train on the cleaned reviews and store the result under the name "Custom_FastText"
train_Fasttext(
    sentences=cleaned_reviews,
    embedding_size=embedding_size,
    window_size=window_size,
    min_word=min_word,
    down_sampling=down_sampling,
    Save_model_filename="Custom_FastText",
)

# Loading the saved custom model from disk

In [None]:
# Load the saved gensim fastText model with FastText.load, the loader that belongs
# to the class that wrote the file, rather than going through Word2Vec.load.
# NOTE(review): the model was saved under the relative name "Custom_FastText", so this
# absolute path assumes the notebook's working directory is /kaggle/working — confirm.
fast_Text_model = FastText.load("/kaggle/working/Custom_FastText")

# Loading pretrained FastText 

In [None]:
# Load the pre-trained English fastText model from Facebook's native .bin file.
# The path points at the file produced by the wget/gunzip cell above
# (assumes the notebook's working directory is /kaggle/working — TODO confirm).
pretrained_fastText_en = load_facebook_model('/kaggle/working/cc.en.300.bin')

# Getting the top n similar & dissimilar words for a particular word using custom & pre-trained FastText

In [None]:
# Vocabulary of the custom-trained model
words = list(fast_Text_model.wv.key_to_index)

# Section headings, in the same order in which the four tables are produced below
headings = (
    "Top 10 similar words (custom model):",
    "\nTop 10 opposite words (custom model):",
    "\nTop 10 similar words (pre-trained model):",
    "\nTop 10 opposite words (pre-trained model):",
)

# Visit every 10th vocabulary entry (change the slice step for denser or sparser output)
for word in words[::10]:
    print(f"Analyzing word: {word}\n")

    # Run the four neighbour queries, custom model first and pre-trained model second:
    # a positive query for the 10 most similar words, then a negated query for the
    # 10 "opposite" words
    queries = []
    for vectors in (fast_Text_model.wv, pretrained_fastText_en.wv):
        queries.append(('Similar Word', vectors.most_similar(word, topn=10)))
        queries.append(('Opposite Word', vectors.most_similar(negative=[word], topn=10)))

    # Render each result list as a GitHub-style table
    tables = [
        tabulate(rows, headers=[first_column, 'Similarity'], tablefmt='github')
        for first_column, rows in queries
    ]

    # Print each table under its heading
    for heading, table in zip(headings, tables):
        print(heading)
        print(table)
    print("\n" + "-"*40 + "\n")  # Separator for readability