In [1]:
! pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25ldone
[?25h  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40703 sha256=5e74d16fe754bef9484764bf8e0fffeaa4aea45d6d01926aac58487a536fd071
  Stored in directory: /root/.cache/pip/wheels/f9/95/ba/f418094659025eb9611f17cbcaf2334236bf39a0c3453ea455
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


# Import Libraries

In [2]:
import numpy as np
import pandas as pd
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import subprocess
import zipfile
import random
import string
import fasttext.util
import fasttext
from gensim.models import FastText
from tabulate import tabulate
from gensim.models.fasttext import load_facebook_model
from fpdf import FPDF
import warnings
warnings.filterwarnings('ignore')

In [3]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Check if WordNet is available, if not, download it and extract
try:
    nltk.data.find('corpora/wordnet.zip')
except LookupError:
    nltk.download('wordnet', download_dir='/kaggle/working/')
    with zipfile.ZipFile('/kaggle/working/corpora/wordnet.zip', 'r') as zip_ref:
        zip_ref.extractall('/kaggle/working/corpora')
    nltk.data.path.append('/kaggle/working/')

# Now you can import wordnet from nltk.corpus
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /kaggle/working/...


# Read json file

In [5]:
file_path="/kaggle/input/yelp-dataset/yelp_academic_dataset_tip.json"
tips_data = []
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        tips_data.append(json.loads(line))

In [6]:
# sample=tips_data[0]  
# text=sample["text"]
# text

In [7]:
text = []
for tip in tips_data:
    #  I  Extract the "text" field from the tip and append it to the text list
    text.append(tip['text'])

In [8]:
samples=text[:100]
samples[:5]

['Avengers time with the ladies.',
 'They have lots of good deserts and tasty cuban sandwiches',
 "It's open even when you think it isn't",
 'Very decent fried chicken',
 'Appetizers.. platter special for lunch']

# Text Processing

In [9]:
                            # Tokenization
tokenized_text = [word_tokenize(text) for text in samples]

                            # Remove stop words
stop_words = set(stopwords.words('english'))
filtered_text = []
for tokens in tokenized_text:
    filtered_text.append([word for word in tokens if word.lower() not in stop_words])

                            # Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_text = []
for tokens in filtered_text:                                                                   # Remove punctuation
    lemmatized_text.append([lemmatizer.lemmatize(word.lower()) for word in tokens if word not in string.punctuation])

                            # Flatten the list of lists
preprocessed_text = [word for sublist in lemmatized_text for word in sublist]

In [10]:
len(preprocessed_text)

601

In [11]:
preprocessed_text[:10]

['avenger',
 'time',
 'lady',
 'lot',
 'good',
 'desert',
 'tasty',
 'cuban',
 'sandwich',
 "'s"]

# FastText model using (Gensim)

In [12]:
# Train the FastText model
model = FastText(sentences=[preprocessed_text], vector_size=100, window=5, min_count=1, workers=4, sg=1)

                              # save the trained model
model.save("fasttext_model")

# model = FastText.load("fasttext_model")

## Note:
* ## min_count specifies the minimum count of words to consider when training the model.
* ## workers specifies the number of worker threads to train the model.
* ## sg is the training algorithm. sg=1 indicates skip-gram, while sg=0 indicates CBOW (Continuous Bag of Words).

### Select 10 random samples

In [13]:
# Select 10 random words from the preprocessed text
random_words = random.sample(preprocessed_text, 10)
random_words

['card',
 'shuttle',
 'anymore',
 'sometimes',
 'l',
 'last',
 'rally',
 'cheeseburger',
 'ring',
 'inside']

# find similar and dissimilar words gensim function

In [14]:
def find_similar_and_dissimilar_words_gensim(word, model):
    try:
        # Find top 10 most similar words
        similar_words = model.wv.most_similar(positive=[word], topn=10)

        # Find top 10 most dissimilar words (opposite words)
        dissimilar_words = model.wv.most_similar(negative=[word], topn=10)

        return similar_words, dissimilar_words
    except KeyError:
        # Handle the case if the word is not in the vocabulary
        return [], []

# Iterate over each random word and find similar and dissimilar words
for word in random_words:
    similar_words, dissimilar_words = find_similar_and_dissimilar_words_gensim(word, model)

    if similar_words:
        # Tabulate the similar words
        table_similar = tabulate(similar_words, headers=['Similar Word', 'Similarity'], tablefmt='github')
        print(f"\nTop 10 Similar Words for '{word}':")
        print(table_similar)

    if dissimilar_words:
        # Tabulate the dissimilar (opposite) words
        table_dissimilar = tabulate(dissimilar_words, headers=['Dissimilar Word', 'Similarity'], tablefmt='github')
        print(f"\nTop 10 Dissimilar Words for '{word}':")
        print(table_dissimilar)

    print("\n" + "-" * 40 + "\n")



Top 10 Similar Words for 'card':
| Similar Word   |   Similarity |
|----------------|--------------|
| carl           |     0.395454 |
| restaurant     |     0.36008  |
| orleans        |     0.312124 |
| new            |     0.300653 |
| many           |     0.291457 |
| nice           |     0.277932 |
| breakfast      |     0.276724 |
| chalkboard     |     0.259243 |
| starter        |     0.257331 |
| cheese         |     0.250604 |

Top 10 Dissimilar Words for 'card':
| Dissimilar Word   |   Similarity |
|-------------------|--------------|
| incorrect         |     0.19468  |
| easy              |     0.179817 |
| hahaha            |     0.15579  |
| bank              |     0.155541 |
| friend            |     0.155388 |
| 2013              |     0.152642 |
| delicious         |     0.143214 |
| time              |     0.13968  |
| dont              |     0.136974 |
| ugh               |     0.136965 |

----------------------------------------


Top 10 Similar Words for 'shuttle

# pre-trained FastText

In [15]:
# Download the pre-trained FastText word embeddings for English (300-dimensional vectors) from the Facebook AI repository
! wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz

# Uncompress the downloaded file using gunzip, so that it can be used by the FastText library
! gunzip "cc.en.300.bin.gz"

--2024-10-17 14:50:15--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.163.189.51, 3.163.189.14, 3.163.189.96, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.163.189.51|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: 'cc.en.300.bin.gz'


2024-10-17 14:50:30 (288 MB/s) - 'cc.en.300.bin.gz' saved [4503593528/4503593528]



In [16]:
# Load pretrained fastText word embeddings
pretrained_fastText_en = load_facebook_model('/kaggle/working/cc.en.300.bin')

# Generate PDF

In [17]:
# Example: Load pretrained fastText word embeddings (Uncomment for actual use)
# pretrained_fastText_en = load_facebook_model('/kaggle/working/cc.en.300.bin')

class PDF(FPDF):
    def header(self):
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'Word Similarity and Opposite Analysis', 0, 1, 'C')

    def chapter_title(self, word):
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, f'Analyzing word: {word}', 0, 1, 'L')

    def chapter_body(self, title, table):
        self.set_font('Arial', '', 12)
        # Encode content in utf-8 to handle special characters
        title_utf8 = title.encode('latin-1', 'replace').decode('latin-1')
        table_utf8 = table.encode('latin-1', 'replace').decode('latin-1')
        self.multi_cell(0, 10, f'{title_utf8}\n{table_utf8}')
        self.ln()
        
    def print_chapter(self, word, tables):
        self.add_page()
        self.chapter_title(word)
        for title, table in tables.items():
            self.chapter_body(title, table)

def analyze_word(word, model, model_name="custom model"):
    try:
        # Get top 10 similar words
        similar_words = model.wv.most_similar(word, topn=10)

        # Get top 10 opposite words
        opposite_words = model.wv.most_similar(negative=[word], topn=10)

        # Create tables to display the results
        table_similar = tabulate(similar_words, headers=['Similar Word', 'Similarity'], tablefmt='github')
        table_opposite = tabulate(opposite_words, headers=['Opposite Word', 'Similarity'], tablefmt='github')

        # Print the results in a formatted way
        print(f"Analyzing word: {word}")
        print(f"\nTop 10 similar words ({model_name}):")
        print(table_similar)
        print(f"\nTop 10 opposite words ({model_name}):")
        print(table_opposite)
        print("\n" + "-"*40 + "\n")

        return {"Top 10 similar words": table_similar, "Top 10 opposite words": table_opposite}
    except KeyError:
        print(f"The word '{word}' is not in the model vocabulary.")
        return {}

def save_all_to_single_pdf(word_analysis, pdf_filename="word_similarity_analysis.pdf"):
    # Initialize the custom PDF class
    pdf = PDF()
    
    # Add the analysis for each word to the PDF
    for word, tables in word_analysis.items():
        pdf.print_chapter(word, tables)
    
    # Output the final PDF
    pdf.output(pdf_filename)

# Example list of words to analyze (Uncomment for actual use)
# random_words = ['silver', 'king', 'apple']    Simple Example

# Dictionary to store analysis for all words
word_analysis = {}

# Iterate over each word and find similar and dissimilar words
for word in random_words:
    # Analyze the word for the custom model
    tables = analyze_word(word, pretrained_fastText_en, model_name="pretrained fastText model")

    if tables:  # Check if the analysis was successful (i.e., the word was found)
        word_analysis[word] = tables  # Store the analysis in the dictionary

# Save all analyses to a single PDF
save_all_to_single_pdf(word_analysis, pdf_filename="all_word_analysis.pdf")

Analyzing word: card

Top 10 similar words (pretrained fastText model):
| Similar Word   |   Similarity |
|----------------|--------------|
| cards          |     0.839138 |
| card.The       |     0.794356 |
| card.This      |     0.784574 |
| card.I         |     0.781328 |
| card.It        |     0.781021 |
| card.So        |     0.777752 |
| card.Now       |     0.77112  |
| card.As        |     0.761264 |
| card.What      |     0.759149 |
| card.          |     0.758997 |

Top 10 opposite words (pretrained fastText model):
| Opposite Word   |   Similarity |
|-----------------|--------------|
| Coalbed         |     0.198198 |
| Zhoukoudian     |     0.195917 |
| Karangasem      |     0.195446 |
| Navallur        |     0.18812  |
| Median          |     0.187289 |
| Buras           |     0.18142  |
| Androctonus     |     0.181393 |
| Mengwi          |     0.181197 |
| EditAttach      |     0.180899 |
| 00-05           |     0.175801 |

----------------------------------------

Analy