In [8]:
import nltk
import numpy as np
from gensim.models import Word2Vec, FastText
nltk.download('brown')
from nltk.corpus import brown

[nltk_data] Downloading package brown to C:\Users\MSI
[nltk_data]     GF66\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\MSI
[nltk_data]     GF66\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Data Preparation

In [10]:
sentences = brown.sents()

sentences = [[word.lower() for word in sent] for sent in sentences]

print(len(sentences))
print(sentences[:5])

57340
[['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['the', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'city', 'executive', 'committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'city', 'of', 'atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ['the', 'september-october', 'term', 'jury', 'had', 'been', 'charged', 'by', 'fulton', 'superior', 'court', 'judge', 'durwood', 'pye', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'mayor-nominate', 'ivan', 'allen', 'jr.', '.'], ['``', 'only', 'a', 'relative', 'handful', 'of', 'such'

In [12]:
sentences_clean = [
    [word for word in sent if word.isalpha()]
    for sent in sentences
]

print(sentences_clean[:5])

[['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', 'recent', 'primary', 'election', 'produced', 'no', 'evidence', 'that', 'any', 'irregularities', 'took', 'place'], ['the', 'jury', 'further', 'said', 'in', 'presentments', 'that', 'the', 'city', 'executive', 'committee', 'which', 'had', 'charge', 'of', 'the', 'election', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'city', 'of', 'atlanta', 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted'], ['the', 'term', 'jury', 'had', 'been', 'charged', 'by', 'fulton', 'superior', 'court', 'judge', 'durwood', 'pye', 'to', 'investigate', 'reports', 'of', 'possible', 'irregularities', 'in', 'the', 'primary', 'which', 'was', 'won', 'by', 'ivan', 'allen'], ['only', 'a', 'relative', 'handful', 'of', 'such', 'reports', 'was', 'received', 'the', 'jury', 'said', 'considering', 'the', 'widespread', 'interest', 'in', 'the', 'election', 'the', 'number', 'of', 'voters', 'and', 'th

### Test sentences

In [14]:
additional_sentences = [
    # Group 1: Morphologically Related Words
    ["the", "teacher", "is", "teaching", "students", "about", "grammar"],
    ["she", "teaches", "mathematics", "at", "the", "university"],
    ["the", "teacher", "prepared", "teaching", "materials", "yesterday"],
    ["many", "teachers", "attend", "teaching", "conferences", "annually"],
    ["effective", "teaching", "requires", "good", "communication", "skills"],

    # Group 2: Rare Morphological Variant
    ["the", "unteachable", "student", "refused", "to", "learn"],

    # Group 3: Compound and Derived Words
    ["computational", "linguistics", "combines", "computer", "science", "and", "language"],
    ["the", "computation", "took", "several", "hours", "to", "complete"],
    ["we", "computed", "the", "results", "using", "advanced", "algorithms"],
    ["modern", "computers", "can", "compute", "complex", "calculations", "quickly"],

    # Group 4: Rare Compound
    ["the", "recomputation", "was", "necessary", "after", "finding", "errors"],

    # Group 5: Another Morphological Family
    ["natural", "language", "processing", "is", "fascinating"],
    ["the", "nature", "of", "language", "is", "complex"],
    ["naturally", "occurring", "patterns", "in", "text", "are", "important"],

    # Group 6: Rare Morphological Variant
    ["the", "unnaturalness", "of", "the", "translation", "was", "obvious"]
]


sentences.extend(additional_sentences)

print(len(sentences))
print(sentences[-1])

57355
['the', 'unnaturalness', 'of', 'the', 'translation', 'was', 'obvious']


### Models Training

In [16]:
w2v_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
    workers=4
)

w2v_model.train(sentences, total_examples=len(sentences), epochs=10)

w2v_model.save("word2vec.model")

In [18]:
ft_model = FastText(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
    min_n=3,
    max_n=6,
    workers=4
)

ft_model.train(sentences, total_examples=len(sentences), epochs=10)

ft_model.save("fasttext.model")

### Experiment 1:  Out-of-Vocabulary (OOV) Problem

In [23]:
OOV_words = ["teachable", "unteachable", "supercomputer", "miscomputation", "unnaturally"]

for word in OOV_words:
    w2v_vector = w2v_model.wv[word]
    print(w2v_vector.shape)
    print(w2v_vector[:20])

KeyError: "Key 'teachable' not present"

In [25]:
for word in OOV_words:
    ft_vector = ft_model.wv[word]
    print(ft_vector.shape)
    print(ft_vector[:10])

(100,)
[-0.20627715  0.21301503 -0.26031998 -0.51886934  0.3548757   0.1640053
  0.09596927  0.76987135  0.01443066 -0.22707275]
(100,)
[-0.29954165  0.2726649  -0.18479684 -0.22918549  0.25126967  0.04493514
 -0.09066705  0.5907475  -0.14363585 -0.27400377]
(100,)
[ 0.14826566  0.001673    0.02298766 -0.22285356  0.15738691 -0.03291582
  0.09838043  0.32132804 -0.20684262 -0.18740721]
(100,)
[-0.0323016   0.17304905 -0.06469271 -0.19970077  0.06776691  0.01055942
  0.3540716   0.17987329 -0.16240783 -0.59349054]
(100,)
[-0.49514696 -0.13502608  0.25135782  0.01561346  0.20579422 -0.10073999
  0.00263909  0.18381196  0.11275619 -0.47002557]


### Experiment 2: Rare Words Quality

In [28]:
rare_words = ["unteachable", "recomputation", "unnaturalness"]
common_words = ["teacher", "computation", "natural"]

for word1, word2 in zip(rare_words, common_words):
    wv_similarity = w2v_model.wv.similarity(word1, word2)
    print(f"Word2Vec similarity of {word1} and {word2}", wv_similarity)

    ft_similarity = ft_model.wv.similarity(word1, word2)
    print(f"FastText similarity of {word1} and {word2}", ft_similarity)

    if ft_similarity > wv_similarity:
        print("Fast text shows stronger similarity")
    else:
        print("Word2Vec shows stronger similarity")


Word2Vec similarity of unteachable and teacher 0.49298295
FastText similarity of unteachable and teacher 0.5979202
Fast text shows stronger similarity
Word2Vec similarity of recomputation and computation 0.6526901
FastText similarity of recomputation and computation 0.9588416
Fast text shows stronger similarity
Word2Vec similarity of unnaturalness and natural 0.54034305
FastText similarity of unnaturalness and natural 0.7457698
Fast text shows stronger similarity


### Experiment 3: Morphological Relationships

In [39]:
morph_rel = ["teaching", "teach", "computation", "compute"]

for word in morph_rel:
    wv = w2v_model.wv.most_similar(word, topn=5)
    ft = ft_model.wv.most_similar(word, topn=5)
    print(f"Top 5 most similar words to '{word}':")
    print("Word2Vec: ", wv)
    print("Fast text: ", ft)
    print(f"{'_'*70}")
    print()


Top 5 most similar words to 'teaching':
Word2Vec:  [('assimilation', 0.7218841910362244), ('marketable', 0.7090026140213013), ('elementary-school', 0.6986861824989319), ('compulsivity', 0.6984511613845825), ('fabrication', 0.6897518038749695)]
Fast text:  [('teachings', 0.8875789642333984), ('aching', 0.8830503225326538), ('out-reaching', 0.8653694987297058), ('sky-reaching', 0.8644314408302307), ('beaching', 0.860158383846283)]
______________________________________________________________________

Top 5 most similar words to 'teach':
Word2Vec:  [('translate', 0.7983649373054504), ('librarian', 0.7759448885917664), ('approve', 0.7646365761756897), ('recruit', 0.7585350275039673), ('disturb', 0.7575100660324097)]
Fast text:  [('teacher', 0.8378256559371948), ('unteach', 0.8233897089958191), ("teachers'", 0.802670419216156), ('teaches', 0.8021780252456665), ('teachings', 0.7629696130752563)]
______________________________________________________________________

Top 5 most similar words

### Experiment 4: Word Analogies

In [45]:
positive_words = [["teaching", "computer"], ["naturally", "quick"]]
negative_words = ["teacher", "natural"]

for i in range(len(negative_words)):
    print(f"{positive_words[i][0]} : {positive_words[i][1]} = {negative_words[i]} : ? ")
    print("Word2Vec morphological analogies")
    test = w2v_model.wv.most_similar(positive=positive_words[i],
                                     negative=negative_words[i],
                                     topn=5)
    print(test)

    print("Fast text morphological analogies")
    test = ft_model.wv.most_similar(positive=positive_words[i],
                                    negative=negative_words[i],
                                    topn=5)
    print(test)



teaching : computer = teacher : ? 
Word2Vec morphological analogies
[('lookup', 0.7337884306907654), ('manifold', 0.7054128050804138), ('ultraviolet', 0.6857830286026001), ('developmental', 0.6831170916557312), ('glottochronological', 0.6811620593070984)]
Fast text morphological analogies
[('computing', 0.8557388782501221), ('sampling', 0.746216893196106), ('compiling', 0.7445888519287109), ('amplifying', 0.7397489547729492), ('multiplying', 0.7387749552726746)]
naturally : quick = natural : ? 
Word2Vec morphological analogies
[('careful', 0.5796281099319458), ('cognac', 0.5438370704650879), ('dubious', 0.5359185338020325), ('dumb', 0.5358986258506775), ('trig', 0.5355992317199707)]
Fast text morphological analogies
[('quickly', 0.8022687435150146), ('quickie', 0.7499027252197266), ('awfully', 0.7093846201896667), ('quickstep', 0.7070898413658142), ('bully', 0.704820990562439)]
