In [1]:
#!/usr/bin/env python
# coding: utf-8
#===============================================================================
#
#           FILE: word2vec_ntb.py 
#         AUTHOR: Bianca Ciobanica
#          EMAIL: bianca.ciobanica@student.uclouvain.be
#
#           BUGS: 
#        VERSION: 3.11.4
#        CREATED: 16-11-2023 
#
#===============================================================================
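#    DESCRIPTION:  Trains a skip-gram Word2Vec model (gensim) on corpus.txt
#                  and prints the 5 words most similar to each target word.
#                  Sources used: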
#
#    
#          USAGE: 
#===============================================================================

In [2]:
import re
import time
import nltk
from collections import Counter
from itertools import chain
from nltk.corpus.reader import PlaintextCorpusReader
import gensim
from gensim import corpora
from gensim.models import Word2Vec

In [3]:
# Read corpus.txt from the current working directory through NLTK's
# plain-text corpus reader.
corpus = PlaintextCorpusReader(
    root=".",
    fileids=["corpus.txt"],
)

In [4]:
# Minimum corpus frequency for a word to be kept; rarer words become '<UNK>'.
unk_cutoff = 10

In [5]:
def preprocess_steps(corpus):
    """Strip punctuation from and lower-case every sentence of ``corpus``.

    Each sentence returned by ``corpus.sents()`` is joined with single
    spaces, every run of the characters ``. , : ; ! ? - ' " ( ) [ ]`` is
    replaced by one space, and the result is split on whitespace and
    lower-cased.

    Returns a list holding one list of tokens per input sentence. A sentence
    that consists only of punctuation yields an empty list.
    """
    punctuation_run = r"[.,:;!?\-\'\"\(\)\[\]]+"

    cleaned_sentences = []
    for words in corpus.sents():
        without_punct = re.sub(punctuation_run, ' ', " ".join(words))
        # str.split() with no argument never produces empty strings,
        # so no extra filtering of the pieces is needed.
        cleaned_sentences.append([piece.lower() for piece in without_punct.split()])

    return cleaned_sentences
    
# Clean the whole corpus once; create_restricted_voc reads this list.
preprocessed_corpus = preprocess_steps(corpus)

In [6]:
def create_restricted_voc(threshold=None, sentences=None):
    """Replace rare words by ``'<UNK>'`` and collect the resulting vocabulary.

    Parameters
    ----------
    threshold : int or None
        Minimum number of occurrences a word needs in order to be kept;
        words counted fewer than ``threshold`` times become ``'<UNK>'``.
        ``None`` keeps every word unchanged.
    sentences : list of list of str, optional
        Tokenised sentences to process. When omitted, the notebook-level
        ``preprocessed_corpus`` is used.

    Returns
    -------
    tuple
        ``(sentences_unk, unique_tokens)``: the sentences with rare words
        replaced, and the set of distinct tokens left after replacement.
    """
    if sentences is None:
        # Fall back to the corpus built earlier in the notebook.
        sentences = preprocessed_corpus

    if threshold is None:
        # `count < None` raises TypeError; a cutoff of 0 never matches,
        # which is the "no restriction" meaning of the default.
        threshold = 0

    word_counts = Counter(chain.from_iterable(sentences))  # counts over the flattened corpus

    sentences_unk = [
        ['<UNK>' if word_counts[word] < threshold else word for word in sentence]
        for sentence in sentences
    ]

    unique_tokens = set(chain.from_iterable(sentences_unk))

    return sentences_unk, unique_tokens

In [7]:
# Replace every word seen fewer than `unk_cutoff` times by '<UNK>'.
preprocessed_corpus_unk, unique_tokens = create_restricted_voc(threshold=unk_cutoff)

# Disabled: gensim dictionary over the restricted corpus (unused in the cells shown).
#dictionary = corpora.Dictionary(preprocessed_corpus_unk)

In [8]:
# Number of sentences in the <UNK>-restricted corpus.
print(len(preprocessed_corpus_unk))

121991


In [12]:
# Words whose nearest neighbours are inspected once the model is trained.
target_words = ["car", "feature", "computer"]

# Train the embeddings on the <UNK>-restricted sentences.
word2vec_model = Word2Vec(
    sentences=preprocessed_corpus_unk,
    sg=1,             # skip-gram training
    vector_size=100,  # embedding dimensions
    window=2,         # context window size
    negative=10,      # negative-sampling setting
    epochs=50,        # training epochs
    workers=1,        # single worker
)

In [10]:
def get_5_closest_words(target_words, model=None):
    """Print and return the 5 nearest neighbours of each target word.

    Parameters
    ----------
    target_words : iterable of str
        Words to look up. Words for which ``word in model.wv`` is false are
        skipped silently and do not appear in the result.
    model : optional
        Object exposing a ``wv`` attribute that supports ``in`` and
        ``most_similar(word, topn=...)``. When omitted, the notebook-level
        ``word2vec_model`` is used.

    Returns
    -------
    dict
        Maps each target word that was found to the list of
        ``(word, similarity)`` pairs returned by ``most_similar``.
    """
    if model is None:
        # Fall back to the model trained earlier in the notebook.
        model = word2vec_model

    vectors = model.wv
    similarities = {
        target_word: vectors.most_similar(target_word, topn=5)
        for target_word in target_words
        if target_word in vectors
    }

    for target, similar_words in similarities.items():
        print(f"5 words most similar to {target}:")
        # Iterate over what was actually returned: indexing 0..4 would raise
        # IndexError when fewer than 5 neighbours come back.
        print({word for word, _ in similar_words})
        for word, similarity in similar_words:
            print(f"{word} : {similarity}".ljust(50))
        print()

    return similarities

In [11]:
# Print and keep the 5 nearest neighbours of each target word.
similar_to_targets = get_5_closest_words(target_words)

5 words most similar to car:
{'vehicles', 'ghia', 'cars', 'boat', 'tangara'}
cars : 0.586064338684082                          
ghia : 0.5160677433013916                         
boat : 0.5109049677848816                         
vehicles : 0.5091322064399719                     
tangara : 0.5040414929389954                      

5 words most similar to feature:
{'features', 'daioh', 'gedcom', 'featured', 'remixes'}
features : 0.5864483714103699                     
daioh : 0.5153237581253052                        
remixes : 0.492087721824646                       
featured : 0.49171215295791626                    
gedcom : 0.4878462553024292                       

5 words most similar to computer:
{'motorola', 'computing', 'programming', 'programmers', 'interactive'}
programmers : 0.6052811145782471                  
computing : 0.5372326970100403                    
interactive : 0.5279883742332458                  
motorola : 0.5242708921432495                     
programming : 