In [1]:
#!/usr/bin/env python
# coding: utf-8
#===============================================================================
#
#           FILE: Tf_Idf_ntb.py 
#         AUTHOR: Bianca Ciobanica
#	       EMAIL: bianca.ciobanica@student.uclouvain.be
#
#           BUGS: 
#        VERSION: 3.11.4
#        CREATED: 13-11-2023 
#
#===============================================================================
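#    DESCRIPTION:  Builds a term-context co-occurrence matrix from corpus.txt (two tokens of
#                  context on each side), weights it with TF-IDF and prints the five nearest
#                  words by cosine similarity for a few target words.
#                  Sources used: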
#                  https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html
#                  https://jofrhwld.github.io/teaching/courses/2022_lin517/lectures/word_vectors/02_vectors_examples.html
#                  https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.dok_matrix.html
#                  https://docs.python.org/3/library/stdtypes.html#frozenset.union
#    
#          USAGE: 
#===============================================================================

In [2]:
import re
import time
from itertools import chain
import math
from nltk.corpus.reader import PlaintextCorpusReader
from nltk.lm import Vocabulary
from collections import Counter
import numpy as np
import pandas as pd
import scipy.sparse as sp

In [3]:
# Wall-clock start of the whole notebook run; the last cell subtracts it from
# the current time to report the total execution time.
start_time_program = time.time()

In [4]:
# Read "corpus.txt" from the current working directory through NLTK's plaintext
# corpus reader; preprocess_steps() later asks this object for its sentences.
corpus = PlaintextCorpusReader(root=".", 
                               fileids=["corpus.txt"])

In [5]:
#print(len(corpus.raw()))

In [6]:
def preprocess_steps(corpus):
    """Turn a corpus into one flat list of lower-cased tokens.

    Each sentence returned by ``corpus.sents()`` is joined back into a string,
    every run of the punctuation characters ``. , : ; ! ? - ' " ( ) [ ]`` is
    replaced by a single space, and the result is split on whitespace.

    Args:
        corpus: Any object exposing ``sents()`` that yields sentences as
            sequences of token strings.

    Returns:
        list[str]: All tokens of all sentences, lower-cased, in corpus order.
    """
    punctuation_run = re.compile(r"[.,:;!?\-\'\"\(\)\[\]]+")

    words = []
    for sentence in corpus.sents():
        without_punctuation = punctuation_run.sub(' ', " ".join(sentence))
        # str.split() with no argument never yields empty strings, so no
        # additional filtering of the pieces is required.
        words.extend(piece.lower() for piece in without_punctuation.split())

    return words
    
# Flat, lower-cased token list of the whole corpus (punctuation removed).
words = preprocess_steps(corpus)

In [7]:
# ~~~~~ Create restricted voc ~~~~~
# Minimum corpus frequency a word needs to stay in the vocabulary; it is passed
# as `threshold` to create_restricted_voc, which maps rarer words to '<UNK>'.
unk_cutoff = 10
#freqDist = Counter(sorted(words)) # sort alphabetically beforehand

#top_5_words = [item[0] for item in freqDist.most_common()[:5]] # get top 5
#print("\n".join(top_5_words))

In [8]:
def create_restricted_voc(threshold=None, tokens=None):
    """Replace rare words by ``'<UNK>'`` and list the resulting vocabulary.

    Args:
        threshold: Minimum number of occurrences a word needs to be kept.
            Words seen fewer than ``threshold`` times become ``'<UNK>'``.
            ``None`` (the default) disables the cutoff and keeps every word.
        tokens: Token list to restrict. When omitted, the module-level
            ``words`` list is used.

    Returns:
        tuple[list[str], list[str]]: ``(restricted_voc, unique_tokens)`` where
        ``restricted_voc`` has the same length and order as the input tokens
        and ``unique_tokens`` holds each distinct token of it exactly once,
        sorted so that the result is identical from one run to the next.
    """
    if tokens is None:
        tokens = words

    if threshold is None:
        # No cutoff requested: comparing a count against None would raise a
        # TypeError, so the tokens are simply copied through unchanged.
        restricted_voc = list(tokens)
    else:
        word_counts = Counter(tokens)
        restricted_voc = ['<UNK>' if word_counts[word] < threshold else word for word in tokens]

    # Iteration order of a set of strings changes between interpreter runs
    # (hash randomisation); sorting keeps downstream results reproducible.
    unique_tokens = sorted(set(restricted_voc))

    return restricted_voc, unique_tokens

In [9]:
def create_index(corpus, unique_tokens):
    """Build an inverted index: token -> list of its positions in ``corpus``.

    Args:
        corpus: Sequence of tokens; positions are counted from 0.
        unique_tokens: Every token that may appear in ``corpus``. Each one gets
            an entry, with an empty list if it never occurs.

    Returns:
        dict: Maps each token of ``unique_tokens`` to the ascending list of
        positions at which it occurs.

    Raises:
        KeyError: If ``corpus`` contains a token missing from ``unique_tokens``.
    """
    positions_by_token = {}
    for token in unique_tokens:
        positions_by_token[token] = []

    for position, token in enumerate(corpus):
        occurrences = positions_by_token[token]
        occurrences.append(position)

    return positions_by_token

In [10]:
# NOTE: this rebinds the global `words` to the restricted token list
# (out-of-vocabulary words replaced by '<UNK>'); window() reads that list.
words, unique_tokens = create_restricted_voc(threshold=unk_cutoff) # oov words replaced with unk in words array
# token -> positions of that token in the restricted `words` list
words_index = create_index(words, unique_tokens)

In [11]:
# Number of distinct tokens after '<UNK>' replacement; create_tfidf_matrix and
# create_tfidf_sparse use it as the numerator of the idf ratio.
voc_size = len(unique_tokens)
print("unique tokens :",voc_size)

unique tokens : 15200


In [12]:
# ~~~~~ Contextual windows  ~~~~~
# w1 = words[0]
# w2 = computer
# w3 = words[-2]
def window(i, size=2):
    """Return the context words surrounding position ``i`` of the global ``words`` list.

    Up to ``size`` tokens are taken on each side of the target and the target
    itself is excluded. Near either end of the corpus the window is truncated,
    so fewer than ``2 * size`` words may be returned.

    Args:
        i: Zero-based position of the target word in ``words``.
        size: Number of context tokens to take on each side (default 2).

    Returns:
        list: Context tokens in corpus order (left context, then right context).

    Raises:
        IndexError: If ``i`` is not a valid position in ``words``.
    """
    if not 0 <= i < len(words):
        raise IndexError(f"window position {i} is out of range for a corpus of {len(words)} words")

    # Clamp the left edge at 0: a negative index would wrap around and pull in
    # words from the END of the corpus (i == 1 used to return words[-1]).
    left_context = words[max(0, i - size):i]
    # Slicing past the end of a list is safe and simply yields fewer items.
    right_context = words[i + 1:i + 1 + size]

    return list(left_context) + list(right_context)
    
#print(",".join(window(0)))
#print(",".join(window(words.index("person"))))
#print(",".join(window(len(words) - 2)))


In [13]:
def co_occurrence_matrix(target_words):
    """Count, for every target word, the words found in its context windows.

    Args:
        target_words: Mapping from a target word to the positions at which it
            occurs; each position is handed to ``window()``.

    Returns:
        dict: Maps every target word to a ``Counter`` of its context words,
        accumulated over all of the target's positions.

    Side effects:
        Prints the time spent building the counts.
    """
    started_at = time.time()

    counts_by_target = {}
    for target in target_words:
        context_counts = Counter()
        for position in target_words[target]:
            # Counter.update adds one to the count of every word in the window.
            context_counts.update(window(position))
        counts_by_target[target] = context_counts

    finished_at = time.time()
    elapsed = finished_at - started_at

    print(f"Execution time: {elapsed} seconds")
    return counts_by_target
    
# Raw co-occurrence counts: target word -> Counter of its context words.
term_context_matrix = co_occurrence_matrix(words_index)

Execution time: 1.8197307586669922 seconds


In [14]:
def create_tfidf_matrix(matrix, vocabulary_size=None):
    """Weight a co-occurrence matrix with TF-IDF and return it as a new matrix.

    For a target word ``w`` and a context word ``c``::

        tf(w, c) = log10(count(w, c) + 1)
        idf(w)   = log10(vocabulary_size / df(w))

    where ``df(w)`` is the number of distinct context words recorded for ``w``.

    Args:
        matrix: Dict of dicts (e.g. ``Counter`` objects): target word ->
            {context word: count}. It is NOT modified, so the raw counts stay
            available and calling the function again gives the same result.
        vocabulary_size: Numerator of the idf ratio. When omitted, the
            module-level ``voc_size`` is used.

    Returns:
        dict: target word -> ``Counter`` {context word: tf * idf}. A target
        without any context word maps to an empty ``Counter``.

    Side effects:
        Prints the time spent computing the weights.
    """
    start_time = time.time()

    if vocabulary_size is None:
        vocabulary_size = voc_size

    weighted = {}
    for word, context_counts in matrix.items():
        df = len(context_counts)
        if df == 0:
            # Nothing to weight, and the idf ratio would divide by zero.
            weighted[word] = Counter()
            continue

        # idf depends on the target word only, so compute it once per row
        # rather than once per (word, context word) pair.
        idf = math.log10(vocabulary_size / df)
        weighted[word] = Counter({
            cword: math.log10(count + 1) * idf
            for cword, count in context_counts.items()
        })

    end_time = time.time()
    execution_time = end_time - start_time

    print(f"Execution time: {execution_time} seconds")

    return weighted

# TF-IDF weighted vectors, read by get_cosine_sim through this global name.
tfidf_matrix = create_tfidf_matrix(term_context_matrix)

Execution time: 0.4287431240081787 seconds


In [15]:
def co_occurrence_matrix_sparse(target_words):
    """Build the co-occurrence counts as a square pandas DataFrame.

    Args:
        target_words: Mapping from a target word to the positions at which it
            occurs; each position is handed to ``window()``.

    Returns:
        pandas.DataFrame: One row and one column per key of ``target_words``
        (same order). Cell ``[w, c]`` holds, as an int, how often ``c`` was seen
        in the windows of ``w``; pairs never seen together hold 0.

    Side effects:
        Prints the time spent building the frame.
    """
    started_at = time.time()

    labels = list(target_words)

    # One Counter per target word, in the same order as ``labels``.
    rows = []
    for target in labels:
        context_counts = Counter()
        for position in target_words[target]:
            context_counts.update(window(position))
        rows.append(context_counts)

    # Missing (target, context) pairs come out of the constructor as NaN;
    # they are turned into zero counts before casting to int.
    occurrences = (
        pd.DataFrame(rows, index=labels, columns=labels)
        .fillna(0)
        .astype(int)
    )

    finished_at = time.time()
    elapsed = finished_at - started_at

    print(f"Execution time: {elapsed} seconds")
    return occurrences
    
#term_context_matrix_sparse = co_occurrence_matrix_sparse(words_index)

In [16]:
def create_tfidf_sparse(matrix, vocabulary_size=None):
    """Weight a DataFrame of co-occurrence counts with TF-IDF.

    Row-wise, for a target word ``w`` and a context word ``c``::

        tf(w, c) = log10(count(w, c) + 1)
        idf(w)   = log10(vocabulary_size / df(w))

    where ``df(w)`` is the number of non-zero cells in the row of ``w``.

    Args:
        matrix: pandas DataFrame of counts, one row per target word (the shape
            produced by ``co_occurrence_matrix_sparse``). It is not modified.
        vocabulary_size: Numerator of the idf ratio. When omitted, the
            module-level ``voc_size`` is used.

    Returns:
        pandas.DataFrame: Same index and columns as ``matrix``, holding
        ``tf * idf``. Rows without any co-occurrence stay all-zero.
    """
    if vocabulary_size is None:
        vocabulary_size = voc_size

    tf_matrix_sparse = np.log10(matrix + 1)

    # df: number of context words each row co-occurs with at least once.
    df_vector_sparse = np.asarray((tf_matrix_sparse > 0).sum(axis=1), dtype=float)

    # Only divide where df > 0. A row with no co-occurrence would otherwise get
    # idf = inf and, multiplied by its zero tf values, turn into NaN.
    idf_vector_sparse = np.zeros_like(df_vector_sparse)
    has_context = df_vector_sparse > 0
    idf_vector_sparse[has_context] = np.log10(vocabulary_size / df_vector_sparse[has_context])

    # axis=0 aligns the idf vector with the rows: every row is scaled by its own idf.
    tfidf_matrix_sparse = tf_matrix_sparse.mul(idf_vector_sparse, axis=0)

    return tfidf_matrix_sparse
#tfidf_matrix_sparse = create_tfidf_sparse(term_context_matrix_sparse)

In [17]:
def get_cosine_sim(word1, word2, matrix=None):
    """Cosine similarity between the TF-IDF vectors of two words.

    Args:
        word1: First word; must be a key of the matrix.
        word2: Second word; must be a key of the matrix.
        matrix: Dict of dicts, word -> {context word: weight}. When omitted,
            the module-level ``tfidf_matrix`` is used.

    Returns:
        float: ``dot(v1, v2) / (|v1| * |v2|)``. Returns ``0.0`` when either
        vector has zero norm (no context words, or only zero weights), because
        the ratio is undefined there and a NaN would make any later sort on
        the similarities unreliable.

    Raises:
        KeyError: If one of the words is missing from the matrix.
    """
    if matrix is None:
        matrix = tfidf_matrix

    vector1 = matrix[word1]
    vector2 = matrix[word2]

    v1_norm = np.linalg.norm(list(vector1.values()))
    v2_norm = np.linalg.norm(list(vector2.values()))
    total_counts = v1_norm * v2_norm

    if total_counts == 0:
        return 0.0

    # Only context words present in both vectors contribute to the dot
    # product. Walking the shorter vector keeps the number of lookups small
    # and, unlike iterating over a set, gives a fixed summation order.
    if len(vector2) < len(vector1):
        vector1, vector2 = vector2, vector1

    dot_product = sum(
        weight * vector2[context_word]
        for context_word, weight in vector1.items()
        if context_word in vector2
    )

    return dot_product / total_counts

In [18]:
def get_cosine_sparse(word1, word2, matrix=None):
    """Cosine similarity between two rows of a TF-IDF table.

    Args:
        word1: Row label of the first word.
        word2: Row label of the second word.
        matrix: Object whose ``.loc[word]`` returns the numeric vector of a
            word (e.g. the DataFrame returned by ``create_tfidf_sparse``).
            When omitted, the module-level ``tfidf_matrix_sparse`` is used.
            NOTE: the cell assigning that global is commented out in this
            notebook, so it has to be defined before relying on the default.

    Returns:
        float: ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector
        has zero norm (the ratio is undefined there and would evaluate to NaN).
    """
    if matrix is None:
        matrix = tfidf_matrix_sparse

    vector1 = matrix.loc[word1]
    vector2 = matrix.loc[word2]

    v1_norm = np.linalg.norm(vector1)
    v2_norm = np.linalg.norm(vector2)
    total_counts = v1_norm * v2_norm

    if total_counts == 0:
        return 0.0

    dot_product = np.inner(vector1, vector2)

    return dot_product / total_counts

In [19]:
def get_5_closest_words(target):
    """Print and return the five words most similar to ``target``.

    Every token of the global ``unique_tokens`` except ``target`` itself is
    scored against ``target`` with ``get_cosine_sim``; the five best scores
    are printed in descending order.

    Args:
        target: Word whose nearest neighbours are wanted.

    Returns:
        set: The (at most) five highest-scoring words, without their ranking.
    """
    similarities = {
        candidate: get_cosine_sim(target, candidate)
        for candidate in unique_tokens
        if candidate != target
    }

    # Stable sort on the score, highest first; keep the five best pairs.
    ranked = sorted(similarities.items(), key=lambda pair: pair[1], reverse=True)
    best_pairs = ranked[:5]

    print("Top 5 words similar to " + target)
    print("From scratch and no sparse")
    for neighbour, score in best_pairs:
        print(f"{neighbour}: {score}")
    print()

    return {neighbour for neighbour, _ in best_pairs}

In [20]:
# Each call prints the ranked neighbours with their scores and returns the
# same words as an unordered set.
car_closest = get_5_closest_words('car')
feature_closest = get_5_closest_words('feature')
computer_closest = get_5_closest_words('computer')

Top 5 words similar to car
From scratch and no sparse
cars: 0.5459432013009844
issue: 0.5350757773255332
built: 0.5338796657113848
position: 0.5327571649308895
meeting: 0.5292846421623635

Top 5 words similar to feature
From scratch and no sparse
issue: 0.5509324369691192
event: 0.5497552268988785
appearance: 0.548460352322813
featured: 0.5469790860048481
features: 0.5417038249595428

Top 5 words similar to computer
From scratch and no sparse
technology: 0.5178624163693614
software: 0.5084677660785514
system: 0.4878365169166404
programming: 0.4847636197099709
game: 0.4830950880372917



In [21]:
# The results are sets, so the order displayed here is arbitrary; the ranked
# order is the one printed by get_5_closest_words itself.
print("5 closest words to car: \n", car_closest)
print("5 closest words to feature: \n",feature_closest)
print("5 closest words to computer: \n",computer_closest)

5 closest words to car: 
 {'issue', 'position', 'meeting', 'built', 'cars'}
5 closest words to feature: 
 {'event', 'featured', 'issue', 'features', 'appearance'}
5 closest words to computer: 
 {'technology', 'programming', 'system', 'software', 'game'}


In [22]:
# Total wall-clock time since start_time_program was recorded near the top of
# the notebook.
end_time_program = time.time()
total_execution_time = end_time_program - start_time_program
    
print(f"Total execution time: {total_execution_time} seconds")

Total execution time: 8.554362058639526 seconds
