# <center>SVD vs Word2Vec</center>

Note: All the codes added/modified by me are enclosed withing the marker:<br> 
"####################################### Added by Rimmon #######################################"<br>
<center> <i>My Code</i></center><br>
"####################################### Added by Rimmon #######################################"

#### Importing the required libraries

In [1]:
### libraries for this question 
import nltk
import numpy as np
from nltk.corpus import reuters
from nltk.corpus import stopwords
from gensim.models import word2vec
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

#### Downloading and processing the data from NLTK

In [2]:
nltk.download('reuters')
nltk.download('stopwords')

nltk_data_path = '/home/v-labsai-rimmon-bhosale/nltk_data/'
file_loc = nltk_data_path+'corpora/reuters.zip'
from zipfile import ZipFile
with ZipFile(file_loc, 'r') as z:
  z.extractall(nltk_data_path+'corpora/')

[nltk_data] Downloading package reuters to /home/v-labsai-rimmon-
[nltk_data]     bhosale/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to /home/v-labsai-rimmon-
[nltk_data]     bhosale/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### (a) Load the dataset

In [3]:
def load_data(category="money-fx"):
    """ 
    Read files from the specified Reuter's category.
    
    Params:
        category (string): category name
    
    Return:
        list_words (list[list]): list of list with words from each of the processed files/documents
    """
    files = reuters.fileids(category)
    list_words = []
    ### iterate over all documents/files from the reuters dataset
    for f in files:
      words = []
      ### iterate over all words of a document/file
      for w in list(reuters.words(f)):
        ####################################### Added by Rimmon #######################################
        ### The below condition skips the stop-words and converts each word (w) to lowercase        
        if w not in stopwords.words('english'):
           words.append(w.lower())
        ####################################### Added by Rimmon #######################################
      list_words.append(words)
    return list_words

# check a few samples of reuters corpus
reuters_corpus = load_data()
print(reuters_corpus[:3])

[['bundesbank', 'allocates', '6', '.', '1', 'billion', 'marks', 'in', 'tender', 'the', 'bundesbank', 'accepted', 'bids', '6', '.', '1', 'billion', 'marks', 'today', "'", 'tender', '28', '-', 'day', 'securities', 'repurchase', 'pact', 'fixed', 'rate', '3', '.', '80', 'pct', ',', 'central', 'bank', 'spokesman', 'said', '.', 'banks', ',', 'bid', 'total', '12', '.', '2', 'billion', 'marks', 'liquidity', ',', 'credited', 'funds', 'allocated', 'today', 'must', 'buy', 'back', 'securities', 'pledged', 'may', '6', '.', 'some', '14', '.', '9', 'billion', 'marks', 'drain', 'market', 'today', 'earlier', 'pact', 'expires', ',', 'bundesbank', 'effectively', 'withdrawing', 'net', '8', '.', '1', 'billion', 'marks', 'market', 'today', "'", 'allocation', '.', 'a', 'bundesbank', 'spokesman', 'said', 'answer', 'enquiries', 'withdrawal', 'funds', 'reflect', 'tightening', 'credit', 'policy', ',', 'seen', 'context', 'plentiful', 'liquidity', 'banking', 'system', '.', 'banks', 'held', 'average', '59', '.', '3

#### (b) Create co-occurrence matrix

In [4]:
def distinct_words(corpus):
    """ 
    get a list of distinct words for the corpus.

    Params:
        corpus (list[list[string]]): corpus of documents
    
    Return:
        corpus_words (list[string]): list of distinct words across the corpus, sorted (using python 'sorted' function) num_corpus_words (integer): number of distinct words across the corpus
    """

    corpus_words = set()  
     
    ####################################### Added by Rimmon #######################################
    # collecting all the unique words from the entire corpus using the set() data-structure
    [corpus_words.add(x) for y in corpus for x in y]
    # getting the number of unique words
    num_corpus_words = len(corpus_words)
    
    return list(sorted(corpus_words)), num_corpus_words
    ####################################### Added by Rimmon #######################################

words, num_words = distinct_words(reuters_corpus)
print(num_words,words)



In [5]:
def compute_co_occurrence_matrix(corpus, window_size=7):
    """ 
    Compute co-occurrence matrix for the given corpus and window_size (default of 7).    
    
    Params:
        corpus (list of list of strings): corpus of documents
        window_size (int): size of context window
    Return:
        M (numpy matrix of shape = [number of corpus words x number of corpus words]): 
            Co-occurence matrix of word counts. 
            The ordering of the words in the rows/columns should be the same as the ordering of the words given by the distinct_words function.
        word2Ind (dict): dictionary that maps word to index (i.e. row/column number) for matrix M.
    """

    ####################################### Added by Rimmon #######################################
    # get the distinct words list from the entire corpus
    words, num_words = distinct_words(reuters_corpus)

    # inirtializing the co-occurance matrix
    M = np.zeros((num_words,num_words))

    # creating a dictionary mapping unique words to indices
    word2Ind = {w:i for i,w in enumerate(words)}

    # iterate over all the documents in the corpus
    for document in corpus:
        #iterate over all the words in the document
        for i, w in enumerate(document):
            # get the word index
            word_index = word2Ind[w]
            
            # get the words in the context of the current word within the window_size range
            start = max(0, i - window_size)
            end = min(len(document), i + window_size + 1)
            context = document[start:i] + document[i + 1:end]
            
            # update (add 1) the co-occurance matrix values for the words in the context of the current word
            for context_word in context:
                context_index = word2Ind[context_word]
                M[word_index, context_index] += 1

    ####################################### Added by Rimmon #######################################

    return M, word2Ind

M, word2Ind = compute_co_occurrence_matrix(reuters_corpus)
ind2word = {value:key for key,value in word2Ind.items()}

#### (c) SVD

In [6]:
####################################### Added by Rimmon #######################################
# Run SVD
# Note: This may take several minutes
svd = TruncatedSVD(n_components=75,n_iter=100)
embeddings = svd.fit_transform(M)
####################################### Added by Rimmon #######################################

#### (d) Word2Vec

In [7]:
####################################### Added by Rimmon #######################################
# Creating the model and setting values for the various parameters

# Initializing the train model
vector_size = 75  
window_size = 7    
min_count = 1     
W2V_model = word2vec.Word2Vec(sentences=reuters_corpus, vector_size=vector_size, window=window_size, min_count=min_count)
####################################### Added by Rimmon #######################################

#### (d) Compare SVD word embeddings with Word2Vec

In [8]:

####################################### Added by Rimmon #######################################
def svd_most_similar(query_word,embeddings,word2Ind,ind2word, n=10):
    """ 
    return 'n' most similar words of a query word using the SVD word embeddings similar to word2vec's most_smilar    
    
    Params:
        query_word (strings): a query word
        embeddings (ndarray): embeddings obtained after applying SVD to the co-occurance matrix
        word2Ind (dict): mapping between words and their indices
        ind2word (dict): reverse mapping of word2Ind
    Return:
        most_similar (list of strings): the list of 'n' most similar words
    """
    # converting the query word to lowercase
    query_word = query_word.lower()

    # if the word is not in the vocabulary, return None
    if query_word not in word2Ind.keys():
        print("Word not in the vocabulary!")
        return ["Word not in the vocabulary!"]

    # getting the index of the word
    query_ind = word2Ind[query_word]

    # retriving the embedding for the query_word
    query_embedding = embeddings[query_ind].reshape(1, -1)

    # calculating the cosine similarity between the given word and every other owrd in the vocabulary
    cosine_similarities = cosine_similarity(query_embedding, embeddings)
    
    # getting the list of similarity scores and the corresponding indices sorted according to the similarity score. Most similar words appear earlier.
    similarity_scores = np.sort(cosine_similarities)
    similarity_scores_indices = np.argsort(-cosine_similarities)

    # picking the top n words from the 
    top_n_similar_matches = similarity_scores_indices[0][1:n+1]

    # getting the most similar words using the reverese mapping in ind2word
    most_similar = [(ind2word[index],similarity_scores[0][index]/similarity_scores[0][query_ind]) for index in top_n_similar_matches]

    return most_similar
####################################### Added by Rimmon #######################################

## SVD vs Word2Vec: "???"

In [9]:
####################################### Added by Rimmon #######################################
num_matches = 5
query_words = ["morning","grow","tea","research","money"]
for word in query_words:
    svd_top_words = svd_most_similar(word,embeddings,word2Ind,ind2word,num_matches)
    w2v_top_words = W2V_model.wv.most_similar(word,topn=num_matches)
    print("\n\n")
    print(f"SVD's top {num_matches} matching words to \"{word}\"")
    print(svd_top_words)
    print(f"W2V's top {num_matches} matching words to \"{word}\"")
    print(w2v_top_words)
    print("\n\n")
####################################### Added by Rimmon #######################################






SVD's top 5 matching words to "morning"
[('today', 1.2169802379698085), ('at', 0.5609840957089377), ('late', 0.9605986572538985), ('early', 0.8184796072224603), ('earlier', 0.8184280224233537)]
W2V's top 5 matching words to "morning"
[('liquidity', 0.9966598749160767), ('afternoon', 0.9953506588935852), ('today', 0.9952619075775146), ('total', 0.9940673112869263), ('outflows', 0.9929744601249695)]






SVD's top 5 matching words to "grow"
[('imports', 1.0327460865634785), ('exports', 0.9371520811213614), ('risen', 1.3046619450951003), ('over', 1.1882730649207276), ('cost', 0.8054983988863252)]
W2V's top 5 matching words to "grow"
[('months', 0.9969308972358704), ('next', 0.9964560270309448), ('terms', 0.9962108135223389), ('index', 0.9962030053138733), ('ago', 0.9960575103759766)]






SVD's top 5 matching words to "tea"
[('engaging', 0.37093315844439345), ('societe', 0.8853776869497355), ('osman', 0.6618836757063716), ('wertheim', 1.233679023194907), ('mohammed', 0.62283368089433