# TF-IDF matrix

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
def make_matrix_W_list_of_words(corpus_path, min_df, max_df=None, token_pattern = None, use_idf = True):
    """Fit a word-level TF-IDF vectorizer on a text file.

    The open file object is handed directly to ``fit_transform``, which
    iterates over it, so every line of the file is treated as one document.

    Args:
        corpus_path: path of the text corpus to read.
        min_df: forwarded to ``TfidfVectorizer(min_df=...)``.
        max_df: forwarded to ``TfidfVectorizer(max_df=...)`` when not None;
            when None the vectorizer's own default is used.
        token_pattern: optional token regex; when falsy the vectorizer's
            default pattern is used.
        use_idf: forwarded to ``TfidfVectorizer(use_idf=...)``.

    Returns:
        A tuple ``(matrix, feature_names)``: the result of ``fit_transform``
        on the corpus and the vectorizer's ``get_feature_names_out()``.
    """
    vectorizer_options = {'analyzer': 'word', 'min_df': min_df, 'use_idf': use_idf}
    # Only override the library defaults for options the caller actually set.
    # max_df used to be accepted by this function but never reached the vectorizer.
    if max_df is not None:
        vectorizer_options['max_df'] = max_df
    if token_pattern:
        vectorizer_options['token_pattern'] = token_pattern
    vectorizer = TfidfVectorizer(**vectorizer_options)

    # Keep the file open only for as long as the vectorizer is reading from it.
    with open(corpus_path, 'r') as corpus_file:
        data_vectorized = vectorizer.fit_transform(corpus_file)
    return data_vectorized, vectorizer.get_feature_names_out()

In [11]:
# Build the TF-IDF matrix and its vocabulary from the corpus file (min_df=1).
W, words_list = make_matrix_W_list_of_words('origin_corpus_ce.txt', min_df=1)

In [12]:
# Display the dimensions of the TF-IDF matrix as a sanity check before the SVD.
W.shape

(20254, 886273)

# SVD

In [13]:
from scipy.sparse.linalg import svds
import numpy as np

The function below computes the rank-k truncated SVD of `W` and saves the resulting matrices in the given folder.

In [14]:
def apply_svd(W, k, output_folder):
    """Compute a rank-k truncated SVD of W, save the factors and return sigma-scaled V.

    Args:
        W: matrix to decompose; passed unchanged to ``scipy.sparse.linalg.svds``.
        k: number of singular values/vectors to compute.
        output_folder: directory that receives the ``.npy`` files. It is not
            created here, so it must already exist.

    Returns:
        Array of shape ``(W.shape[1], k)``: the transpose of
        ``diag(sigma) @ vt``, i.e. one k-dimensional row per column of W,
        with singular values in descending order.

    Side effects:
        Writes ``<k>_sigma_vt.npy``, ``<k>_sigma.npy``, ``<k>_u.npy`` and
        ``<k>_vt.npy`` into ``output_folder``.
    """
    # Apply the SVD function
    u, sigma, vt = svds(W, k)

    # Reorder all three factors consistently so singular values are descending.
    descending_order_of_inds = np.flip(np.argsort(sigma))
    u = u[:,descending_order_of_inds]
    vt = vt[descending_order_of_inds]
    sigma = sigma[descending_order_of_inds]

    # Checking that sizes are ok
    assert sigma.shape == (k,)
    assert vt.shape == (k, W.shape[1])
    assert u.shape == (W.shape[0], k)

    # diag(sigma) @ vt scales row i of vt by sigma[i]; broadcasting does this
    # without building a dense k x k diagonal matrix. Compute it once and
    # reuse it for both the saved file and the return value.
    sigma_vt = (sigma[:, np.newaxis] * vt).T

    # Now, we'll save all the matrices in folder (just in case)
    with open(output_folder+'/' + str(k) + '_sigma_vt.npy', 'wb') as f:
        np.save(f, sigma_vt)
    with open(output_folder+'/' +  str(k) + '_sigma.npy', 'wb') as f:
        np.save(f, sigma)
    with open(output_folder+'/' +  str(k) + '_u.npy', 'wb') as f:
        np.save(f, u)
    with open(output_folder+'/' +  str(k) + '_vt.npy', 'wb') as f:
        np.save(f, vt)
    return sigma_vt

In [15]:
# Rank-20 decomposition; the factor matrices are written to the current directory.
vv = apply_svd(W, k=20, output_folder='.')

In [16]:
def create_dictionary(words_list, vv, output_file):
    """Pair each word with its vector, save the mapping and return it.

    Args:
        words_list: iterable of words.
        vv: iterable of vectors, aligned position-by-position with
            ``words_list``.
        output_file: destination handed to ``np.save``.

    Returns:
        dict mapping each word to the vector at the same position. If a word
        occurs more than once, the last vector wins.
    """
    word_to_vector = dict(zip(words_list, vv))
    # np.save stores the dict as a pickled object array; reading it back
    # needs np.load(..., allow_pickle=True).item().
    np.save(output_file, word_to_vector)
    return word_to_vector

In [17]:
# Map every vocabulary word to its 20-dimensional SVD vector and save the mapping to disk.
dictionary = create_dictionary(words_list, vv, output_file='ce_dictionary_svd_20.npy')