# TFIDF matrix

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
def make_matrix_W_list_of_words(corpus_path, min_df, max_df=None, token_pattern = None, use_idf = True):
    """Fit a word-level TF-IDF vectorizer on a corpus file.

    The open file object is handed directly to ``fit_transform``, so every
    line of the file is treated as one document.

    Parameters
    ----------
    corpus_path : str
        Path to the text corpus. The file is opened in text mode with the
        platform default encoding.
    min_df : int or float
        Forwarded to ``TfidfVectorizer(min_df=...)``.
    max_df : int, float or None, optional
        Forwarded to ``TfidfVectorizer(max_df=...)`` when given. ``None``
        (the default) keeps the vectorizer's own default.
    token_pattern : str or None, optional
        Regular expression describing a token. Any falsy value keeps the
        vectorizer's default pattern.
    use_idf : bool, optional
        Forwarded to ``TfidfVectorizer(use_idf=...)``.

    Returns
    -------
    tuple
        ``(data_vectorized, feature_names)``: the matrix returned by
        ``fit_transform`` and the array returned by
        ``get_feature_names_out()``.
    """
    # Collect the options once instead of duplicating the constructor call
    # for each combination of optional arguments.
    vectorizer_options = {'analyzer': 'word', 'min_df': min_df, 'use_idf': use_idf}
    if max_df is not None:
        # Previously this argument was accepted but silently ignored.
        vectorizer_options['max_df'] = max_df
    if token_pattern:
        vectorizer_options['token_pattern'] = token_pattern
    vectorizer = TfidfVectorizer(**vectorizer_options)

    with open(corpus_path, 'r') as corpus_file:
        data_vectorized = vectorizer.fit_transform(corpus_file)
    return data_vectorized, vectorizer.get_feature_names_out()

In [31]:
# Build the TF-IDF matrix W and the matching vocabulary from the corpus file.
# min_df=1 keeps every term that occurs in at least one line of the corpus.
W, words_list  = make_matrix_W_list_of_words('origin_corpus_oss.txt', 1)

In [32]:
# (number of corpus lines, number of vocabulary terms)
W.shape

(1201, 866485)

# SVD

In [2]:
from scipy.sparse.linalg import svds
import numpy as np

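The code below computes the rank-k truncated SVD of a matrix, saves all the resulting matrices in the given folder, and returns the right singular vectors scaled by their singular values (one row per column of the input matrix).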

In [34]:
def apply_svd(W, k, output_folder):
    """Compute a rank-k truncated SVD of W, save the factors, and return the
    right singular vectors scaled by their singular values.

    Parameters
    ----------
    W : array or sparse matrix accepted by ``scipy.sparse.linalg.svds``
        Matrix to decompose.
    k : int
        Number of singular values/vectors to compute.
    output_folder : str
        Folder that receives the ``.npy`` files. It is created if it does
        not exist yet.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(W.shape[1], k)`` equal to ``(diag(sigma) @ vt).T``,
        i.e. one k-dimensional row per column of ``W``.

    Side effects
    ------------
    Writes four files into ``output_folder``: ``<k>_sigma_vt.npy`` (the
    returned array), ``<k>_sigma.npy``, ``<k>_u.npy`` and ``<k>_vt.npy``.
    """
    import os  # local import keeps this cell self-contained

    # Apply the SVD function
    u, sigma, vt = svds(W, k)

    # svds does not guarantee that the singular values come out in descending
    # order, so reorder all three factors consistently by hand.
    descending_order_of_inds = np.argsort(sigma)[::-1]
    u = u[:, descending_order_of_inds]
    sigma = sigma[descending_order_of_inds]
    vt = vt[descending_order_of_inds]

    # Check that the sizes are as expected
    assert sigma.shape == (k,)
    assert vt.shape == (k, W.shape[1])
    assert u.shape == (W.shape[0], k)

    # Scale row i of vt by sigma[i]. This equals diag(sigma) @ vt but avoids
    # building the dense k x k diagonal matrix, and is computed only once.
    sigma_vt_transposed = (sigma[:, np.newaxis] * vt).T

    # Make sure the destination exists before writing. An empty string is
    # left alone so the path built below is unchanged.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Save all the matrices in the folder (just in case)
    arrays_by_suffix = {
        '_sigma_vt.npy': sigma_vt_transposed,
        '_sigma.npy': sigma,
        '_u.npy': u,
        '_vt.npy': vt,
    }
    for suffix, array in arrays_by_suffix.items():
        with open(output_folder + '/' + str(k) + suffix, 'wb') as f:
            np.save(f, array)

    return sigma_vt_transposed

In [35]:
# Rank-20 truncated SVD of W. vv has one 20-dimensional row per vocabulary
# term; the factor matrices are also written to the current directory.
vv = apply_svd(W, 20, '.')

In [36]:
def create_dictionary(words_list, vv, output_file):
    """Map each word to its vector, save the mapping, and return it.

    Parameters
    ----------
    words_list : iterable of str
        Words, in the same order as the rows of ``vv``.
    vv : iterable of vectors (e.g. a 2-D numpy array)
        One vector per word; iterating yields the vector for each word.
    output_file : str or file object
        Destination passed to ``np.save``.

    Returns
    -------
    dict
        ``{word: vector}`` for every word in ``words_list``.

    Raises
    ------
    ValueError
        If ``words_list`` and ``vv`` do not have the same number of entries.
        A plain ``zip`` would silently drop the surplus and save an
        incomplete dictionary.
    """
    words = list(words_list)
    vectors = list(vv)
    if len(words) != len(vectors):
        raise ValueError(
            'words_list has %d entries but vv has %d rows'
            % (len(words), len(vectors))
        )

    dictionary = dict(zip(words, vectors))
    # np.save stores the dict as a pickled 0-d object array; read it back with
    # np.load(path, allow_pickle=True).item().
    np.save(output_file, dictionary)
    return dictionary

In [37]:
# Pair every vocabulary term with its row of vv and persist the mapping to disk.
dictionary = create_dictionary(words_list, vv, 'oss_dictionary_svd_20.npy')

In [3]:
# The dictionary was written with np.save, which pickles it inside a 0-d object
# array: allow_pickle=True is required to read it and .item() unwraps the dict.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were produced by this notebook.
word2vec = np.load('oss_dictionary_svd_20.npy', allow_pickle=True).item()

In [11]:
from itertools import islice
# Peek at three entries (positions 20000-20002 in insertion order) instead of
# displaying the whole dictionary.
dict(islice(word2vec.items(), 20000, 20003))

{'æгкары': array([-8.62071721e-05,  1.16585695e-05, -6.71802157e-05,  5.43344483e-05,
         1.61263119e-05, -1.98442873e-06,  6.45240084e-05,  1.33016933e-04,
        -1.31998968e-05,  1.38609929e-04, -4.16983993e-05,  1.42978655e-04,
         2.18714099e-05,  2.03271663e-04,  8.78132652e-06,  1.96892200e-04,
         1.20474634e-04,  6.37786330e-05,  4.49281474e-05, -2.05907385e-04]),
 'æгкаты': array([-1.08221203e-03,  1.22046639e-04, -6.47724952e-04, -1.51051734e-03,
         3.90179106e-04, -3.52363590e-05,  4.03837431e-04, -4.15651615e-04,
         1.55965607e-04,  4.15703441e-04,  6.18815527e-04,  2.34908765e-04,
        -1.81547758e-04, -2.93622849e-04,  2.96508302e-04, -4.62957465e-04,
        -4.24050356e-04,  5.92263847e-04,  5.92230650e-04,  1.29227670e-06]),
 'æгкац': array([-1.13745089e-04, -9.85985350e-06, -6.68491018e-05,  7.40437151e-06,
        -3.20991358e-04,  3.91413307e-05,  1.69594971e-04,  1.38330419e-04,
        -6.65125885e-06, -2.00279173e-05, -2.20827544e-