In [2]:
import glob
import tqdm
def make_corpus(input_path, output_file_path, encoding='utf-8'):
    '''
    Merge the files found directly under a folder into a one-document-per-line corpus.

    input_path - folder whose entries (matched by input_path + '/*') are the documents
    output_file_path - corpus file to (over)write; line i holds the i-th file in
                       sorted path order, with its newlines replaced by spaces
    encoding - text encoding used for both reading and writing; it is explicit
               (UTF-8 by default) so the result does not depend on the machine's locale
    '''
    import os  # local import: the surrounding import lines do not provide `os`

    # Sub-directories also match the glob but cannot be opened as text, so keep regular files only.
    file_list = sorted(path for path in glob.glob(input_path + '/*') if os.path.isfile(path))
    with open(output_file_path, 'w', encoding=encoding) as output_file:
        for file in tqdm.tqdm(file_list):
            with open(file, 'r', encoding=encoding) as input_file:
                # Flatten the document so that it occupies exactly one line of the corpus.
                output_file.write(input_file.read().replace('\n', ' '))
                output_file.write('\n')

from sklearn.feature_extraction.text import TfidfVectorizer
def make_matrix_W_list_of_words(corpus_path, min_df, max_df=None, token_pattern = None, use_idf = True, encoding='utf-8'):
    '''
    Build the TF-IDF matrix of a one-document-per-line corpus file.

    corpus_path - text file in which every line is one document
    min_df - forwarded to TfidfVectorizer
    max_df - forwarded to TfidfVectorizer when given; None keeps the vectorizer's default
    token_pattern - regular expression describing a token; a falsy value keeps
                    the vectorizer's default pattern
    use_idf - forwarded to TfidfVectorizer
    encoding - encoding used to read corpus_path; explicit so that reading
               does not depend on the machine's locale

    Returns (W, words): the matrix produced by fit_transform and the
    vectorizer's feature names (get_feature_names_out()).
    '''
    vectorizer_options = {'analyzer': 'word', 'min_df': min_df, 'use_idf': use_idf}
    if max_df is not None:
        # Only override the vectorizer's default when the caller asked for a bound.
        vectorizer_options['max_df'] = max_df
    if token_pattern:
        vectorizer_options['token_pattern'] = token_pattern
    vectorizer = TfidfVectorizer(**vectorizer_options)

    with open(corpus_path, 'r', encoding=encoding) as corpus_file:
        # Iterating a text file yields its lines, so each corpus line is fed in as one document.
        data_vectorized = vectorizer.fit_transform(corpus_file)
    return data_vectorized, vectorizer.get_feature_names_out()

In [8]:
# Collapse the 'stemmed_hebrew_literature' folder into a single corpus file.
make_corpus(
    input_path='stemmed_hebrew_literature',
    output_file_path='corpus_hebrew_literature_stem.txt',
)

100%|█████████████████████████████████████| 6494/6494 [00:03<00:00, 2145.50it/s]


In [3]:
# Regular expression for a token: one or more consecutive Hebrew letters
# (the five final forms are listed at the end of the character class).
# Without the brackets the string would only match that exact 27-letter sequence.
token_pattern = '[אבגדהוזחטיכלמנסעפצקרשתךםןףץ]+'

In [11]:
# TF-IDF matrix and vocabulary of the stemmed corpus; min_df=1 is passed explicitly.
W, words_list = make_matrix_W_list_of_words(
    corpus_path='corpus_hebrew_literature_stem.txt',
    min_df=1,
)

In [12]:
# Dimensions of the matrix W held in the kernel (displayed as the cell output).
W.shape

(6494, 795170)

In [4]:
# Build the corpus from the 'cleaned' folder and vectorize it.
# NOTE(review): this rebinds W and words_list, and the recorded shape is identical to the
# one recorded for the stemmed corpus — confirm both outputs come from a fresh, in-order run.
make_corpus(input_path='cleaned', output_file_path='corpus_dirty_hebrew.txt')
W, words_list = make_matrix_W_list_of_words(corpus_path='corpus_dirty_hebrew.txt', min_df=1)
W.shape

100%|█████████████████████████████████████| 6494/6494 [00:03<00:00, 1624.56it/s]


(6494, 795170)

In [6]:
from scipy.sparse.linalg import svds
import numpy as np

In [7]:
def apply_svd(W, k, output_folder):
    '''
    Compute a rank-k truncated SVD of W and store its factors as .npy files.

    W - matrix texts x words
    k - the rank of the SVD, must be less than any dimension of W
    output_folder - folder that receives '<k>_sigma_vt.npy', '<k>_sigma.npy',
                    '<k>_u.npy' and '<k>_vt.npy'; it is created when missing

    Returns the (words x k) array equal to (diag(sigma) @ vt).T, with the
    components ordered by decreasing singular value.

    Raises ValueError when k is not in the range 1 <= k < min(W.shape).
    '''
    import os  # local import: the notebook's import cells do not provide `os`

    if not 1 <= k < min(W.shape):
        raise ValueError(
            'k must satisfy 1 <= k < min(W.shape); got k=%r for W of shape %r' % (k, W.shape)
        )
    # Prepare the destination before the expensive decomposition, so a missing
    # folder cannot make the computed factors unsaveable.
    os.makedirs(output_folder, exist_ok=True)

    u, sigma, vt = svds(W, k=k)

    # Reorder the components by decreasing singular value.
    descending_order_of_inds = np.argsort(sigma)[::-1]
    u = u[:, descending_order_of_inds]
    vt = vt[descending_order_of_inds]
    sigma = sigma[descending_order_of_inds]

    assert sigma.shape == (k,)
    assert vt.shape == (k, W.shape[1])
    assert u.shape == (W.shape[0], k)

    # Scale column j of vt.T by sigma[j]; same values as (diag(sigma) @ vt).T
    # without building a k x k matrix, and computed only once.
    sigma_vt = vt.T * sigma

    prefix = os.path.join(output_folder, str(k))
    np.save(prefix + '_sigma_vt.npy', sigma_vt)
    np.save(prefix + '_sigma.npy', sigma)
    np.save(prefix + '_u.npy', u)
    np.save(prefix + '_vt.npy', vt)
    return sigma_vt

In [None]:
# Rank-1024 decomposition of W; the factors are written under output_folder.
# NOTE(review): '/svd_files' is an absolute path at the filesystem root — confirm
# this is intended rather than a folder relative to the notebook.
vv = apply_svd(W, k=1024, output_folder='/svd_files')