# Pipeline for language cleaning

In [None]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from multiprocessing import Pool
import shutil
import glob
import re
import os
import io

### Deleting translations from corpus

In [None]:
def delete_translations_from_corpus(lang):
    """Copy the texts listed for ``lang`` into a separate "clean" corpus directory.

    The column named ``lang`` of ``language_cleaning/translations_list.xlsx``
    is read as the list of numeric file ids to keep (presumably the texts that
    are not translations -- confirm against the spreadsheet).  Every
    ``<id>.txt`` is copied from ``data/{lang}/{lang}_corpus_processed`` to
    ``data/{lang}/{lang}_corpus_processed_clean``.  The target directory is
    created if needed and reused if it already exists.

    Args:
        lang: language name; used both as the spreadsheet column and as the
            directory name under ``data/``.

    Raises:
        AssertionError: if the number of listed ids differs from the number
            of them found as ``.txt`` files in the processed corpus (missing
            files or duplicated ids).
    """
    source_dir = f'data/{lang}/{lang}_corpus_processed'
    target_dir = f'data/{lang}/{lang}_corpus_processed_clean'

    data = pd.read_excel('language_cleaning/translations_list.xlsx', header=0, index_col=None)
    fnames_to_keep = data[lang].values.astype(str)
    # Empty cells stringify to 'nan'; the float -> int -> str chain turns
    # values such as '12.0' into '12'.
    fnames_to_keep = fnames_to_keep[fnames_to_keep != 'nan'].astype(float).astype(int).astype(str)

    # Only '.txt' files can be copied below, so only they count as available.
    available = {fname[:-4] for fname in os.listdir(source_dir) if fname.endswith('.txt')}
    found = available.intersection(fnames_to_keep)
    assert fnames_to_keep.shape[0] == len(found), (
        f'{fnames_to_keep.shape[0]} ids listed for {lang}, but only {len(found)} distinct '
        f'ones exist in {source_dir}; missing: {sorted(set(fnames_to_keep) - available)}'
    )

    # exist_ok=True so that re-running the pipeline does not fail here.
    os.makedirs(target_dir, exist_ok=True)

    for fname_to_keep in fnames_to_keep:
        shutil.copy2(
            f'{source_dir}/{fname_to_keep}.txt',
            f'{target_dir}/{fname_to_keep}.txt'
        )

### TF-IDF matrix computation

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def make_tf_idf_matrix(lang):
    """Build and save the TF-IDF matrix of the clean corpus of ``lang``.

    All files of ``data/{lang}/{lang}_corpus_processed_clean`` are joined with
    newlines into ``data/{lang}/{lang}_corpus_processed_clean.txt``.  The
    vectorizer is then fitted on the lines of that file, so every line is
    treated as one document.  The result is stored in
    ``data/{lang}/matrices/TF-IDF.npy`` as an object array
    ``(matrix, feature_names)``.

    Args:
        lang: language name used as the directory name under ``data/``.
    """
    corpus_dir = f'data/{lang}/{lang}_corpus_processed_clean'
    corpus_path = f'data/{lang}/{lang}_corpus_processed_clean.txt'

    # Sorted so that the concatenated corpus (and the row order of the matrix)
    # does not depend on the arbitrary order returned by os.listdir.
    texts = []
    for fname in sorted(os.listdir(corpus_dir)):
        # utf-8 matches the encoding create_CBoW_dictionary uses to read the corpus.
        with open(f'{corpus_dir}/{fname}', encoding='utf-8') as fin:
            texts.append(fin.read())

    with open(corpus_path, 'w', encoding='utf-8') as fout:
        fout.write('\n'.join(texts))

    vectorizer = TfidfVectorizer(analyzer='word', min_df=3, token_pattern='(?u)\\b\\w+\\b', lowercase=True)
    # Iterating the file handle feeds the vectorizer one line at a time.
    with open(corpus_path, 'r', encoding='utf-8') as corpus:
        data_vectorized = vectorizer.fit_transform(corpus)

    np.save(f'data/{lang}/matrices/TF-IDF.npy', np.asarray(
        (data_vectorized, vectorizer.get_feature_names_out()), dtype=object
    ))

### Garbage words via median neighbours popularity (MNP)

In [None]:
def stat_count(args):
    """Compute the median and mean popularity of the neighbours of one word.

    ``args`` is a single tuple ``(word_index, n_text, x, y)`` so that the
    function can be used with ``Pool.map``:

    * ``word_index`` -- column index of the word being examined;
    * ``n_text`` -- array indexed by column, holding a value per word;
    * ``x``, ``y`` -- parallel arrays of row and column indices of the
      non-zero matrix entries.

    A neighbour is any other column that has an entry in a row where
    ``word_index`` has one.  A neighbour is repeated once for every such
    shared entry.

    Returns:
        ``(word_index, (median, mean))`` of ``n_text`` over the neighbours.
    """
    word_index, n_text, x, y = args

    is_target = (y == word_index)
    rows_with_word = x[is_target]

    # Entries that sit in one of those rows but belong to a different column.
    in_shared_row = np.isin(x, rows_with_word)
    neighbour_columns = y[in_shared_row & ~is_target]
    neighbour_values = n_text[neighbour_columns]

    return word_index, (np.median(neighbour_values), np.mean(neighbour_values))

def calc_neighbours_median_popularity(lang, step=5000, processes=8):
    """Compute neighbour statistics for every word and save them chunk by chunk.

    Loads the matrix from ``data/{lang}/matrices/TF-IDF.npy``, sums it over
    rows to get one value per column (word), and runs ``stat_count`` for each
    column that has a non-zero entry.  Results are written to
    ``data/{lang}/neighbours/{start}_{finish}.npy``, one file per chunk of
    ``step`` words.  A chunk whose file already exists is skipped, so an
    interrupted run can be resumed.

    Args:
        lang: language name used as the directory name under ``data/``.
        step: number of words per chunk file.  Keep it unchanged between
            resumed runs, since it is part of the file names.
        processes: number of worker processes in the pool.
    """
    tf_idf_matrix = np.load(f'data/{lang}/matrices/TF-IDF.npy', allow_pickle=True)[0]
    # Column sums of the matrix: one value per word.
    n_text = np.array(np.sum(tf_idf_matrix, axis=0))[0]

    x, y = tf_idf_matrix.nonzero()
    checking = sorted(np.unique(y))

    for start in range(0, len(checking), step):
        finish = min(len(checking), start + step)
        chunk_path = f'data/{lang}/neighbours/{start}_{finish}.npy'
        if os.path.exists(chunk_path):
            continue

        tasks = [(word_index, n_text, x, y) for word_index in checking[start:finish]]
        # The context manager shuts the workers down even if map() raises.
        with Pool(processes=processes) as pool:
            result = pool.map(stat_count, tasks)

        np.save(chunk_path, np.asarray(result, dtype=object))

In [None]:
def select_words_by_neighbours_median(lang, fraction=0.1):
    """Return the words whose neighbours have the lowest median popularity.

    Reads every chunk file from ``data/{lang}/neighbours`` in order of its
    starting index, checks that the word indices form the sequence
    ``0, 1, 2, ...`` and collects the ``(median, mean)`` pair stored for each
    word.  The words from ``data/{lang}/matrices/TF-IDF.npy`` are then ordered
    by ascending median.

    Args:
        lang: language name used as the directory name under ``data/``.
        fraction: share of the vocabulary to return (default 10%).

    Returns:
        Array with the first ``int(len(words) * fraction)`` words of that
        ordering.

    Raises:
        AssertionError: if the chunk files do not cover consecutive word
            indices starting at 0.
    """
    def start_index(path):
        # os.path.basename handles both '/' and the platform separator that
        # glob may return; the first run of digits is the chunk start.
        return int(re.search('[0123456789]+', os.path.basename(path)).group())

    chunk_paths = sorted(glob.glob(f'data/{lang}/neighbours/*'), key=start_index)

    stat, word_index = [], 0
    for chunk_path in chunk_paths:
        for word_data in np.load(chunk_path, allow_pickle=True):
            assert word_index == word_data[0], (
                f'expected word index {word_index} but found {word_data[0]} in {chunk_path}'
            )
            stat.append(word_data[1])
            word_index += 1
    stat = np.array(stat)

    words = np.load(f'data/{lang}/matrices/TF-IDF.npy', allow_pickle=True)[1]
    # Column 0 of stat is the median; column 1 (the mean) is not used here.
    return words[np.argsort(stat[:, 0])][:int(len(words) * fraction)]

### Garbage words via matrix reduction (MR)

In [None]:
def reduce_matrix(lang, alpha=0.6):
    """Return the words that occur in few documents.

    For every column of the matrix stored in
    ``data/{lang}/matrices/TF-IDF.npy`` the number of positive entries is
    counted.  Words whose count does not exceed the ``alpha``-quantile of
    those counts are returned.

    Args:
        lang: language name used as the directory name under ``data/``.
        alpha: quantile level in ``[0, 1]`` used as the threshold.

    Returns:
        Array of the selected words, in their original order.
    """
    loaded = np.load(f'data/{lang}/matrices/TF-IDF.npy', allow_pickle=True)
    tf_idf_matrix = loaded[0]
    words = loaded[1]

    # The column-wise sum comes back two-dimensional; row 0 holds the counts.
    positive_per_column = np.sum(tf_idf_matrix > 0, axis=0)
    non_zero_counts = np.asarray(positive_per_column)[0]

    threshold = np.quantile(non_zero_counts, alpha)
    is_rare = non_zero_counts <= threshold

    return words[is_rare]

### SVD computation

In [None]:
from scipy.sparse.linalg import svds

def compute_SVD(lang, k_max=1000):
    """Compute a truncated SVD of the clean TF-IDF matrix and save its factors.

    The matrix is loaded from ``data/{lang}/matrices/TF-IDF_clean.npy``.  The
    rank ``k`` is ``k_max`` capped at ``min(matrix.shape) - 1``.  Four files
    named ``{k}_sigma_vt.npy``, ``{k}_sigma.npy``, ``{k}_u.npy`` and
    ``{k}_vt.npy`` are written to ``data/{lang}/matrices``; ``sigma_vt`` is
    ``(diag(sigma) @ vt).T``.

    Args:
        lang: language name used as the directory name under ``data/``.
        k_max: upper bound for the number of singular values to compute.
    """
    tf_idf_matrix = np.load(f'data/{lang}/matrices/TF-IDF_clean.npy', allow_pickle=True)[0]
    n_rows, n_cols = tf_idf_matrix.shape

    # svds requires 0 < k < min(A.shape).
    k = min(k_max, min(n_rows, n_cols) - 1)
    u, sigma, vt = svds(tf_idf_matrix, k)

    # Reorder the factors so that the singular values are descending
    # (svds does not guarantee this).
    order = np.argsort(sigma)[::-1]
    u, sigma, vt = u[:, order], sigma[order], vt[order]

    # Sanity-check the shapes of the factors.
    assert sigma.shape == (k,)
    assert vt.shape == (k, n_cols)
    assert u.shape == (n_rows, k)

    # Collect everything to be saved, keyed by the suffix of its file name.
    outputs = {
        'sigma_vt': np.dot(np.diag(sigma), vt).T,
        'sigma': sigma,
        'u': u,
        'vt': vt,
    }

    for matrix_name, matrix in outputs.items():
        with open(f'data/{lang}/matrices/{k}_{matrix_name}.npy', 'wb') as f:
            np.save(f, matrix)

### SVD embeddings computation

In [None]:
def create_SVD_dictionary(lang):
    """Save a ``word -> vector`` dictionary built from the SVD factors.

    Words come from ``data/{lang}/matrices/TF-IDF_clean.npy``.  Vectors are
    the rows of the first ``*_sigma_vt.npy`` file that glob finds in
    ``data/{lang}/matrices``.  The dictionary is written to
    ``data/{lang}/{lang}_dict_SVD_{dim}.npy``, where ``dim`` is the number of
    columns of that matrix.

    Args:
        lang: language name used as the directory name under ``data/``.
    """
    words = np.load(f'data/{lang}/matrices/TF-IDF_clean.npy', allow_pickle=True)[1]

    sigma_vt_path = glob.glob(f'data/{lang}/matrices/*_sigma_vt.npy')[0]
    sigma_vt = np.load(sigma_vt_path, allow_pickle=True)

    # Pair the i-th word with the i-th row of the matrix.
    dictionary = dict(zip(words, sigma_vt))

    np.save(f'data/{lang}/{lang}_dict_SVD_{sigma_vt.shape[1]}.npy', dictionary)

### CBoW embeddings computation

In [None]:
def create_CBoW_dictionary(lang, k=100):
    """Train Word2Vec on the clean corpus and save a ``word -> vector`` dictionary.

    Each line of ``data/{lang}/{lang}_corpus_processed_clean.txt`` (the
    concatenated corpus file written by ``make_tf_idf_matrix``) is split on
    whitespace and used as one training sentence.  The resulting dictionary
    is written to ``data/{lang}/{lang}_dict_CBoW_{k}.npy``.

    Args:
        lang: language name used as the directory name under ``data/``.
        k: dimensionality of the word vectors.
    """
    # The path must include '.txt': without the extension it names the
    # directory of per-text files, which cannot be opened for reading.
    with io.open(
        f'data/{lang}/{lang}_corpus_processed_clean.txt', 'r',
        encoding='utf-8', newline='\n', errors='ignore'
    ) as fin:
        documents = [line.split() for line in fin]

    # NOTE(review): the training algorithm is left at the gensim default,
    # which is presumably CBOW as the function name suggests -- confirm.
    model = Word2Vec(sentences=documents, vector_size=k, min_count=1)
    dictionary = {key: model.wv[key] for key in model.wv.key_to_index}

    np.save(f'data/{lang}/{lang}_dict_CBoW_{k}.npy', dictionary)

### Language cleaning

In [None]:
def clean_language(lang):
    """Run the whole cleaning pipeline for one language.

    Steps, in order:

    1. copy the texts to keep into the clean corpus directory;
    2. compute the TF-IDF matrix of the clean corpus;
    3. detect garbage words with two methods (neighbours' median popularity
       and matrix reduction) and write their union to
       ``data/{lang}/{lang}_garbage_words.txt``;
    4. write the remaining words to ``data/{lang}/{lang}_good_words.txt`` and
       save the TF-IDF matrix restricted to them as
       ``data/{lang}/matrices/TF-IDF_clean.npy``;
    5. compute the SVD and build the SVD and CBoW dictionaries.

    The ``matrices`` and ``neighbours`` directories are reused if they already
    exist.

    Args:
        lang: language name used as the directory name under ``data/``.
    """
    # Deleting translations from corpus:
    delete_translations_from_corpus(lang=lang)

    # Computing TF-IDF matrix:
    os.makedirs(f'data/{lang}/matrices', exist_ok=True)
    make_tf_idf_matrix(lang=lang)

    # Computing garbage words with 2 methods.  exist_ok=True keeps chunk files
    # from an earlier run, which calc_neighbours_median_popularity skips.
    os.makedirs(f'data/{lang}/neighbours', exist_ok=True)
    calc_neighbours_median_popularity(lang=lang)
    garbage_words_neighs_median = select_words_by_neighbours_median(lang=lang)
    garbage_words_matrix_reduction = reduce_matrix(lang=lang)

    garbage_words = set(garbage_words_neighs_median).union(garbage_words_matrix_reduction)
    # utf-8 matches the encoding create_CBoW_dictionary uses for the corpus.
    with open(f'data/{lang}/{lang}_garbage_words.txt', 'w', encoding='utf-8') as fout:
        fout.write('\n'.join(sorted(garbage_words)))

    # Writing good words:
    data = np.load(f'data/{lang}/matrices/TF-IDF.npy', allow_pickle=True)
    tf_idf_matrix, words = data[0], data[1]

    clean_words = sorted(set(words).difference(garbage_words))
    with open(f'data/{lang}/{lang}_good_words.txt', 'w', encoding='utf-8') as fout:
        fout.write('\n'.join(clean_words))

    # Computing clean TF-IDF matrix.  A word -> first-position lookup replaces
    # a list.index call per word, which was quadratic in the vocabulary size.
    position_of = {}
    for position, word in enumerate(words):
        position_of.setdefault(word, position)
    kept_indicies = np.array([position_of[clean_word] for clean_word in clean_words], dtype=int)

    # The kept columns must stay in their original (increasing) order.
    assert np.all(np.diff(kept_indicies) > 0), 'kept word indices are not strictly increasing'

    np.save(f'data/{lang}/matrices/TF-IDF_clean.npy', np.asarray(
        (tf_idf_matrix[:, kept_indicies], words[kept_indicies]), dtype=object
    ))

    # Computing SVD and creating dictionaries:
    compute_SVD(lang=lang)
    create_SVD_dictionary(lang=lang)
    create_CBoW_dictionary(lang=lang, k=8)

In [None]:
# Run the cleaning pipeline for the 'Russian' corpus (reads and writes under data/Russian).
clean_language('Russian')