# Models for language cleaning

In [None]:
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics import classification_report

import glob
import re
import os

In [None]:
# Load the reference list of Russian lexemes (one lexeme per line).
# The context manager closes the file as soon as it has been read.
with open('data/Russian_lexemes.txt', encoding='utf-8') as lexemes_file:
    # Empty lines (e.g. the one produced by a trailing newline) are skipped
    # so that the empty string is never counted as a lexeme.
    lexemes = {line for line in lexemes_file.read().split('\n') if line}
print(f'There are {len(lexemes)} lexemes in Russian')

There are 168620 lexemes in Russian


## 1. Matrix reduction (MR)

In [None]:
def reduce_matrix(matrix, alpha):
    """Split the columns of ``matrix`` by how many positive entries they hold.

    For every column the number of strictly positive entries is counted and
    the ``alpha``-quantile of these counts is used as a threshold.

    Args:
        matrix: 2-D matrix whose columns are to be split. Both inputs whose
            column sum is a flat ``(n,)`` array (plain ``ndarray``) and inputs
            whose column sum is a ``(1, n)`` matrix (``np.matrix``-like
            objects) are supported.
        alpha: quantile level passed to ``np.quantile``.

    Returns:
        A list ``[reduced_indexes, kept_indexes, quantile]`` where
        ``reduced_indexes`` are the columns whose count is at or below the
        quantile, ``kept_indexes`` are the columns strictly above it, and
        ``quantile`` is the threshold itself.
    """
    # ravel() flattens both a (1, n) matrix result and an already flat (n,)
    # result; plain ``[0]`` indexing only worked for the former.
    non_zero_counts = np.asarray(np.sum(matrix > 0, axis=0)).ravel()
    quantile = np.quantile(non_zero_counts, alpha)

    reduced_indexes = np.where(non_zero_counts <= quantile)[0]
    kept_indexes = np.where(non_zero_counts > quantile)[0]

    return [reduced_indexes, kept_indexes, quantile]

In [None]:
# Load the pickled TF-IDF bundle: its first entry is the matrix handed to
# reduce_matrix, its second entry is the word array indexed by column.
data = np.load('data/Russian/matrices/TF-IDF.npy', allow_pickle=True)
W = data[0]
words = data[1]
# Notebook display of the matrix dimensions.
W.shape

In [None]:
alphas = (0.05, 0.25, 0.4, 0.5, 0.55, 0.6, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975)

# Make sure the report directory exists before the report file is opened.
os.makedirs('language_cleaning/MR', exist_ok=True)

# The context manager guarantees the report is flushed and closed.
with open('language_cleaning/MR/MR_report.txt', 'w') as f:
    for alpha in alphas:
        # The split has to be computed first: the report line below prints
        # the quantile that belongs to this very alpha.
        reduced_indexes, kept_indexes, quantile = reduce_matrix(W, alpha)
        f.write(f'\nQuantile of {alpha} level: {quantile}\n')
        # exist_ok lets the cell be re-run without failing on existing folders.
        os.makedirs(f'language_cleaning/MR/{alpha}', exist_ok=True)

        for fname, indexes in zip(('words', 'rubbish'), (kept_indexes, reduced_indexes)):
            sample = words[indexes]

            with open(f'language_cleaning/MR/{alpha}/{fname}.txt', 'w') as sample_file:
                sample_file.write('\n'.join(sample))

            sample_share = round(100 * len(sample) / len(words), 1)
            lexemes_in_sample = len(lexemes & set(sample))
            # An empty sample has no meaningful lexeme share; report 0.0
            # instead of dividing by zero.
            lexemes_share = (
                round(100 * lexemes_in_sample / len(sample), 1) if len(sample) else 0.0
            )

            # sample size (its share of all words), number of lexemes in the
            # sample (its share of the sample)
            f.write(
                f'Length of {fname} list: {len(sample)} ({sample_share}%), {lexemes_in_sample} lexemes ({lexemes_share}%)\n'
            )

## 2. Median neighbour popularity (MNP)

In [None]:
def select_words_by_neighbours_median(lang, gamma, words):
    """Split ``words`` into a low-ranked and a high-ranked part.

    Every file under ``data/{lang}/neighbours/`` is loaded in the order of
    the first integer found in its file name. Each record in a file is read
    as ``(word_index, statistics)``; the indexes must run 0, 1, 2, ... across
    all files. Words are then ranked in ascending order of the first
    statistics column.

    NOTE(review): the function name suggests that the first statistics column
    is the median neighbour popularity -- confirm against the code that
    writes the neighbours files.

    Args:
        lang: language folder name inside ``data/``.
        gamma: share of ``len(words)`` that goes into the first (lowest
            ranked) part.
        words: numpy array of words, aligned with the record indexes.

    Returns:
        A list ``[lowest, rest]``: the ``int(len(words) * gamma)`` words with
        the smallest statistic, followed by all remaining words, both in
        ascending order of the statistic.

    Raises:
        AssertionError: if the record indexes are not consecutive.
    """
    fnames = np.array(glob.glob(f'data/{lang}/neighbours/*'))
    # os.path.basename copes with whichever path separator glob returns.
    start_indices = np.array([
        int(re.search(r'[0-9]+', os.path.basename(fname)).group())
        for fname in fnames
    ])

    stat, word_index = [], 0
    # Files are consumed one at a time, so only a single chunk is held in
    # memory while the statistics are collected.
    for fname in fnames[np.argsort(start_indices)]:
        for word_data in np.load(fname, allow_pickle=True):
            assert word_index == word_data[0], (
                f'expected record index {word_index}, got {word_data[0]} in {fname}'
            )
            stat.append(word_data[1])
            word_index += 1
    stat = np.array(stat)

    # Rank once and slice twice instead of sorting for each returned part.
    ranked_words = words[np.argsort(stat[:, 0])]
    cut = int(len(words) * gamma)

    return [ranked_words[:cut], ranked_words[cut:]]

In [None]:
# Only the word array (second entry of the pickled TF-IDF bundle) is needed
# for this section.
tfidf_path = 'data/Russian/matrices/TF-IDF.npy'
words = np.load(tfidf_path, allow_pickle=True)[1]

In [None]:
gammas = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)

# Make sure the report directory exists before the report file is opened.
os.makedirs('language_cleaning/MNP', exist_ok=True)

# The context manager guarantees the report is flushed and closed.
with open('language_cleaning/MNP/MNP_report.txt', 'w') as f:
    for gamma in gammas:
        f.write(f'\ngamma = {gamma}\n')
        # exist_ok lets the cell be re-run without failing on existing folders.
        os.makedirs(f'language_cleaning/MNP/{gamma}', exist_ok=True)
        garbage_words, good_words = select_words_by_neighbours_median('Russian', gamma, words)

        for fname, sample in zip(('rubbish', 'words'), (garbage_words, good_words)):
            with open(f'language_cleaning/MNP/{gamma}/{fname}.txt', 'w') as sample_file:
                sample_file.write('\n'.join(sorted(sample)))

            sample_share = round(100 * len(sample) / len(words), 1)
            lexemes_in_sample = len(lexemes & set(sample))
            # An empty sample has no meaningful lexeme share; report 0.0
            # instead of dividing by zero.
            lexemes_share = (
                round(100 * lexemes_in_sample / len(sample), 1) if len(sample) else 0.0
            )

            # sample size (its share of all words), number of lexemes in the
            # sample (its share of the sample)
            f.write(
                f'Length of {fname} list: {len(sample)} ({sample_share}%), {lexemes_in_sample} lexemes ({lexemes_share}%)\n'
            )

## 3. DBSCAN

In [None]:
# The .npy file stores a single pickled dict (word -> embedding vector);
# .item() unwraps it from the 0-d object array returned by np.load.
embedding_dict = np.load(
    'data/Russian_dict_SVD_32.npy',
    allow_pickle=True,
).item()

# Word array and embedding matrix share the dict's key order, so row k of
# `embeddings` belongs to words[k].
words = np.array(list(embedding_dict))
embeddings = np.vstack([embedding_dict[token] for token in words])

print(
    f'There are {len(words)} words and {len(lexemes & set(words))} lexemes in dict'
)

There are 122667 words and 53588 lexemes in dict


In [None]:
epsilons = (
    0.001, 0.002, 0.003, 0.004, 0.005, 0.01, 0.02,
    0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1
)

# Cluster the embeddings once per epsilon and store every cluster as a text
# file named after its (shifted) label.
for epsilon in epsilons:
    dbscan = DBSCAN(eps=epsilon, n_jobs=-1).fit(embeddings)
    # DBSCAN labels noise as -1. A fixed shift of one always maps noise to
    # file 0 and real clusters to 1, 2, ...; subtracting the minimum label
    # would put the first real cluster into file 0 whenever a run produces
    # no noise at all.
    labels = dbscan.labels_ + 1

    # exist_ok lets the cell be re-run without failing on existing folders.
    os.makedirs(f'language_cleaning/DBSCAN/{epsilon}', exist_ok=True)
    for label in np.unique(labels):
        with open(f'language_cleaning/DBSCAN/{epsilon}/{label}.txt', 'w') as cluster_file:
            cluster_file.write('\n'.join(words[labels == label]))

In [None]:
dbscan_dir = 'language_cleaning/DBSCAN'

# Only sub-directories hold cluster files. Skipping plain files keeps a
# report written by a previous run from being treated as an epsilon folder.
epsilons = sorted(
    name for name in os.listdir(dbscan_dir)
    if os.path.isdir(os.path.join(dbscan_dir, name))
)

# The context manager guarantees the report is flushed and closed.
with open(f'{dbscan_dir}/DBSCAN_report.txt', 'w') as f:
    for epsilon in epsilons:
        f.write(f'\nepsilon = {epsilon}\n')
        # Cluster files are named '<label>.txt'; sort the labels numerically
        # so that, e.g., cluster 2 is listed before cluster 10.
        labels = sorted(
            (
                fname[:-4] for fname in os.listdir(f'{dbscan_dir}/{epsilon}')
                if fname.endswith('.txt') and fname[:-4].isdigit()
            ),
            key=int,
        )

        for label in labels:
            with open(f'{dbscan_dir}/{epsilon}/{label}.txt') as cluster_file:
                sample = cluster_file.read().split('\n')

            # One decimal place, as in the MR and MNP reports; integer
            # rounding would show small clusters as 0%.
            sample_share = round(100 * len(sample) / embeddings.shape[0], 1)
            lexemes_in_sample = len(lexemes & set(sample))
            lexemes_share = round(100 * lexemes_in_sample / len(sample), 1)

            # sample size (its share of all embedded words), number of
            # lexemes in the sample (its share of the sample)
            f.write(
                f'{label}: {len(sample)} words ({sample_share}%), {lexemes_in_sample} lexemes ({lexemes_share}%)\n'
            )

## 4. Integral model (pairwise intersections of the MR, MNP and DBSCAN rubbish lists)

In [None]:
# NOTE(review): the earlier sections load this file from 'data/Russian/...';
# confirm that 'languages/Russian/...' is the intended location.
words = np.load('languages/Russian/matrices/TF-IDF.npy', allow_pickle=True)[1]

# Rubbish lists selected from each of the three methods, in the same order
# as `models` below.
rubbish = []
for rubbish_path in ('MR/0.6/rubbish', 'MNP/0.1/rubbish', 'DBSCAN/0.002/0'):
    with open(f'language_cleaning/{rubbish_path}.txt') as rubbish_file:
        rubbish.append(rubbish_file.read().split('\n'))

models = ('matrix', 'neighs', 'dbscan')

# Both of these are the same for every pair of models, so build them once.
rubbish_sets = [set(word_list) for word_list in rubbish]
# Ground truth: 1 for words missing from the lexeme list, 0 for lexemes.
y_true = [int(word not in lexemes) for word in words]

# Evaluate every unordered pair of models: a word is predicted as rubbish
# only when both models of the pair flag it.
for i in range(len(models)):
    for j in range(i + 1, len(models)):
        curr_rubbish = rubbish_sets[i] & rubbish_sets[j]
        y_pred = [int(word in curr_rubbish) for word in words]

        print('\n\n' + models[i] + ' & ' + models[j])
        print(classification_report(y_true, y_pred))



matrix & neighs
              precision    recall  f1-score   support

           0       0.61      0.98      0.75     70120
           1       0.84      0.15      0.26     50803

    accuracy                           0.63    120923
   macro avg       0.73      0.56      0.51    120923
weighted avg       0.71      0.63      0.55    120923



matrix & dbscan
              precision    recall  f1-score   support

           0       0.58      0.95      0.72     70120
           1       0.47      0.06      0.11     50803

    accuracy                           0.58    120923
   macro avg       0.53      0.51      0.42    120923
weighted avg       0.54      0.58      0.47    120923



neighs & dbscan
              precision    recall  f1-score   support

           0       0.58      1.00      0.73     70120
           1       0.56      0.00      0.01     50803

    accuracy                           0.58    120923
   macro avg       0.57      0.50      0.37    120923
weighted avg       0