# Получение словаря языка Кечуа

### 1. Получение названий статей

In [None]:
import urllib.request as url
from bs4 import BeautifulSoup
import re
import time

In [None]:
def encode(string, code_dict):
    pattern = re.compile("|".join(map(re.escape, code_dict.keys())))
    return pattern.sub(lambda match: code_dict[match.group(0)], string)

In [None]:
f = open('drive/MyDrive/Quechua/links.txt', 'r')
links = f.read()
f.close()

qu_wiki_url = 'https://qu.wikipedia.org/w/index.php?title=Sapaq:TukuyPanqakuna&from='
links = encode(links, {qu_wiki_url: '', '\n': ' '})

f = open('drive/MyDrive/Quechua/sections.txt', 'w')
f.write(links)
f.close()

In [None]:
encoding_dict = {'ñ': '%C3%B1', 'ó': '%C3%B3', 'ø': '%C3%B8', 'á': '%C3%A1',
                 'é': '%C3%A9', 'í': '%C3%AD', 'ž': '%C5%BE', "\'": '%27',
                 '(': '%28', ')': '%29', ',': '%2C', ':': '%3A'}
articles_names = ''

f = open('drive/MyDrive/Quechua/sections.txt', 'r')
sections = encode(f.read(), encoding_dict).split(' ')
f.close()

for section in sections:
    f = url.urlopen(qu_wiki_url + section)
    articles = f.read()
    f.close()

    articles = BeautifulSoup(articles).get_text()
    articles = encode(articles, {**{'\n': '~', ' ': '_'}, **encoding_dict})
    articles = articles.split('~')[273:-40]
    articles[0] = section
    articles_names += ' '.join(articles)

    time.sleep(0.5)

f = open('drive/MyDrive/Quechua/articles_names.txt', 'w')
f.write(articles_names)
f.close()

articles_names = articles_names.split(' ')
print(f'Количество найденных статей: {len(articles_names)}')

Количество найденных статей: 41566


### 2. Получение статей

In [None]:
!pip install PyPDF2
!pip install pycryptodome==3.15.0

In [None]:
import urllib.request as url
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import PyPDF2 as pdf
import os

In [None]:
def remove_substr(s, start, end):
    i = s.find(start)
    j = s.find(end, i) + len(end)
    s = s[:i] + s[j:]
    return s

In [None]:
qu_wiki_url = 'https://qu.wikipedia.org/wiki/'
drive_texts_url = 'drive/MyDrive/Quechua/Texts/'

f = open('drive/MyDrive/Quechua/articles_names.txt', 'r')
articles_names = f.read().split(' ')
f.close()

remove_labels = ((' ', 'Wikipediamanta'), ('Katiguriya', 'contenido'),
                 ('\"https', '(Wikipedia, Qhichwa / Quechua)'))

for article_name in articles_names:
    try:
        f = url.urlopen(qu_wiki_url + article_name)
    except (UnicodeEncodeError, HTTPError, FileNotFoundError):
        continue

    text = f.read()
    text = BeautifulSoup(text).get_text()

    for start, end in remove_labels:
        text = remove_substr(text, start, end)

    try:
        f = open(drive_texts_url + article_name + '.txt', 'w')
    except FileNotFoundError:
        article_name = article_name.replace('/', '_')
        f = open(drive_texts_url + article_name + '.txt', 'w')
    f.write(text)
    f.close()

articles_names = os.listdir(drive_texts_url[:-1])
print(f'Количество доступных статей: {len(articles_names)}')

Количество доступных статей: 39981


### 3. Добавление pdf-файлов

In [None]:
!pip install PyPDF2
!pip install pycryptodome==3.15.0

In [None]:
import PyPDF2 as pdf
import os

In [None]:
pdf_names = ('dialnet', 'dictionary', 'evangelioc', 'manual', 'santa_clara')

drive_texts_url = 'drive/MyDrive/Quechua/Texts/'

for pdf_name in pdf_names:
    pdf_file = open(f'drive/MyDrive/Quechua/PDF/{pdf_name}.pdf', 'rb')
    pdf_reader = pdf.PdfReader(pdf_file)

    pages_num = len(pdf_reader.pages)
    print(f'"{pdf_name}" has {pages_num} pages')

    txt_file = open(drive_texts_url + pdf_name + '.txt', 'a')
    for page_num in range(pages_num):
        text = pdf_reader.pages[page_num].extract_text()
        txt_file.write(text)
    txt_file.close()

    pdf_file.close()

texts_names = os.listdir(drive_texts_url[:-1])
print(f'\nКоличество доступных текстов: {len(texts_names)}')

"dialnet" has 15 pages
"dictionary" has 1443 pages
"evangelioc" has 402 pages
"manual" has 133 pages
"santa_clara" has 76 pages

Количество доступных текстов: 39986


### 4. Очистка и лемматизация текстов

In [None]:
import os

In [None]:
def replace_symbols(symbols, replacement, text):
    for symbol in symbols:
        text = text.replace(symbol, replacement)
    return text

def clean_text(text):
    # Lowering case:
    text = text.lower()

    # Removing punctuation and other extra symbols:
    text = replace_symbols(',.[]{}()=≈>≥<≤+‡-±−*&^%$#@¡"!~;:ː§/\|¿?«»•·ºª↑↓←→≠Ø½∞', '', text)
    text = text.replace('\xa0', '')
    text = text.replace('www', '')Г

    # Replacing newline symbols with spaces:
    text = text.replace('\n', ' ')
    for _ in range(5):
        text = text.replace('  ', ' ')
    if text[0] == ' ':
        text = text[1:]
    if text[-1] == ' ':
        text = text[:-1]

    # Replacing all numbers with masks:
    text = replace_symbols('0123456789', 'NUM', text)

    return text

def replace_morphemes(text, morphemes, marker, first_layer=False):
    if first_layer:
        for morpheme in morphemes:
            text = text.replace(morpheme + ' ', marker + ' ')
    else:
        for morpheme in morphemes:
            text = text.replace(morpheme + marker, marker)
    return text

def lemmatize_text(text):
    ###  Numerals  ###
    numeral_token = 'NUM'

    tens = ('chunka', 'pachak', 'waranqa', 'hunu') # 10, 100, 1000, 1000000
    for ten in tens:
        text = text.replace(ten + ' ', numeral_token + ' ')

    figures = ('chusaq', 'huk', 'iskay', 'kimsa', 'tawa', 'pichqa', 'suqta', 'qanchis', 'pusaq', 'isqun') # 0-9
    for figure in figures:
        text = text.replace(' ' + figure + ' ', ' ' + numeral_token + ' ')
        text = text.replace(figure + numeral_token, numeral_token)

    quantitative_suffixes = ('niyuq', 'yuq')
    for quantitative_suffix in quantitative_suffixes:
        for figure in figures:
            text = text.replace(figure + quantitative_suffix, numeral_token)

    ordinal_suffixes = ('ñaqin', 'ñiqin', 'kaq')
    for  ordinal_suffix in ordinal_suffixes:
        text = text.replace(' ' +  ordinal_suffix + ' ', ' ')

    ###   Verbs   ###
    infinitive_ending = 'y'
    marker = '~'

    # Interrogative to affirmative:
    interrogative_suffix = 'chu'
    text = text.replace(interrogative_suffix + ' ', ' ')

    # Conjunctive:
    simple_endings = ('yman', 'waq', 'nkiman', 'sunman', 'nman', 'sunchikman', 'chwan',
                      'nchikman', 'ykuman', 'waqchik', 'nkichikman', 'nkuman')
    complex_endings = ('yki', 'ykichik', 'ykiku', 'wanki', 'wankiku', # 'wan', <- not to mix
                       'wanchik', 'wanku', 'sunki', 'sunkichik', 'sunkiku')
    text = replace_morphemes(text, simple_endings + complex_endings, marker, True)

    ### --------- ###

    ###   Nouns   ###
    # All cases to nominative:
    cases_endings = ('p', 'pa', 'paq', 'manta', 'wan', 'ta', 'pi', 'kama', 'rayku', 'man', 'hina', 'pura')
    text = replace_morphemes(text, cases_endings, '', True)
    # Plural to singular:
    text = text.replace('kuna ', ' ')

    ### --------- ###

    # Indicative:
    future_endings = ('saq', 'nki', 'nqa', 'sun', 'sunchik', 'saqku', 'nkichik', 'nqaku')
    present_endings = ('ni', 'nki', 'n', 'nchik', 'yku', 'nkichik', 'nku')
    text = replace_morphemes(text, future_endings + present_endings, marker, True)

    past_narrative_continuous_suffixes = ('rqa', 'ra', 'sqa', 'chka')
    text = replace_morphemes(text, past_narrative_continuous_suffixes, marker)

    complex_suffix = 'q'
    text = text.replace(complex_suffix + ' ka' + marker + ' ', marker + ' ')
    text = text.replace(complex_suffix + ' ', marker + ' ')
    text = text.replace(complex_suffix + 'ku ', marker + ' ')

    # Imperative:
    ending = 'ychik'
    text = text.replace(ending + ' ', marker)

    # Verbs to infinitive:
    text = text.replace(marker, infinitive_ending)

    return text

In [None]:
drive_texts_url = 'drive/MyDrive/Quechua/Texts/'
drive_processed_texts_url = 'drive/MyDrive/Quechua/Processed_Texts/'

texts_names = os.listdir(drive_texts_url[:-1])
texts_names.sort()

for text_name in texts_names:
    f = open(drive_texts_url + text_name, 'r')
    text = f.read()
    f.close()

    text = clean_text(text)
    text = lemmatize_text(text)

    f = open(drive_processed_texts_url + text_name, 'w')
    f.write(text)
    f.close()

### 5. Сборка корпуса Кечуа

In [None]:
import os

In [None]:
drive_processed_texts_url = 'drive/MyDrive/Quechua/Processed_Texts/'

texts_names = os.listdir(drive_processed_texts_url[:-1])
texts_names.sort()

f_corpus = open('' + 'drive/MyDrive/Quechua/quechua_corpus.txt', 'w')

for text_name in texts_names:
    f_text = open(drive_processed_texts_url + text_name, 'r')
    text = f_text.read()
    f_corpus.write(text + '\n')
    f_text.close()

f_corpus.close()
print(f'В состав корпуса языка Кечуа вошло {len(texts_names)} текстов')

В состав корпуса языка Кечуа вошло 39986 текстов


### 6. Создание TF_IDF матрицы

In [3]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
def create_tf_idf_matrix(corpus_path, min_df=1, use_idf=True):
    '''
    corpus_path - path to a corpus, where one line - one text

    min_df - the minimum times (or fraction of the texts) a word must occur in the corpus

    use_idf - flag of the usage of idf
    '''
    with open(corpus_path, 'r') as corpus_file:
        vectorizer = TfidfVectorizer(analyzer='word', min_df=min_df, use_idf=use_idf)
        data_vectorized = vectorizer.fit_transform(corpus_file)
    return data_vectorized, vectorizer.get_feature_names_out()

In [5]:
drive_quechua_url = 'drive/MyDrive/Quechua/'

W, words_list = create_tf_idf_matrix('drive/MyDrive/Quechua/quechua_corpus.txt')

# Saving the TF-IDF matrix:
f = open(drive_quechua_url + 'Matrices/' + 'tf_idf.npy', 'wb')
np.save(f, W)
f.close()

# Saving the words list:
f = open(drive_quechua_url + 'quechua_words_list.txt', 'w')
f.write(' '.join(words_list))
f.close()

print(f'Размерность матрицы W: {W.shape[0]} x {W.shape[1]} (количество текстов x количество слов)\n')
print(f'Некоторые слова из полученного списка слов: {words_list[1000:1005]}')

Размерность матрицы W: 39986 x 325662 (количество текстов x количество слов)

Некоторые слова из полученного списка слов: ['abrirle' 'abrirme' 'abrirse' 'abrirá' 'abrirás']


### 7. SVD

In [6]:
from scipy.sparse.linalg import svds
import numpy as np

In [7]:
def apply_svd(W, k, output_folder):
    '''
    W - TF-IDF matrix
    k - the rank of the SVD (must be less than any dimension of W)
    '''
    # Apply the SVD function
    u, sigma, vt = svds(W, k)

    # Sorting singular values in descending order as function doesn't garantee it
    descending_order_of_inds = np.flip(np.argsort(sigma))
    u = u[:,descending_order_of_inds]
    vt = vt[descending_order_of_inds]
    sigma = sigma[descending_order_of_inds]

    # Checking that sizes are correct
    assert sigma.shape == (k,)
    assert vt.shape == (k, W.shape[1])
    assert u.shape == (W.shape[0], k)

    # Save the matrixes in folder (just in case)
    matrices_names = ('_sigma_vt.npy', '_sigma.npy', '_u.npy', '_vt.npy')
    matrices = (np.dot(np.diag(sigma), vt).T, sigma, u, vt)

    for matrix_name, matrix in zip(matrices_names, matrices):
        with open(output_folder + str(k) + matrix_name, 'wb') as f:
            np.save(f, matrix)

    return np.dot(np.diag(sigma), vt).T

In [8]:
drive_matrices_url = 'drive/MyDrive/Quechua/Matrices/'

vv = apply_svd(W, 2, drive_matrices_url)
print(f'Размерность матрицы vv: {vv.shape[0]} x {vv.shape[1]} (количество слов x ранг k)')

Размерность матрицы vv: 325662 x 2 (количество слов x ранг k)


### 8. Создание словаря

In [20]:
def create_dictionary(words_list, vv, output_file):
  dictionary = {}
  for word, vector in zip(words_list, vv):
    dictionary[word] = vector
  np.save(output_file, dictionary)
  return dictionary

In [21]:
drive_quechua_url = 'drive/MyDrive/Quechua/'

dictionary = create_dictionary(words_list[71:], vv[71:, :], drive_quechua_url + 'quechua_dictionary.npy')

example_keys = list(dictionary.keys())[1000:1010]
for example_key in example_keys:
    print(f'{example_key}: {dictionary[example_key]}')

absentismo: [-1.83713903e-07 -7.26586961e-08]
absey: [-6.12379677e-08 -2.42195654e-08]
absi: [-9.93152074e-06 -7.91817497e-06]
absida: [-0.00110513 -0.00109232]
absidal: [-1.83713903e-07 -7.26586961e-08]
absidatadrew: [-0.00036838 -0.00036411]
absidia: [-0.00071713 -0.0005388 ]
absinthe: [-0.00015453 -0.00013165]
absinthum: [-3.39388756e-04 -9.79294703e-05]
absintio: [-6.12379677e-08 -2.42195654e-08]
