In [1]:
import os
import random
import shutil
import sys
import zipfile

import pandas as pd
import numpy as np
import wget

In [2]:
os.environ['KAGGLE_USERNAME'] = "indenbomdmitry"
os.environ['KAGGLE_KEY'] = "d71056824e379aa1756815ce2a658476"
import kaggle

In [3]:
def clear_directory(path_to_dir: str):
    if os.path.exists(path_to_dir):
        shutil.rmtree(path_to_dir)
    os.makedirs(path_to_dir)

In [4]:
def download_dataset_from_kaggle(kaggle_dataset_path: str, out_directory: str):
    clear_directory(out_directory)
    kaggle.api.dataset_download_files(kaggle_dataset_path,
                                      path=out_directory,
                                      unzip=True,
                                      quiet=False)

In [5]:
def bar_progress(current, total, width=80):
    progress_message = "Downloading: %d%% [%d / %d] bytes" % (current / total * 100, current, total)
    # Don't use print() as it will print in new line every time.
    sys.stdout.write("\r" + progress_message)
    sys.stdout.flush()

def download_dataset_on_link(url, out_directory):
    clear_directory(out_directory)
    filename = wget.download(url,
                             out=out_directory,
                             bar=bar_progress)
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall(out_directory)
    os.remove(filename)

In [6]:
class WordChecker:
    def __init__(self, symbols: str, max_length):
        self.alphabet = set(symbols)
        self.max_len = max_length

    def __call__(self, word: str) -> bool:
        return self.alphabet.union(word) == self.alphabet and len(word) <= self.max_len

In [7]:
def add_sequence_to_lexicon(lexicon: list, text: str,  acceptable_symbols: str, max_length: int, avg_length: int):
    word_checker = WordChecker(acceptable_symbols, max_length)
    seq = ""
    seq_length = 0
    for word in text.split():
        if not word_checker(word):
            continue
        if seq != "" and len(seq) + len(word) > seq_length:
            lexicon.append(seq)
            seq = ""
        if seq == "":
            seq_length = min(np.random.poisson(avg_length), max_length)
            seq = word
        else:
            seq += ' ' + word
    if seq != "":
        lexicon.append(seq)

    return lexicon

In [8]:
def write_lexicon(lexicon: list, path_to_file, mode='w'):
    with open(path_to_file, mode, encoding="utf-8") as the_file:
        the_file.write('\n'.join(lexicon))

#### Preparing lexicon of indian english names for handwritten names:

In [9]:
max_output_length = 34
avg_output_length = 7
lexicon_size = 15000
alphabet = " !\"'()*+,-./0123456789:;<=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]_|}’№"
path_to_dataset = "../data/lexicon-english-names"

In [10]:
download_dataset_from_kaggle("kanchitank/indian-names-male-and-female",
                             path_to_dataset)

Downloading indian-names-male-and-female.zip to ../data/lexicon-english-names


100%|██████████| 539k/539k [00:00<00:00, 5.80MB/s]







In [11]:
data = pd.read_csv(path_to_dataset + "/Names_dataset.csv", encoding="utf-8")["name"]
# Deleting empty rows
data = data.dropna().to_list()

In [12]:
texts = []
for sequence in data:
    add_sequence_to_lexicon(texts, str(sequence).upper(), alphabet, max_output_length, avg_output_length)

print("Number of texts is " + str(len(texts)))

Number of texts is 138436


In [13]:
sub_texts = random.sample(texts, min(lexicon_size, len(texts)))
print("Number of subset of texts is " + str(len(sub_texts)))

Number of subset of texts is 15000


In [14]:
write_lexicon(sub_texts, path_to_dataset + "/names_lexicon.txt")

#### Preparing lexicon of russian news for cyrillic texts:

In [15]:
max_output_length = 25
avg_output_length = 7
lexicon_size = 15000
alphabet = " !\"%(),-./0123456789:;?[]«»АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё"
path_to_dataset = "../data/russian-news"

In [16]:
download_dataset_from_kaggle("vfomenko/russian-news-2020",
                             path_to_dataset)

Downloading russian-news-2020.zip to ../data/russian-news


100%|██████████| 19.9M/19.9M [00:01<00:00, 18.9MB/s]





In [17]:
data = pd.read_csv(path_to_dataset + "/news.csv", encoding="utf-8")["text"]
# Deleting empty rows
data = data.dropna().to_list()

In [18]:
texts = []
for sequence in data:
    add_sequence_to_lexicon(texts, str(sequence), alphabet, max_output_length, avg_output_length)

print("Number of texts is " + str(len(texts)))

Number of texts is 4261487


In [19]:
sub_texts = random.sample(texts, min(lexicon_size, len(texts)))
print("Number of subset of texts is " + str(len(sub_texts)))

Number of subset of texts is 15000


In [20]:
write_lexicon(sub_texts, path_to_dataset + "/news.txt")

#### Preparing lexicon for Peter's notes:

In [21]:
max_output_length = 70
avg_output_length = 28
lexicon_size = 10000
alphabet = " ()+/0123456789[]abdefghiklmnoprstu|×ǂабвгдежзийклмнопрстуфхцчшщъыьэюяѣ–⊕⊗"
path_to_dataset = "../data/Peter's_notes"

In [22]:
data = pd.read_csv(path_to_dataset + "/notes_train.csv", encoding="utf-8")["text"]
# Deleting empty rows
data = data.dropna().to_list()

In [23]:
texts = []
words = ' '.join(data).split()
random.shuffle(words)
words = ' '.join(words)
add_sequence_to_lexicon(texts, words, alphabet, max_output_length, avg_output_length)

print("Number of texts is " + str(len(texts)))

Number of texts is 5211


In [24]:
sub_texts = random.sample(texts, min(lexicon_size, len(texts)))
print("Number of subset of texts is " + str(len(sub_texts)))

Number of subset of texts is 5211


In [25]:
write_lexicon(sub_texts, path_to_dataset + "/notes_train.txt")