# Prepare data for training the Polish RoBERTa model


Get raw text from different sources and concatenate it into one big data file.

Useful shell commands:


Move the files whose ISBNs appear in the list file to another directory (Wolne Lektury, non-Polish books)

```
cat wolne_lektury_non_polish_isbn.txt | xargs -I{} sh -c "mv *'{}'* ./non_polish/;" 
```


Concatenate all text files and insert a new line between consecutive texts

```sh
find *content.txt | xargs -I{} sh -c "cat '{}'; echo ''" > corpus_[type]_[date].txt

```

Take the first 11768022 lines from the split Wikipedia file
```
head -11768022 corpus_wiki_2020-02-13.txt > corpus_wiki_2020-02-13_sample.txt
```

In [None]:

import json
from pathlib import Path
from glob import glob
import os
from concurrent.futures import ProcessPoolExecutor
from itertools import chain
import nltk
import re
from tqdm import tqdm 

import text_utils as tu


## Prepare wikipedia data


Download wikipedia data and extract it with wikiextractor

Download data from https://dumps.wikimedia.org/plwiki/20200301/

```
mkdir wiki_dump
cd wiki_dump
wget https://dumps.wikimedia.org/plwiki/20200301/plwiki-20200301-pages-articles-multistream1.xml-p1p169750.bz2
wget https://dumps.wikimedia.org/plwiki/20200301/plwiki-20200301-pages-articles-multistream2.xml-p169751p510662.bz2
wget https://dumps.wikimedia.org/plwiki/20200301/plwiki-20200301-pages-articles-multistream3.xml-p510663p1056310.bz2
wget https://dumps.wikimedia.org/plwiki/20200301/plwiki-20200301-pages-articles-multistream4.xml-p1056311p1831508.bz2
wget https://dumps.wikimedia.org/plwiki/20200301/plwiki-20200301-pages-articles-multistream5.xml-p1831509p3070393.bz2
wget https://dumps.wikimedia.org/plwiki/20200301/plwiki-20200301-pages-articles-multistream6.xml-p3070394p4570393.bz2
wget https://dumps.wikimedia.org/plwiki/20200301/plwiki-20200301-pages-articles-multistream6.xml-p4570394p4727706.bz2
```



```
#train.txt
plwiki-20200301-pages-articles-multistream1.xml-p1p169750
plwiki-20200301-pages-articles-multistream2.xml-p169751p510662
plwiki-20200301-pages-articles-multistream3.xml-p510663p1056310
plwiki-20200301-pages-articles-multistream4.xml-p1056311p1831508
plwiki-20200301-pages-articles-multistream5.xml-p1831509p3070393
plwiki-20200301-pages-articles-multistream6.xml-p3070394p4570393
```

```
#eval.txt
plwiki-20200301-pages-articles-multistream6.xml-p4570394p4727706
```

```
cd data/wiki_dump
cat train.txt | xargs -I@  python ../../libs/wikiextractor/WikiExtractor.py @ --bytes=100M --json --output="./train/@"
```

Get the files with content (depending on the corpus you need: train, eval, or all), process each JSON file and save the text to a txt file, separating articles with a blank line

In [None]:

def process_wiki_line(line, min_len=0):
    """Extract the whitespace-normalized article text from one JSON line.

    Args:
        line: A string holding one JSON object with a "text" field.
        min_len: Minimum length (in characters) of the normalized text;
            shorter texts are discarded.

    Returns:
        The text with every run of whitespace collapsed to a single space,
        or '' when the line cannot be parsed, has no usable "text" field,
        or the text is shorter than ``min_len``.
    """
    try:
        doc = json.loads(line)
        txt = re.sub(r"\s+", " ", doc["text"])
    except (ValueError, KeyError, TypeError):
        # ValueError covers json.JSONDecodeError; KeyError a missing "text";
        # TypeError a non-dict document or a non-string text/line.
        # Best effort: malformed lines are skipped, not fatal.
        return ''
    if len(txt) < min_len:
        return ''  # return empty if shorter than min_len
    return txt

Choose which files you need

In [None]:

# Corpus selection: uncomment exactly one of the values below.
# type = 'train'
# type = 'eval'
type = 'all'

# 'all' globs every sub-folder of the dump directory,
# any other value only the sub-folder of that name.
if type == 'all':
    wiki_dump_folder = "./data/wiki_dump/*/**"
else:
    wiki_dump_folder = f"./data/wiki_dump/{type}/**"

# Destination of the concatenated raw corpus for the selected type.
courpus_raw_path = f'./data/corpus_raw/corpus_wikipedia_2020-03-01_{type}.txt'

# Keep regular files only; the recursive glob also yields directories.
wiki_json_files = list(filter(os.path.isfile, glob(wiki_dump_folder, recursive=True)))

print(courpus_raw_path)



Read files, process json and save

In [None]:

# Concatenate the text of all selected wiki files into one raw corpus file.
print("output_path")
with open(courpus_raw_path, 'w+') as output_file:
    print(courpus_raw_path)
    # Every entry of wiki_json_files is a file path; each line of such a file
    # is handed to process_wiki_line as one JSON document.
    for wiki_json_path in tqdm(wiki_json_files):

        # Count the lines once and reuse the value for the inner progress bar.
        tot_len = tu.get_num_lines(wiki_json_path)
        print(f'process - {wiki_json_path} lines={tot_len}')

        with open(wiki_json_path) as f:
            for line in tqdm(f, total=tot_len):
                # Articles whose normalized text is under 450 characters come back as ''.
                text = process_wiki_line(line, min_len=450)
                if text.strip():
                    output_file.write(text)
                    # blank line marks the end of the article
                    output_file.write('\n\n')


### wikipedia split lines

Tokenize the text into sentences and save each sentence on its own line; add a blank line between wiki articles




In [None]:

# Input: './data/corpus_raw/corpus_wikipedia_2020-03-01_{all,train,eval}.txt'
p = Path(courpus_raw_path)
# Output: same location and name, with the extension replaced by '_lines.txt'.
corpus_line_path = str(p.with_suffix('')) + "_lines.txt"

print(f"in file={courpus_raw_path}\nout file={corpus_line_path}")

# Options forwarded to the sentence-processing helper; their exact semantics
# are defined in text_utils (not visible here).
wiki_sentence_options = dict(
    split_each_line_as_doc=False,
    check_valid_sentence=False,
    check_lang_sentence=False,
)
stats, vl, pl = tu.corpus_process_sentence(
    courpus_raw_path,
    corpus_line_path,
    **wiki_sentence_options,
)


## Prepare book corpus

Read the book dataset and normalize line splitting. The text file has '\n' in the middle of sentences. This step is not necessary if your file is already properly split.

Input: concatenated book textfile
Output: file with removed new lines in the middle of the sentence. 

Run once!!

In [None]:
# Input: the concatenated books text file.
corpus_book_raw = './data/corpus_raw/corpus_books_2020_02_24.txt'

p = Path(corpus_book_raw)
book_path_no_ext = p.with_suffix('')
print(book_path_no_ext)

# Output: same name with the extension replaced by '_fix.txt'.
corpus_book_fix = f"{book_path_no_ext}_fix.txt"

print(corpus_book_fix)

In [None]:


import datetime as dt  # used for the timing below; not among the notebook's top-level imports

# Earlier attempts at the line-join pattern, kept for reference:
# reg = re.compile('(?<!\.)(\n)(?=[a-zA-ZąćęłńóśźżĄĆĘŁŃÓŚŹŻ])',re.MULTILINE )
# reg = re.compile('(?<!\.)(\n)(?=[a-ząćęłńóśźż])',re.MULTILINE )
# reg = re.compile('(?<!\.\n)(\n)(?=[a-ząćęłńóśźż])',re.MULTILINE )

# Remove line breaks in the middle of a sentence: a newline that follows a
# letter, comma or dash (and not a dot) and is followed by a lowercase letter.
reg = re.compile(r'(?<=[A-Za-ząćęłńóśźż,—-])(?<!\.)(\n)(?=[a-ząćęłńóśźż])', re.MULTILINE)

# ...and replace it by a space
rep_lines = ' '

# Replace a line that starts with a dot or dash and otherwise holds only
# dots/whitespace with a single line break.
# https://regex101.com/r/qCTEPu/1
reg_dots = re.compile(r"^[\.-]([\.\s]*\n)", re.MULTILINE)
rep_dots = '\n'

# https://pymotw.com/3/mmap/#regular-expressions

# Number of lines processed per buffer (10007 is prime; next ones are 10009, 10037).
line_buff = 10007


def save_buffer2file(output_file, text):
    """Apply both clean-up regexes to ``text`` and write the result.

    Args:
        output_file: Writable text file object.
        text: Block of consecutive input lines, newlines included.
    """
    # join sentences that were broken across lines
    replace_text = reg.sub(rep_lines, text)
    # clean lines that hold only dots/dashes/whitespace
    replace_text = reg_dots.sub(rep_dots, replace_text)

    output_file.write(replace_text)


t0 = dt.datetime.now()
with open(corpus_book_fix, 'w+') as output_file, open(corpus_book_raw) as f:
    buffered_lines = []
    for line in tqdm(f, total=tu.get_num_lines(corpus_book_raw)):
        # Every line goes into the buffer before the size check, so the line
        # that fills the buffer is written out instead of being dropped.
        buffered_lines.append(line)
        if len(buffered_lines) >= line_buff:
            save_buffer2file(output_file, "".join(buffered_lines))
            buffered_lines = []

    # process and write whatever is left after the last full buffer
    if buffered_lines:
        save_buffer2file(output_file, "".join(buffered_lines))

# NOTE: a line break that falls exactly on a buffer boundary is never joined,
# because the regex cannot look ahead into the next buffer.

t1 = dt.datetime.now()
print(f'Done. Takes={t1-t0}')

### Build book line corpus file

In [None]:


# Input is the line-fixed book corpus; the output name replaces its extension with '_lines.txt'.
p = Path(corpus_book_fix)
corpus_book_lines = str(p.with_suffix('')) + "_lines.txt"

print(f"in file={corpus_book_fix}\nout file={corpus_book_lines}")

# Options forwarded to the sentence-processing helper; their exact semantics
# are defined in text_utils (not visible here).
book_sentence_options = dict(
    split_each_line_as_doc=False,
    check_valid_sentence=False,
    check_lang_sentence=False,
)
stats, vl, pl = tu.corpus_process_sentence(
    corpus_book_fix,
    corpus_book_lines,
    **book_sentence_options,
)

## Prepare Oscar corpus

Download dataset: 
[Polish part Oscar corpus](https://traces1.inria.fr/oscar/files/Compressed/pl_dedup.txt.gz) (pl_dedup.txt.gz ~19GB)

```
mv pl_dedup.txt.gz oscar_pl_dedup.txt.gz
gunzip -k oscar_pl_dedup.txt.gz
cd oscar_pl_dedup

head -n 30MB pl_dedup.txt > corpus_oscar_2020-04-10_30M.txt
```




Run the Docker image with the KRNNT POS tagger in a console:

```sh
docker run -p 9003:9003 -it djstrong/krnnt:1.0.0
```

In [None]:
# Input: raw Oscar corpus sample.
corpus_oscar_raw = "./data/corpus_raw/corpus_oscar_100k.txt"

# Output: same name with the extension replaced by '_lines.txt'.
p = Path(corpus_oscar_raw)
corpus_oscar_lines = str(p.with_suffix('')) + "_lines.txt"

print(f"in file={corpus_oscar_raw}\nout file={corpus_oscar_lines}")

# Unlike the wiki and book runs, every check flag is switched on here and a
# sentence length limit is passed; the exact semantics of these options are
# defined in text_utils (not visible here).
oscar_sentence_options = dict(
    split_each_line_as_doc=True,
    check_valid_sentence=True,
    check_lang_sentence=True,
    max_sentence_length=700,
)
stats, vl, pl = tu.corpus_process_sentence(
    corpus_oscar_raw,
    corpus_oscar_lines,
    **oscar_sentence_options,
)