# Data Preprocessing

In [46]:
# Directory that is listed below to find the raw `setimes.*.txt` corpus files.
raw_data_dir = "/home/peterr/macocu/Varieties/BCMS"
# Directory where all.fasttxt is written and from which train.fasttxt and
# test.fasttxt are later read.
# NOTE(review): the %%bash cells address this directory as
# ~/macocu/taskB/data/interim, which is the same place only when $HOME is
# /home/peterr -- confirm before running as another user.
interim_data_dir = "/home/peterr/macocu/taskB/data/interim"
import os
import parse
import re
from transliterate import translit



In [47]:
# Keep only directory entries named like setimes.<lang1>-<lang2>.<lang>.txt,
# in sorted order, and display the resulting list.
p = parse.compile("setimes.{lang1}-{lang2}.{lang}.txt")
all_files = os.listdir(raw_data_dir)

all_relevant_files = [name for name in all_files if p.parse(name) is not None]
all_relevant_files.sort()
all_relevant_files

['setimes.bs-hr.bs.txt',
 'setimes.bs-hr.hr.txt',
 'setimes.bs-sr.bs.txt',
 'setimes.bs-sr.sr.txt',
 'setimes.hr-sr.hr.txt',
 'setimes.hr-sr.sr.txt']

In [48]:
# Compiled once instead of on every call: is_delimiter() is invoked for each
# line of each corpus file, so recompiling the pattern per call is wasted work.
_DELIMITER_PATTERN = parse.compile("{beginning} Southeast European Times {end}\n")


def is_delimiter(line: str) -> bool:
    """Tell whether ``line`` is a "Southeast European Times" separator line.

    The whole line must match the parse pattern
    ``{beginning} Southeast European Times {end}`` followed by a newline
    character. Because the trailing newline is part of the pattern, a final
    line of a file that lacks a terminating newline is never reported as a
    delimiter.

    Args:
        line: One raw line, including its line terminator.

    Returns:
        True if the line matches the delimiter pattern, False otherwise.
    """
    return _DELIMITER_PATTERN.parse(line) is not None
def extract_labels_and_text(filename: str) -> str:
    """Convert one corpus file into fastText-style labelled examples.

    The label is taken from the last language field of the file name
    (``<dir>/setimes.<lang1>-<lang2>.<lang>.txt``). The file is read line by
    line; lines between two delimiter lines (see ``is_delimiter``) are joined
    into a single example, with every newline replaced by a space. Delimiter
    lines themselves are discarded. For files labelled ``sr`` each line is
    passed through ``translit(line, "sr", reversed=True)`` (presumably
    Cyrillic -> Latin; confirm against the transliterate docs).

    Empty examples (e.g. a delimiter on the very first line, or two
    delimiters in a row) are skipped, matching the guard that was already
    applied to the trailing block.

    Args:
        filename: Path to the corpus file; must contain a directory part and
            follow the ``setimes.<lang1>-<lang2>.<lang>.txt`` naming scheme.

    Returns:
        All examples concatenated, one per line, each formatted as
        ``__label__<lang> <text>`` and terminated by a newline.

    Raises:
        ValueError: If ``filename`` does not follow the expected naming scheme.
    """
    p = parse.compile("{path}/setimes.{lang1}-{lang2}.{lang}.txt")
    parse_dict = p.parse(filename)
    if parse_dict is None:
        raise ValueError(
            f"Unexpected file name {filename!r}; expected "
            "'<dir>/setimes.<lang1>-<lang2>.<lang>.txt'"
        )
    lang = parse_dict["lang"]
    prefix = f"__label__{lang}"

    examples = []  # finished "<label> <text>\n" lines
    block = []     # pieces of the example currently being collected

    def flush_block() -> None:
        # Emit the collected example, if any, and start a new one.
        if block:
            text = "".join(block)
            examples.append(f"{prefix} {text}\n")
            block.clear()

    # Explicit encoding: the corpus contains non-ASCII text, so do not rely
    # on the platform's locale default.
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            if is_delimiter(line):
                flush_block()
                continue
            line = line.replace("\n", " ")
            if lang == "sr":
                line = translit(line, "sr", reversed=True)
            block.append(line)
    # The last example is not followed by a delimiter line.
    flush_block()
    return "".join(examples)


In [60]:
# Concatenate the labelled examples of every selected corpus file into a
# single file. The encoding is pinned to UTF-8 (the encoding fastText expects
# for its input) instead of depending on the platform's locale default.
with open(os.path.join(interim_data_dir, "all.fasttxt"), "w", encoding="utf-8") as destination:
    for file in all_relevant_files:
        filename = os.path.join(raw_data_dir, file)
        to_write = extract_labels_and_text(filename)
        destination.write(to_write)

In [62]:
%%bash
# Stop at the first failing command (e.g. a failed `cd`) instead of running
# the pipeline in whatever directory the cell happened to start in.
set -euo pipefail
cd ~/macocu/taskB/data/interim/
# Remove duplicate examples, then shuffle the remaining lines.
# NOTE: no --random-source is given to `sort -R`, so the resulting order (and
# any train/test split derived from it) is not reproducible between runs.
sort all.fasttxt | uniq | sort -R > all_randomized.fasttxt

In [64]:
%%bash
# Stop at the first failing command so a failed `cd` or a missing input file
# cannot silently produce empty or misplaced split files.
set -euo pipefail
cd ~/macocu/taskB/data/interim
len=$(wc -l < all_randomized.fasttxt)
# Shell integer arithmetic (floor of 10%) -- no dependency on `bc`.
tenpercent=$(( len / 10 ))
# Everything that is not in the test split goes to the training split.
ninetypercent=$(( len - tenpercent ))
head -n "$ninetypercent" all_randomized.fasttxt > train.fasttxt
echo "wrote first $ninetypercent lines to train.fasttxt"
tail -n "$tenpercent" all_randomized.fasttxt > test.fasttxt
echo "wrote last $tenpercent lines to test.fasttxt"

wrote first 16673 lines to train.fasttxt
wrote last 1852 lines to test.fasttxt


# Introducing `fasttext`

In [65]:
import fasttext
# Split files produced by the preceding %%bash cell (first ~90% / last ~10%
# of the shuffled, de-duplicated examples).
train = os.path.join(interim_data_dir, "train.fasttxt")
test = os.path.join(interim_data_dir, "test.fasttxt")

# First run: 25 epochs, every other training option left unspecified.
model = fasttext.train_supervised(input=train, epoch=25)
# Evaluate on the held-out file. The first element of the result equals the
# number of lines in test.fasttxt; the other two are presumably precision@1
# and recall@1 -- confirm against the fastText documentation.
model.test(test)

(1852, 0.9973002159827213, 0.9973002159827213)

In [66]:
# Same 25-epoch run with the learning rate set to 0.7. Rebinds `model`, so the
# previously trained model object is no longer reachable.
model = fasttext.train_supervised(input=train, epoch=25, lr=0.7)
model.test(test)

(1852, 0.9967602591792657, 0.9967602591792657)

In [67]:
# Learning rate 0.7 again, now with 50 epochs. Rebinds `model` once more; this
# is the object that later cells save and query when the notebook runs in order.
model = fasttext.train_supervised(input=train, epoch=50, lr=0.7)
model.test(test)

(1852, 0.9967602591792657, 0.9967602591792657)

In [68]:
# Persist the current `model`, i.e. whichever training cell ran last (the
# epoch=50, lr=0.7 run when executed top to bottom) -- not necessarily the
# configuration with the best test score shown above.
model.save_model(os.path.join(
    "/home/peterr/macocu/taskB/data/models",
    "ftmodel_3.bin"
))

In [69]:
# Show the label strings held by the trained model.
model.labels

['__label__hr', '__label__sr', '__label__bs']

In [70]:
# Spot-check three short phrases. predict() on a list returns a pair (the full
# pair is visible in the next cell's output); [0] keeps only the labels part.
model.predict(["Hriste Bože raspeti i sveti", "Pa da Miljacka mostove odnese", "Lijepa naša"])[0]

[['__label__sr'], ['__label__sr'], ['__label__hr']]

In [71]:
# Full predict() result for one phrase: predicted labels plus their scores.
model.predict(["Lijepa naša"])

([['__label__hr']], [array([0.6540654], dtype=float32)])