In [1]:
import nltk
from nltk.stem.isri import ISRIStemmer
import unicodedata
from utils import DarijaBPETokenizer

# Loading the Data

In [2]:
with open("./data/music_data.txt", encoding='utf-8') as f:
    corpus = f.readlines()

len(corpus)

115259

In [3]:
corpus

['[refrain]\n',
 'jwanati ana ga3 mabkaw kaykifouni\n',
 '3dyani 3dyani ga3 makyhemouni\n',
 "dima dima khouk la costa f'tenue\n",
 "w sata 3ajbani b'lacoste gha foutouni\n",
 'dakchi lach dima ghatchoufna\n',
 'dima darbine lacoste 3chrani darbine lacoste\n',
 "wel 7etta b'survette lacoste galou mazzikti maddourch\n",
 "mosi9ti metlou9a fel porsche, mimti sm7ili f'la poste\n",
 "wakha 7aslin f'la poste 3chrani darbine lacoste\n",
 '\n',
 '[post-refrain]\n',
 'dakchi li f9lbi rah khaaznou\n',
 'dakchi li f9lbi rah khaaznou\n',
 '3douya bin 3inia waznou\n',
 '3douya bin 3inia waznou\n',
 'swlou  9lbi chkoun li 7akmou\n',
 'swlou  9lbi chkoun li 7akmou\n',
 'ghir nsayna magad 3la dnoub\n',
 '3la dnoub, 3la dnoub\n',
 '\n',
 '[couplet 1]\n',
 '7titk dakar tle3ti 9amar, tle3ti onouta ya khali\n',
 '3emri sem7ili 3dyani 7egrouni zen9a hia li gha tfari\n',
 "ghettithoum b'ghtaya tel3ou ghi 3ettaya wanari wanari wanari\n",
 "mama la lgitini sekrane f zan9a, ghir sm7ili w hezzi b'hbali\n",
 'l

In [4]:
corpus[0]

'[refrain]\n'

# Preprocessing

In [5]:
# removing all new lines and double points 
corpus = [ x  for x in corpus if not ("[" in x and "]" in x)]
print(len(corpus))
corpus[0:4]

105237


['jwanati ana ga3 mabkaw kaykifouni\n',
 '3dyani 3dyani ga3 makyhemouni\n',
 "dima dima khouk la costa f'tenue\n",
 "w sata 3ajbani b'lacoste gha foutouni\n"]

# Unique Characters

In [6]:
corpus_chars = set()
for sent in corpus:
    for ch in sent:
        try:
            unicodedata.name(ch)
            corpus_chars.add(ch)
        except:
            continue
        
corpus_chars = list(corpus_chars)

In [7]:
print(corpus_chars[:])
print(len(corpus_chars))

[']', '”', 'ڨ', 'o', '6', '\u200b', '.', 'h', 'ٍ', 'ة', '*', 'ć', 'ر', 'ê', '🥺', '∞', 'َ', 'š', '4', 'j', '،', 'ļ', '2', '\u200e', 'é', 'à', '8', 'ḥ', 'آ', 'æ', '#', 'گ', 'ن', 'د', 'œ', 'ٰ', 's', '7', 'ú', 'ì', 'w', 'ء', 'ـ', 'ب', 'v', 'å', 'c', ')', '‚', 'ö', 'd', '—', 'ل', 'ň', 'ٌ', 'ِ', 'م', ' ', 'ڤ', 'ı', 'a', 'f', 'إ', 'ò', 'ó', 'ˌ', 'q', '9', '0', ';', '„', '@', '>', 'ô', '=', '…', 'ً', '?', 'z', '|', 'ز', 'ث', 'ü', "'", 'g', 'أ', '̂', '̀', '§', 'ä', 'ë', 'i', 'ى', '!', '\u200a', ':', 'ْ', 'خ', 'ح', 'ڢ', 'í', '✨', 'â', 'l', 'ط', 'е', 'ø', '؛', 'ĺ', 'چ', '°', '–', '3', 'ك', 'ف', '\u2005', '̧', '[', 'ص', 'u', '×', '}', 'ę', '‘', '¡', '{', 'è', 'ɣ', 'ض', 'ذ', 't', 'ق', '²', ',', 'ç', 'õ', 'ڭ', '5', '1', 'ؤ', 'ù', 'ع', 'ķ', 'ï', 'غ', '+', '-', '%', 'پ', '$', '(', 'á', 'ا', 'e', '’', '؟', 'm', '£', 'y', 'ō', '⁉', 'ه', 'x', 'ی', '€', 'p', 'ت', '"', '/', 'ش', '\xa0', '“', 'î', 'û', 'ã', 'ئ', 'ñ', 'س', 'ي', 'ظ', '&', 'k', 'ّ', 'b', 'r', 'ʼ', '©', '»', 'ُ', '«', '¿', 'و', '_', '\u205f', '

# BPE Tokenization

In [8]:
tokenizer = DarijaBPETokenizer(corpus)

In [9]:
len(tokenizer.train_vocab)

117516

In [10]:
len(tokenizer.vocab)

194

In [11]:
tokenizer.bp_encode(max_vocab=200)
tokenizer.train_vocab

defaultdict(int,
            {'<SOS> j w a n a t i<EOS>': 6,
             '<SOS> a n a<EOS>': 2831,
             '<SOS> g a 3 <EOS>': 2374,
             '<SOS>m a b k a w <EOS>': 3,
             '<SOS> k a y k i f o u n i<EOS>': 3,
             '<SOS> 3 d y a n i<EOS>': 47,
             '<SOS>m a k y h e m o u n i<EOS>': 3,
             '<SOS> d i m a<EOS>': 1169,
             '<SOS> k h o u k <EOS>': 78,
             '<SOS>l a<EOS>': 16572,
             '<SOS> c o s t a<EOS>': 10,
             "<SOS> f ' t e n u e <EOS>": 3,
             '<SOS> w <EOS>': 18398,
             '<SOS> s a t a<EOS>': 328,
             '<SOS> 3 a j b a n i<EOS>': 6,
             "<SOS>b ' l a c o s t e <EOS>": 3,
             '<SOS> g h a<EOS>': 3753,
             '<SOS> f o u t o u n i<EOS>': 5,
             '<SOS> d a k ch i<EOS>': 362,
             '<SOS>l a ch <EOS>': 164,
             '<SOS> g h a t ch o u f n a<EOS>': 10,
             '<SOS> d a r b i n e <EOS>': 59,
             '<SOS>l a c o s t e <

In [12]:
len(tokenizer.vocab)

200

In [13]:
tokenizer.vocab

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<EOS>',
 '<SOS>',
 '<SOS>b',
 '<SOS>l',
 '<SOS>m',
 '=',
 '>',
 '?',
 '@',
 '[',
 ']',
 '_',
 'a',
 'a<EOS>',
 'b',
 'c',
 'ch',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'i<EOS>',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '{',
 '|',
 '}',
 '¡',
 '£',
 '§',
 '©',
 '«',
 '°',
 '²',
 '»',
 '¿',
 '×',
 'à',
 'á',
 'â',
 'ã',
 'ä',
 'å',
 'æ',
 'ç',
 'è',
 'é',
 'ê',
 'ë',
 'ì',
 'í',
 'î',
 'ï',
 'ñ',
 'ò',
 'ó',
 'ô',
 'õ',
 'ö',
 'ø',
 'ù',
 'ú',
 'û',
 'ü',
 'ć',
 'č',
 'ę',
 'ı',
 'ķ',
 'ĺ',
 'ļ',
 'ň',
 'ō',
 'œ',
 'š',
 'ɣ',
 'ʼ',
 'ˌ',
 '̀',
 '̂',
 '̧',
 'е',
 '،',
 '؛',
 '؟',
 'ء',
 'آ',
 'أ',
 'ؤ',
 'إ',
 'ئ',
 'ا',
 'ب',
 'ة',
 'ت',
 'ث',
 'ج',
 'ح',
 'خ',
 'د',
 'ذ',
 'ر',
 'ز',
 'س',
 'ش',
 'ص',
 'ض',
 'ط',
 'ظ',
 'ع',
 'غ',
 'ـ',
 'ف',
 'ق',
 'ك',
 'ل',
 'م',


In [15]:
tokenizer.save_vocab()