In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

Used https://github.com/binga/fastai_notes/tree/master/experiments/notebooks/lang_models and https://github.com/sgugger/Deep-Learning/blob/master/Building%20a%20French%20LM.ipynb as starting points

In [2]:
from fastai.text import *
import html

In [3]:
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag

PATH=Path('..')/'data/lm/wikimedia/french/french_wiki/'

In [4]:
LM_PATH=Path('..')/'data/lm/models/french/'
LM_PATH.mkdir(exist_ok=True)

## Standarize format

In [5]:
LANG_FILENAMES = [str(f) for f in PATH.rglob("*/*")]

In [6]:
len(LANG_FILENAMES), LANG_FILENAMES[:5]

(3928,
 ['../data/lm/wikimedia/french/french_wiki/AI/wiki_08',
  '../data/lm/wikimedia/french/french_wiki/AI/wiki_65',
  '../data/lm/wikimedia/french/french_wiki/AI/wiki_52',
  '../data/lm/wikimedia/french/french_wiki/AI/wiki_69',
  '../data/lm/wikimedia/french/french_wiki/AI/wiki_40'])

In [7]:
LANG_TEXT = []
for fn in tqdm(LANG_FILENAMES):
    for line in open(fn, encoding='utf8'):
        LANG_TEXT.append(json.loads(line))
        
LANG_TEXT = pd.DataFrame(LANG_TEXT)

100%|██████████| 3928/3928 [00:28<00:00, 138.87it/s]


In [8]:
LANG_TEXT.head()

Unnamed: 0,id,text,title,url
0,417579,Élisabeth Woodville\n\nÉlisabeth Woodville (au...,Élisabeth Woodville,https://fr.wikipedia.org/wiki?curid=417579
1,417601,Macrocystis pyrifera\n\nMacrocystis pyrifera e...,Macrocystis pyrifera,https://fr.wikipedia.org/wiki?curid=417601
2,417602,Goémon\n\nLe goémon (orthographié aussi goëmon...,Goémon,https://fr.wikipedia.org/wiki?curid=417602
3,417611,Grímsey\n\nGrímsey est une petite île islandai...,Grímsey,https://fr.wikipedia.org/wiki?curid=417611
4,417614,Circassie\n\nLa Circassie est une région histo...,Circassie,https://fr.wikipedia.org/wiki?curid=417614


In [9]:
LANG_TEXT.to_csv(PATH/'wiki_fr.csv')

In [10]:
# Getting rid of the title name in the text field
def split_title_from_text(text):
    words = text.split("\n\n")
    if len(words) >= 2:
        return ''.join(words[1:])
    else:
        return ''.join(words)
    
LANG_TEXT['text'] = LANG_TEXT['text'].apply(lambda x: split_title_from_text(x))

In [11]:
LANG_TEXT.head()

Unnamed: 0,id,text,title,url
0,417579,"Élisabeth Woodville (aussi écrit Wydville, Wyd...",Élisabeth Woodville,https://fr.wikipedia.org/wiki?curid=417579
1,417601,Macrocystis pyrifera est une espèce d'algues b...,Macrocystis pyrifera,https://fr.wikipedia.org/wiki?curid=417601
2,417602,Le goémon (orthographié aussi goëmon) ou herbe...,Goémon,https://fr.wikipedia.org/wiki?curid=417602
3,417611,Grímsey est une petite île islandaise située à...,Grímsey,https://fr.wikipedia.org/wiki?curid=417611
4,417614,La Circassie est une région historique située ...,Circassie,https://fr.wikipedia.org/wiki?curid=417614


In [12]:
LANG_TEXT.to_csv(PATH/'wiki_fr1.csv', header=False, index=False)

Sorting the articles by length and keeping the first million.

In [13]:
LANG_TEXT = pd.read_csv(PATH/'wiki_fr1.csv', header=None)

In [14]:
LANG_TEXT = LANG_TEXT.assign(length = 0)

In [15]:
LANG_TEXT.columns = ['id', 'text', 'title', 'url', 'length']

In [16]:
LANG_TEXT = LANG_TEXT.assign(labels = 0).pipe(lambda x: x[['labels', 'text', 'length']])

In [17]:
LANG_TEXT['length'] = LANG_TEXT['text'].str.len()

In [18]:
LANG_TEXT.head()

Unnamed: 0,labels,text,length
0,0,"Élisabeth Woodville (aussi écrit Wydville, Wyd...",17833.0
1,0,Macrocystis pyrifera est une espèce d'algues b...,3830.0
2,0,Le goémon (orthographié aussi goëmon) ou herbe...,712.0
3,0,Grímsey est une petite île islandaise située à...,986.0
4,0,La Circassie est une région historique située ...,1663.0


In [19]:
LANG_TEXT = LANG_TEXT.sort_values(by=['length'], ascending=False)

In [20]:
LANG_TEXT.to_csv(PATH/'wiki_fr1.csv', header=False, index=False)

In [21]:
LANG_TEXT = LANG_TEXT[LANG_TEXT['length'] > 100]

"A note to those folks building langage models: there's no reason to go beyond 100 million tokens" JH http://forums.fast.ai/t/language-model-zoo-gorilla/14623/17

In [22]:
LANG_TEXT = LANG_TEXT.iloc[0:1000000]

Splitting 10% for validation.

In [23]:
trn_texts,val_texts = sklearn.model_selection.train_test_split(LANG_TEXT.pipe(lambda x: x[['labels', 'text']]), test_size=0.1)

In [24]:
trn_texts.to_csv(PATH/'train.csv', header=False, index=False)
val_texts.to_csv(PATH/'valid.csv', header=False, index=False)

## Language model tokens

In [25]:
chunksize = 5000

In [26]:
re1 = re.compile(r'  +')

def fixup(x):
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>','u_n').replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))

In [27]:
def get_texts(df, n_lbls=1):
    labels = df.iloc[:,range(n_lbls)].values.astype(np.int64)
    texts = f'\n{BOS} {FLD} 1 ' + df[n_lbls].astype(str)
    for i in range(n_lbls+1, len(df.columns)): texts += f' {FLD} {i-n_lbls} ' + df[i].astype(str)
    texts = texts.apply(fixup).values.astype(str)
    tok = Tokenizer.proc_all_mp(partition_by_cores(texts), lang='fr')
    return tok, list(labels)

def get_all(df, name, n_lbls=1):
    for i, r in enumerate(df):
        print(i)
        tok_, labels_ = get_texts(r, n_lbls)
        #optionally save the partial tokens instead of regrouping them in one big array.
        np.save(PATH/f'{name}_tok{i}.npy', tok_)
        #tok += tok_;
        #labels += labels_
    return tok, labels

In [28]:
df_trn = pd.read_csv(PATH/'train.csv', header=None, chunksize=chunksize)
df_val = pd.read_csv(PATH/'valid.csv', header=None, chunksize=chunksize)

In [None]:
get_all(df_trn,'trn',1)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54


In [None]:
get_all(df_val,'val',1)

## itos

In [None]:
def count_them_all(names):
    cnt = Counter()
    for name in names:
        for file in PATH.glob(f'tmp/{name}_tok*'):
            tok = np.load(file)
            cnt_tok = Counter(word for sent in tok for word in sent)
            cnt += cnt_tok
    return cnt

In [None]:
cnt = count_them_all(['trn'])

In [None]:
cnt.most_common(25)

In [None]:
max_vocab = 60000
min_freq = 5

In [None]:
itos = [o for o,c in cnt.most_common(max_vocab) if c > min_freq]
itos.insert(0,'_pad_')
itos.insert(0,'_unk_')

In [None]:
len(itos)

In [None]:
stoi = collections.defaultdict(int,{s:i for (i,s) in enumerate(itos)})

Numericalize each partial file.

In [None]:
def numericalize(name):
    results = []
    for file in tqdm(PATH.glob(f'tmp/{name}_tok*')):
        tok = np.load(file)
        results.append(np.array([[stoi[word] for word in sent] for sent in tok]))
    return np.concatenate(results)

In [None]:
trn_ids = numericalize('trn')
np.save(PATH/'tmp/trn_ids.npy', trn_ids)

In [None]:
val_ids = numericalize('val')
np.save(PATH/'tmp/trn_ids.npy', val_ids)

In [None]:
pickle.dump(itos, open(LM_PATH/'tmp'/'itos.pkl', 'wb'))