## Text preprocessing

In [1]:
# English
# clear punctuation
from string import punctuation
import numpy as np

# Read the English training corpus: lowercase every line, split it on
# whitespace and drop the tokens that are punctuation.
# NOTE(review): open() uses the platform default encoding here (and in every
# other cell) — confirm it matches the corpus encoding.
en_sentences = []

# `with` closes the file even if reading a line raises.
with open('train.en', 'r') as f:
    for line in f:
        # `tok in punctuation` is a substring test against string.punctuation:
        # it removes stand-alone punctuation tokens, while words with attached
        # punctuation (e.g. "word,") are kept unchanged.
        en_sentences.append([tok for tok in line.lower().split() if tok not in punctuation])

In [2]:
# Token frequencies over the whole English training corpus.
en_current = np.hstack(en_sentences)
en_unq, en_cnt = np.unique(en_current, return_counts=True)
# Vocabulary lookup: token -> number of occurrences.
en = dict(zip(en_unq, en_cnt))

In [3]:
from tqdm import tqdm_notebook

# Replace rare words with <unk> and wrap every sentence in <start> / <end>.
min_count = 5  # words seen fewer than this many times become <unk>
en_sentences_prep = []
for sentence in tqdm_notebook(en_sentences):
    marked = ["<unk>" if en[word] < min_count else word for word in sentence]
    en_sentences_prep.append(["<start>"] + marked + ["<end>"])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/3961179 [00:00<?, ?it/s]

In [5]:
# Sentence counts before/after preprocessing, and the percentage of training
# tokens whose word occurs fewer than `min_count` times.
(
    len(en_sentences),
    len(en_sentences_prep),
    en_cnt[en_cnt < min_count].sum() / en_cnt.sum() * 100,
)

(3961179, 3961179, 0.01411177137677382)

In [6]:
# Write the preprocessed English training sentences, one per line, each
# prefixed with its token count (the <start>/<end> markers are included).
# `with` flushes and closes the file even if a write fails part-way.
with open('train_prep.en', 'w') as f:
    for tokens in en_sentences_prep:
        print(len(tokens), " ".join(tokens), file=f)

In [9]:
def preprocess(line, lang, min_count):
    """Normalise one raw sentence and mark rare or unknown words.

    The line is lowercased and split on whitespace, and tokens for which
    ``token in punctuation`` holds are dropped. Every remaining token that is
    missing from ``lang`` or whose count is below ``min_count`` is replaced by
    ``<unk>``; the result is wrapped in ``<start>`` / ``<end>``.

    Args:
        line: raw text line.
        lang: mapping from token to its occurrence count.
        min_count: minimum count a token needs in order to be kept.

    Returns:
        A tuple ``(leng, st, current)``: the number of tokens including the
        two boundary markers, the number of tokens replaced by ``<unk>``, and
        the processed sentence as one space-separated string.
    """
    tokens = [tok for tok in line.lower().split() if tok not in punctuation]
    # Evaluate the rare-word test once per token and reuse it for both the
    # <unk> count and the replacement.
    is_unk = [tok not in lang or lang[tok] < min_count for tok in tokens]
    st = sum(1 for flag in is_unk if flag)
    marked = ["<unk>" if flag else tok for tok, flag in zip(tokens, is_unk)]
    # Joining the complete token list (instead of concatenating strings)
    # avoids a double space between the markers when no tokens are left.
    current = " ".join(["<start>"] + marked + ["<end>"])
    leng = len(tokens) + 2
    return leng, st, current

def preprocess_file(file, lang, min_count):
    """Preprocess every line of ``file`` and write the result next to it.

    The output path is the input path with ``_prep`` inserted before the
    extension (``valid.en`` -> ``valid_prep.en``). Each output line holds the
    token count, a space, and the processed sentence returned by
    ``preprocess``.

    Args:
        file: path of the text file to preprocess.
        lang: mapping from token to its occurrence count.
        min_count: minimum count a token needs in order to be kept.

    Returns:
        A tuple ``(sst, lleng)``: the total number of tokens replaced by
        ``<unk>`` and the total number of tokens (boundary markers included),
        from which the caller can compute the ``<unk>`` ratio.
    """
    import os  # local import: only needed to derive the output path

    # splitext splits on the last dot of the final path component, so paths
    # such as "./data/valid.en" and names without an extension are handled;
    # file.split('.') got both wrong.
    root, ext = os.path.splitext(file)
    out_path = root + '_prep' + ext

    # to know <unk> ratio
    lleng, sst = 0, 0

    # `with` closes both files even if preprocessing a line raises.
    with open(file, 'r') as f_in, open(out_path, 'w') as f_out:
        for line in f_in:
            leng, st, current = preprocess(line, lang, min_count)
            lleng += leng
            sst += st
            print(leng, current, file=f_out)

    return sst, lleng

In [12]:
# Preprocess the English validation set with the training vocabulary and
# report the percentage of its tokens that became <unk>.
st, leng = preprocess_file(file='valid.en', lang=en, min_count=min_count)
st / leng * 100

0.016778899860474924

In [13]:
# Preprocess the English test set with the training vocabulary and report
# the percentage of its tokens that became <unk>.
st, leng = preprocess_file(file='test.en', lang=en, min_count=min_count)
st / leng * 100

0.01931894286744629

In [14]:
# German
# Read the German training corpus: lowercase every line, split it on
# whitespace and drop the tokens that are punctuation.
# NOTE(review): open() uses the platform default encoding here (and in every
# other cell) — confirm it matches the corpus encoding.
de_sentences = []

# `with` closes the file even if reading a line raises.
with open('train.de', 'r') as f:
    for line in f:
        # `tok in punctuation` is a substring test against string.punctuation:
        # it removes stand-alone punctuation tokens, while words with attached
        # punctuation (e.g. "wort,") are kept unchanged.
        de_sentences.append([tok for tok in line.lower().split() if tok not in punctuation])

In [15]:
# Token frequencies over the whole German training corpus.
de_current = np.hstack(de_sentences)
de_unq, de_cnt = np.unique(de_current, return_counts=True)
# Vocabulary lookup: token -> number of occurrences.
de = dict(zip(de_unq, de_cnt))

In [16]:
# Replace rare words with <unk> and wrap every sentence in <start> / <end>.
de_sentences_prep = []
for sentence in tqdm_notebook(de_sentences):
    marked = ["<unk>" if de[word] < min_count else word for word in sentence]
    de_sentences_prep.append(["<start>"] + marked + ["<end>"])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/3961179 [00:00<?, ?it/s]

In [17]:
# Sentence counts before/after preprocessing, and the percentage of training
# tokens whose word occurs fewer than `min_count` times.
(
    len(de_sentences),
    len(de_sentences_prep),
    de_cnt[de_cnt < min_count].sum() / de_cnt.sum() * 100,
)

(3961179, 3961179, 0.0064189704799666675)

In [18]:
# Write the preprocessed German training sentences, one per line, each
# prefixed with its token count (the <start>/<end> markers are included).
# `with` flushes and closes the file even if a write fails part-way.
with open('train_prep.de', 'w') as f:
    for tokens in de_sentences_prep:
        print(len(tokens), " ".join(tokens), file=f)

In [19]:
# Preprocess the German validation set with the training vocabulary and
# report the percentage of its tokens that became <unk>.
st, leng = preprocess_file(file='valid.de', lang=de, min_count=min_count)
st / leng * 100

0.00672508643063383

In [20]:
# Preprocess the German test set with the training vocabulary and report the
# percentage of its tokens that became <unk>.
# NOTE(review): the recorded value for this cell (~0.95) is far above the one
# recorded for valid.de (~0.007) — worth checking test.de before relying on it.
st, leng = preprocess_file(file='test.de', lang=de, min_count=min_count)
st / leng * 100

0.9535634467345419

## Word2Vec

In [1]:
# pretrained embeddings

from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [2]:
# Load the preprocessed English corpus. The first field of every line is
# skipped (the preprocessing cells write the token count there); the remaining
# fields are the sentence tokens.
# `with` closes the file even if reading fails part-way.
with open('train_prep.en', 'r') as f:
    en_sentences = [line.split()[1:] for line in f]

In [3]:
# Sanity check: number of English sentences loaded from train_prep.en.
len(en_sentences)

3961179

In [4]:
# Embedding hyper-parameters, reused for the German model further down.
dmodel = 256
min_count = 5
# NOTE(review): `size=` is the keyword used by gensim 3.x; gensim >= 4 is
# believed to call it `vector_size` — confirm against the installed version.
en_model = Word2Vec(
    sentences=en_sentences,
    size=dmodel,
    min_count=min_count,
)

In [5]:
# Persist the English embeddings; the file name encodes the dimension.
en_model_path = "word2vec_en" + str(dmodel) + ".model"
en_model.save(en_model_path)

In [6]:
# Load the preprocessed German corpus. The first field of every line is
# skipped (the preprocessing cells write the token count there); the remaining
# fields are the sentence tokens.
# `with` closes the file even if reading fails part-way.
with open('train_prep.de', 'r') as f:
    de_sentences = [line.split()[1:] for line in f]

In [7]:
# Sanity check: number of German sentences loaded from train_prep.de.
len(de_sentences)

3961179

In [8]:
# Train the German embeddings with the same hyper-parameters as the English
# model (`dmodel` and `min_count` come from the English training cell).
de_model = Word2Vec(
    sentences=de_sentences,
    size=dmodel,
    min_count=min_count,
)

In [9]:
# Persist the German embeddings; the file name encodes the dimension.
de_model_path = "word2vec_de" + str(dmodel) + ".model"
de_model.save(de_model_path)

In [10]:
# Vocabulary sizes of the trained models, as (German, English).
# NOTE(review): `wv.vocab` is the gensim 3.x attribute; newer gensim releases
# are believed to expose `wv.key_to_index` instead — confirm before upgrading.
len(de_model.wv.vocab), len(en_model.wv.vocab)

(31797, 26034)