In [1]:
# Mount Google Drive into the Colab filesystem so the corpus and the pickled
# outputs under /content/drive can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import re
import pickle as pkl

In [2]:
# Location of the raw corpus file on the mounted Drive.
data_path = '/content/drive/MyDrive/cmn-eng/cmn.txt'

# Read the whole file in one go, then break it into raw lines.
# The lines are validated later, when the sentence pairs are built.
with open(data_path, mode='r', encoding="utf-8") as corpus_file:
    raw_text = corpus_file.read()
lines = raw_text.split('\n')

In [9]:
def preprocess_cn(sentence):
    """
    Lowercases a Chinese sentence and splits it into single-character tokens.

    Every non-whitespace character becomes its own token. Tokens are joined by
    exactly one space and the result is surrounded with <START> and <END>.

    Args:
        sentence (str): raw sentence to tokenise.

    Returns:
        str: the tokenised sentence, e.g. "<START> 你 好 。 <END>".
    """
    # lowercasing only matters for Latin letters embedded in the sentence
    sentence = sentence.lower()
    # Whitespace is dropped rather than treated as a character, so blanks
    # inside the sentence cannot produce runs of spaces in the output.
    # Punctuation such as apostrophes is kept as a token of its own
    # (the previous character class "[' ']" silently erased apostrophes).
    characters = [char for char in sentence if not char.isspace()]
    # joining one flat token list guarantees single-space separation,
    # including for an empty sentence ("<START> <END>")
    return " ".join(["<START>", *characters, "<END>"])


def preprocess_eng(sentence):
    """
    Lowercases an English sentence and separates words from punctuation.

    A space is inserted in front of each of , . ! ? " ' so that it is detached
    from the preceding word. Every character other than ASCII letters and that
    punctuation is discarded. Tokens are joined by exactly one space and the
    result is surrounded with <START> and <END>.

    Args:
        sentence (str): raw sentence to tokenise.

    Returns:
        str: the tokenised sentence, e.g. "<START> hi . <END>".
    """
    sentence = sentence.lower().strip()
    # detach punctuation from the word in front of it
    sentence = re.sub(r"([,.!?\"'])", r" \1", sentence)
    # blank out everything that is neither a letter nor the kept punctuation
    sentence = re.sub(r"[^a-zA-Z,.!?\"']", ' ', sentence)
    # Normalise whitespace only AFTER filtering: each removed character leaves
    # a blank behind, so collapsing earlier would let double spaces through.
    tokens = sentence.split()
    return " ".join(["<START>", *tokens, "<END>"])


In [10]:
# Build [english, chinese] sentence pairs; which side is source or target
# is decided later.
seq_pairs = []

for line in lines:
    fields = line.split('\t')
    # skip lines (e.g. the trailing empty one) that do not carry at least an
    # English field, a Chinese field and one further field
    if len(fields) < 3:
        continue
    # Only the first two fields are used. Slicing instead of unpacking into
    # exactly three names keeps lines with extra tab-separated fields from
    # raising ValueError.
    eng_doc, cn_doc = fields[:2]
    seq_pairs.append([preprocess_eng(eng_doc), preprocess_cn(cn_doc)])

In [11]:
seq_pairs[:10]

[['<START> hi . <END>', '<START> 嗨 。 <END>'],
 ['<START> hi . <END>', '<START> 你 好 。 <END>'],
 ['<START> run . <END>', '<START> 你 用 跑 的 。 <END>'],
 ['<START> stop ! <END>', '<START> 住 手 ！ <END>'],
 ['<START> wait ! <END>', '<START> 等 等 ！ <END>'],
 ['<START> wait ! <END>', '<START> 等 一 下 ！ <END>'],
 ['<START> begin . <END>', '<START> 开 始 ！ <END>'],
 ['<START> hello ! <END>', '<START> 你 好 。 <END>'],
 ['<START> i try . <END>', '<START> 我 试 试 。 <END>'],
 ['<START> i won ! <END>', '<START> 我 赢 了 。 <END>']]

In [14]:
# Persist the preprocessed sentence pairs on Drive as a pickle file.
pairs_path = "/content/drive/MyDrive/cmn-eng/eng-cn.pkl"
with open(pairs_path, "wb") as pickle_out:
    pkl.dump(seq_pairs, pickle_out)

In [15]:
# Reload the pickled sentence pairs from Drive.
# NOTE: unpickling can execute arbitrary code, so only load files you created yourself.
with open("/content/drive/MyDrive/cmn-eng/eng-cn.pkl", mode="rb") as pickle_in:
    seq_pairs = pkl.load(pickle_in)


# text corpora (source: English, target: Chinese)
src_docs = []
tgt_docs = []

# Vocabularies are collected in dicts used as insertion-ordered sets:
# membership checks are O(1) instead of scanning a growing list for every
# token, and tokens stay in order of first appearance.
src_vocab = {}
tgt_vocab = {}

for src_doc, tgt_doc in seq_pairs:
    # English sentence
    src_docs.append(src_doc)
    # Chinese sentence
    tgt_docs.append(tgt_doc)

    # tokenisation: sentences are already space-separated, so split() yields
    # the tokens; keys that already exist keep their original position
    src_vocab.update(dict.fromkeys(src_doc.split()))
    tgt_vocab.update(dict.fromkeys(tgt_doc.split()))

# unique tokens in order of first appearance
src_tokens = list(src_vocab)
tgt_tokens = list(tgt_vocab)

In [16]:
src_docs[:10]

['<START> hi . <END>',
 '<START> hi . <END>',
 '<START> run . <END>',
 '<START> stop ! <END>',
 '<START> wait ! <END>',
 '<START> wait ! <END>',
 '<START> begin . <END>',
 '<START> hello ! <END>',
 '<START> i try . <END>',
 '<START> i won ! <END>']

In [17]:
tgt_docs[:10]

['<START> 嗨 。 <END>',
 '<START> 你 好 。 <END>',
 '<START> 你 用 跑 的 。 <END>',
 '<START> 住 手 ！ <END>',
 '<START> 等 等 ！ <END>',
 '<START> 等 一 下 ！ <END>',
 '<START> 开 始 ！ <END>',
 '<START> 你 好 。 <END>',
 '<START> 我 试 试 。 <END>',
 '<START> 我 赢 了 。 <END>']

In [18]:
src_tokens[:10]

['<START>', 'hi', '.', '<END>', 'run', 'stop', '!', 'wait', 'begin', 'hello']

In [19]:
tgt_tokens[:10]

['<START>', '嗨', '。', '<END>', '你', '好', '用', '跑', '的', '住']