In [1]:
import re
import codecs
import emoji
import string
import collections
from operator import itemgetter

## 预处理
- 英文切词：把单词和标点紧密联系的切开！

- 中文： 分词，更便捷的处理是以字为单位切割。

### 英文文本清洗
- 缩略词更改
- 拼写校正
- 标点符号
- 符号替换
- 去除空格

In [2]:
def clean_text(text):
    """
    Clean text
    :param text: the string of text
    :return: text string after cleaning
    """
    # acronym
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"cannot", "can not ", text)
    text = re.sub(r"what\'s", "what is", text)
    text = re.sub(r"What\'s", "what is", text)
    text = re.sub(r"\'ve ", " have ", text)
    text = re.sub(r"n\'t", " not ", text)
    text = re.sub(r"i\'m", "i am ", text)
    text = re.sub(r"I\'m", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r" e mail ", " email ", text)
    text = re.sub(r" e \- mail ", " email ", text)
    text = re.sub(r" e\-mail ", " email ", text)

    # spelling correction
    text = re.sub(r"ph\.d", "phd", text)
    text = re.sub(r"PhD", "phd", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" fb ", " facebook ", text)
    text = re.sub(r"facebooks", " facebook ", text)
    text = re.sub(r"facebooking", " facebook ", text)
    # text = re.sub(r" usa ", " america ", text)
    # text = re.sub(r" us ", " america ", text)
    # text = re.sub(r" u s ", " america ", text)
    # text = re.sub(r" U\.S\. ", " america ", text)
    text = re.sub(r" US ", " america ", text)
    text = re.sub(r" American ", " america ", text)
    text = re.sub(r" America ", " america ", text)
    text = re.sub(r" mbp ", " macbook-pro ", text)
    text = re.sub(r" mac ", " macbook ", text)
    text = re.sub(r"macbook pro", "macbook-pro", text)
    text = re.sub(r"macbook-pros", "macbook-pro", text)
    text = re.sub(r" 1 ", " one ", text)
    text = re.sub(r" 2 ", " two ", text)
    text = re.sub(r" 3 ", " three ", text)
    text = re.sub(r" 4 ", " four ", text)
    text = re.sub(r" 5 ", " five ", text)
    text = re.sub(r" 6 ", " six ", text)
    text = re.sub(r" 7 ", " seven ", text)
    text = re.sub(r" 8 ", " eight ", text)
    text = re.sub(r" 9 ", " nine ", text)
    text = re.sub(r"googling", " google ", text)
    text = re.sub(r"googled", " google ", text)
    text = re.sub(r"googleable", " google ", text)
    text = re.sub(r"googles", " google ", text)
    text = re.sub(r"dollars", " dollar ", text)

    # punctuation
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"-", " - ", text)
    text = re.sub(r"/", " / ", text)
    # text = re.sub(r"\\", " \ ", text)
    text = re.sub(r"=", " = ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r"\.", " . ", text)
    text = re.sub(r",", " , ", text)
    text = re.sub(r"\?", " ? ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\"", " \" ", text)
    text = re.sub(r"&", " & ", text)
    text = re.sub(r"\|", " | ", text)
    text = re.sub(r";", " ; ", text)
    text = re.sub(r"\(", " ( ", text)
    text = re.sub(r"\)", " ( ", text)

    # symbol replacement
    text = re.sub(r"&", " and ", text)
    text = re.sub(r"\|", " or ", text)
    text = re.sub(r"=", " equal ", text)
    text = re.sub(r"\+", " plus ", text)
    text = re.sub(r"\$", " dollar ", text)

    # remove extra space
    text = ' '.join(text.split())

    return text


def clean_text_zh(text):
    """中文数据清洗"""
    # 去除空格
    text = re.sub(' ', '', text)
    # 去掉全角空白符，\u3000 是全角的空白符
    text = re.sub('\u3000', '', text)
    # 去掉 \xa0 是不间断空白符 &nbsp;
    text = re.sub('\xa0', '', text)
    # 去掉未识别的表情符号
    text = re.sub('<U+.*>', '', text)
    # 去除英文标点, 这应该放在最后
    text = text.translate(
        str.maketrans('', '', string.punctuation))
    return text

# 清除emoji
def filter_emoji(srcstr, restr=''):  
    """过滤emoji"""
    # 编译匹配表情的正则
    prog = emoji.get_emoji_regexp()
    return prog.sub(restr, srcstr) 

In [3]:
def is_remove(line):
    items = ["<url>", "<keywords>", "<speaker>", "<talkid>", "<translator", "<reviewer"]
    for part in items:
        if line.startswith(part):
            return True
    return False          

In [4]:
def remove_html_line(read_file, write_file):
    fw = open(write_file, "a+")
    with open(read_file, "r") as f:
        for line in f:
            r = is_remove(line)
            if r: 
                continue
            if line.startswith("<title>"):
                text = re.sub(r"<title>|</title>", "", line)
            elif line.startswith("<description>"):
                text = re.sub(r"<description>|</description>", "", line)
            else:
                text = line
            fw.write(text)
    fw.close()

In [5]:
en_file = "./data/en-zh/train.tags.en-zh.en"
zh_file = "./data/en-zh/train.tags.en-zh.zh"

en_tmp_file = "./data/en-zh/train.trg.en-zh.en.1"
zh_trg_file = "./data/en-zh/train.trg.en-zh.zh"

In [6]:
en_trg_file = "./data/en-zh/train.trg.en-zh.en"

In [7]:
remove_html_line(en_file, en_tmp_file)

In [8]:
remove_html_line(zh_file, zh_trg_file)

In [9]:
fw = open(en_trg_file, "a+")
with open(en_tmp_file, "r") as f:
    for line in f:
        text = clean_text(line)+"\n"
        fw.write(text)
fw.close()

In [10]:
'''
pt-tf-env) [dc@gz_6237_gpu en-zh]$ wc -l train.trg.en-zh.zh
213377 train.trg.en-zh.zh
(pt-tf-env) [dc@gz_6237_gpu en-zh]$ wc -l train.trg.en-zh.en
213377 train.trg.en-zh.en
'''

'\npt-tf-env) [dc@gz_6237_gpu en-zh]$ wc -l train.trg.en-zh.zh\n213377 train.trg.en-zh.zh\n(pt-tf-env) [dc@gz_6237_gpu en-zh]$ wc -l train.trg.en-zh.en\n213377 train.trg.en-zh.en\n'

- 中文数据集还需要做处理

In [12]:
zh_file = "./data/en-zh/train.trg.zh"

In [13]:
def preprocess_zh(f1, f2):
    fw = open(f2, "w")
    with open(f1, "r") as f:
        for line in f:
            line_li = [w.strip() for w in line.strip()]
            string = " ".join(line_li)
            fw.write(string+"\n")
    fw.close()
    print("done!")

In [14]:
preprocess_zh(zh_trg_file, zh_file)

done!


In [15]:
slist = [
    "开满鲜花的小路1️⃣",
    "背景音乐怎么不见了<U+1F643>",
    "2020，你好❤️",
    "甜甜<U+1F63B><U+1F34E>的声音作品 - 《他若开心，我便开心-《灵魂摆渡·黄泉》》",
    "<U+1F497>迎春花儿☘️浅滩的声音作品 -《新年祝福 | 愿新年的每一天都能温暖如初》",
    " 　　不是再见  　　顾城  　　我们告别了两年  　　告别的结果  　　总是相见  　　今夜，你真要走了  　　真的走了，不是再见  　　还需要什么？  　　手凉凉的，没有手绢",
    "一定要做开心的自己喔♥",
    "第1⃣7⃣课朗读",
    "张乙～落花生" 
]

In [16]:
for i in slist:
    r = re.sub('<U+.*>', '', i)
    # 去掉全角空白符，\u3000 是全角的空白符
    r = re.sub('\u3000', '', r)
    # 替换表情
    r = filter_emoji(r, restr='')
    print(filter_space(r))

NameError: name 'filter_space' is not defined

In [20]:
import re

In [21]:
s = """hehe.da,wo
我的 呵呵大
"""
re.sub(r"\n", "", s)

'hehe.da,wo我的 呵呵大'

In [28]:
def en_trans_to_zh(string):
    """英文标点转中文"""
    en_pun = u',.!?[]()<>"\''
    zh_pun = u'，。！？【】（）《》“‘'
    table= {ord(f):ord(t) for f, t in zip(en_pun, zh_pun)}
    return string.translate(table)

In [23]:
en_trans_to_zh(s)

'hehe。da，wo\n我的 呵呵大\n'

In [24]:
def is_contain_chinese(text):
    """
    判断字符串中是否包含中文
    :param text: 需要检测的字符串
    :return: 包含返回True， 不包含返回False
    """
    for ch in text:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True
    return False

In [25]:
is_contain_chinese(s)

True

In [26]:
s2 = "we are famliy 123"

In [27]:
is_contain_chinese(s2)

False

In [33]:
?ord