In [1]:
import re
import jieba

def build_vocab(file_path='data/jaychou_lyrics.txt'):
    """Build a word-level vocabulary and an index-encoded corpus from a lyrics file.

    Every line of the file is cleaned, de-duplicated and tokenized with
    ``jieba.lcut``. Tokens receive indices in order of first appearance, and
    the encoded corpus is the concatenation of all encoded lines, each one
    followed by the index of the single-space token as a line separator.

    Args:
        file_path: Path to the lyrics text file, read as UTF-8. Defaults to
            the location used by this notebook.

    Returns:
        A tuple ``(index_to_word, word_to_index, word_count, corpus_idx)``:
            index_to_word: list of unique tokens, position == token index.
            word_to_index: dict mapping each token to its index.
            word_count: number of unique tokens.
            corpus_idx: flat list of token indices for the whole corpus.

    Raises:
        OSError: if ``file_path`` cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    separator = ' '

    # 1. Clean the text.
    clean_sentences = []
    seen_sentences = set()  # O(1) duplicate check; the list keeps the order
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            line = line.replace('〖韩语Rap译文〗', '')
            # Drop everything except Chinese characters, spaces, ASCII
            # letters, digits and a few punctuation marks (this also removes
            # the trailing newline).
            line = re.sub(r'[^\u4e00-\u9fa5 a-zA-Z0-9!?,]', '', line)
            # Collapse runs of spaces into ONE space. Replacing them with ''
            # would glue neighbouring phrases together before tokenization.
            line = re.sub(r'[ ]{2,}', ' ', line)
            # Trim surrounding whitespace.
            line = line.strip()
            # Skip empty and single-character lines.
            if len(line) <= 1:
                continue
            # Keep only the first occurrence of each line.
            if line not in seen_sentences:
                seen_sentences.add(line)
                clean_sentences.append(line)

    # 2. Tokenize the corpus and assign indices in first-seen order.
    index_to_word, all_sentences = [], []
    word_to_index = {}
    for line in clean_sentences:
        words = jieba.lcut(line)
        all_sentences.append(words)
        for word in words:
            if word not in word_to_index:
                word_to_index[word] = len(index_to_word)
                index_to_word.append(word)

    # The separator is appended after every line below, so it must be in the
    # vocabulary even when no cleaned line contained a space; otherwise the
    # lookup raises KeyError. Nothing is added when there are no lines at all.
    if all_sentences and separator not in word_to_index:
        word_to_index[separator] = len(index_to_word)
        index_to_word.append(separator)

    # Vocabulary size.
    word_count = len(index_to_word)

    # 3. Encode each line as indices, with a separator after every line.
    corpus_idx = []
    for sentence in all_sentences:
        corpus_idx.extend(word_to_index[word] for word in sentence)
        corpus_idx.append(word_to_index[separator])

    return index_to_word, word_to_index, word_count, corpus_idx

# Run the vocabulary builder; as the last expression of the cell, the returned
# tuple is shown as the cell output.
build_vocab()

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/z8/y_hn633943lgllk3gb3g61r00000gn/T/jieba.cache
Loading model cost 0.668 seconds.
Prefix dict has been built successfully.


(['想要',
  '有',
  '直升机',
  '和',
  '你',
  '飞到',
  '宇宙',
  '去',
  '融化',
  '在',
  '一起',
  '里',
  '我',
  '每天',
  '想想',
  '著',
  '这样',
  '的',
  '甜蜜',
  '让',
  '开始',
  '相信',
  '命运',
  '感谢',
  '地心引力',
  '碰到',
  '漂亮',
  '面红',
  '可爱',
  '女人',
  '温柔',
  '心疼',
  '透明',
  '感动',
  '坏坏',
  '疯狂',
  '乡',
  '如果说',
  '怀疑',
  ' ',
  '可以',
  '造句',
  '分离',
  '能够',
  '翻译',
  '如果',
  '这',
  '一切',
  '真的',
  '将',
  '寂寞',
  '封闭',
  '然后',
  '这里',
  '不',
  '限',
  '日期',
  '过去',
  '慢慢',
  '温习',
  '爱上你',
  '那场',
  '悲剧',
  '是',
  '完美',
  '演出',
  '一场',
  '戏',
  '宁愿',
  '心碎',
  '哭泣',
  '再',
  '狠狠',
  '忘记',
  '爱过',
  '证据',
  '晶莹',
  '泪滴',
  '闪烁',
  '成',
  '回忆',
  '伤人',
  '美丽',
  '完美主义',
  '太',
  '彻底',
  '连',
  '恨',
  '都',
  '难以',
  '下笔',
  '真心',
  '抽离',
  '写成',
  '日记',
  '像是',
  '默剧',
  '分手',
  '的话',
  '像',
  '语言',
  '暴力',
  '已',
  '无能为力',
  '提起',
  '决定',
  '中断',
  '熟悉',
  '周杰伦',
  '一步',
  '两步',
  '三步',
  '四步',
  '望',
  '著天',
  '看',
  '星星',
  '一颗',
  '两颗',
  '三颗',
  '四颗',
  '连成线',
  '乘著风',
  '游荡',
  '蓝天',
  '边',
  '一片',
 