# Extract speakers and what they say from the book

By LongGang Pang

In [12]:
import os
from tqdm import tqdm_notebook

In [2]:
def get_conversations(sentence):
    '''Extract colon-introduced quotations from one paragraph of text.

    A conversation opens at a quote character that directly follows a
    colon (ASCII ':' or full-width '：') and closes at the next quote
    character.  Quotes that do not follow a colon (e.g. quoted names
    inside narration) never open a conversation.

    Args:
      sentence: str, the paragraph to scan.

    Returns:
      talks: list of dict, one per conversation, with the keys
        'istart'  -- index of the opening quote in the sentence,
        'iend'    -- index of the closing quote in the sentence,
        'talk'    -- the text between the two quotes,
        'context' -- the text preceding the opening quote (back to the
                     previous conversation or the paragraph start).
      contexts: list of str with len(talks) + 1 entries when talks is
        non-empty (the last entry is the text after the final closing
        quote, possibly ''), otherwise an empty list.
    '''
    quote_symbols = ('"', '“', '”')
    colon_symbols = (':', '：')
    istart = -1  # index of the opening quote; -1 while outside a quotation
    talks = []
    # Scanning starts at 1 because an opening quote needs a preceding colon.
    for i in range(1, len(sentence)):
        if sentence[i] not in quote_symbols:
            continue
        if istart != -1:
            # Closing quote of the conversation currently open.
            talks.append({'istart': istart, 'iend': i,
                          'talk': sentence[istart+1:i]})
            istart = -1
        elif sentence[i-1] in colon_symbols:
            # Only a quote that did NOT just close a conversation may open
            # one; otherwise a talk ending in a colon would make its own
            # closing quote start a bogus second conversation.
            istart = i
    # Collect the narration around each talk; the speaker is expected to be
    # named in the text that comes before the talk.
    contexts = []
    if talks:
        previous_end = -1
        for talk in talks:
            contexts.append(sentence[previous_end+1:talk['istart']])
            previous_end = talk['iend']
        # Text after the last conversation; slicing past the end simply
        # yields '' when the paragraph ends with the closing quote.
        contexts.append(sentence[previous_end+1:])
        # The case where the speaker is named after the talk is not handled.
        for talk, context in zip(talks, contexts):
            talk['context'] = context
    return talks, contexts


In [13]:
def extract_corpus(book_name="hongloumeng.txt", save_as="honglou.py", encoding=None):
    '''Write every conversation found in a book to an importable Python file.

    Each line of the book is stripped and passed to get_conversations; the
    repr of every resulting talk dict is written as one element of a list
    literal named ``talks`` in the output file.

    Args:
      book_name: str, path of the text file to read, one paragraph per line.
      save_as: str, path of the Python module to create (overwritten).
      encoding: str or None, encoding used to read the book; None keeps
        the platform default used by open().
    '''
    # Open the input first so a missing book does not truncate an existing
    # output file, and use `with` for both handles so they are closed even
    # if parsing raises.  The output is a Python source file, which the
    # interpreter decodes as UTF-8 by default, so it is written as UTF-8
    # regardless of the locale.
    with open(book_name, "r", encoding=encoding) as fin, \
            open(save_as, "w", encoding="utf-8") as fout:
        fout.write('#!/usr/bin/env python\n')
        fout.write('talks = [')
        # readlines() gives the progress bar a known total.
        for line in tqdm_notebook(fin.readlines()):
            talks, _ = get_conversations(line.strip())
            for talk in talks:
                fout.write(repr(talk))
                fout.write(',\n')
        fout.write(']')

In [14]:
# Run the extraction with the default input and output file names.
extract_corpus()




In [15]:
from honglou import talks

In [18]:
# Show the last five extracted conversation records.
print(talks[-5:])

[{'istart': 37, 'iend': 106, 'talk': '汝父年将半百，再无续室之意，且汝多病，年又极小，上无亲母教养，下无姊妹兄弟扶持，今依傍外祖母及舅氏姊妹去，正好减我顾盼之忧，何反云不往？', 'context': '那女学生黛玉，身体方愈，原不忍弃父而往，无奈他外祖母致意务去，且兼如海说：'}, {'istart': 555, 'iend': 571, 'talk': '刚才老太太还念呢，可巧就来了。', 'context': '且说黛玉自那日弃舟登岸时，便有荣国府打发了轿子并拉行李的车辆久候了．这林黛玉常听得母亲说过，他外祖母家与别家不同．他近日所见的这几个三等仆妇，吃穿用度，已是不凡了，何况今至其家．因此步步留心，时时在意，不肯轻易多说一句话，多行一步路，惟恐被人耻笑了他去．自上了轿，进入城中从纱窗向外瞧了一瞧，其街市之繁华，人烟之阜盛，自与别处不同．又行了半日，忽见街北蹲着两个大石狮子，三间兽头大门，门前列坐着十来个华冠丽服之人．正门却不开，只有东西两角门有人出入．正门之上有一匾，匾上大书"敕造宁国府"五个大字．黛玉想道：这必是外祖之长房了．想着，又往西行，不多远，照样也是三间大门，方是荣国府了．却不进正门，只进了西边角门．那轿夫抬进去，走了一射之地，将转弯时，便歇下退出去了．后面的婆子们已都下了轿，赶上前来．另换了三四个衣帽周全十七八岁的小厮上来，复抬起轿子．众婆子步下围随至一垂花门前落下．众小厮退出，众婆子上来打起轿帘，扶黛玉下轿．林黛玉扶着婆子的手，进了垂花门，两边是抄手游廊，当中是穿堂，当地放着一个紫檀架子大理石的大插屏．转过插屏，小小的三间厅，厅后就是后面的正房大院．正面五间上房，皆雕梁画栋，两边穿山游廊厢房，挂着各色鹦鹉，画眉等鸟雀．台矶之上，坐着几个穿红着绿的丫头，一见他们来了，便忙都笑迎上来，说：'}, {'istart': 592, 'iend': 599, 'talk': '林姑娘到了。', 'context': '于是三四人争着打起帘笼，一面听得人回话：'}, {'istart': 149, 'iend': 179, 'talk': '这是你大舅母，这是你二舅母，这是你先珠大哥的媳妇珠大嫂子。', 'context': '黛玉方进入房时，只见两个人搀着一位鬓发如银的老母迎上来，黛玉便知是他外祖母．方欲拜见时，早被