In [1]:
# encoding=utf-8
import json
import re
from xml.etree import ElementTree as ET
from xml.dom import minidom

In [35]:
def chunkIt(seq, num):
    """Split ``seq`` into ``num`` consecutive chunks of near-equal length.

    Chunk ``i`` is ``seq[i * len(seq) // num:(i + 1) * len(seq) // num]``.
    Boundaries are computed with integer arithmetic: accumulating a float
    step (the previous approach) drifts, e.g. ``chunkIt([0], 10)`` used to
    return 11 chunks because ten additions of 0.1 stay below 1.0.

    :param seq: Sliceable sequence (list, str, tuple, ...).
    :param num: Number of chunks; a positive whole number (``4`` or ``4.0``).
    :return: List with exactly ``num`` slices of ``seq``; ``[]`` when ``seq``
        is empty (kept for backward compatibility).
    :raises ValueError: If ``num`` is not a positive whole number.
    """
    count = int(num)
    if count != num or count <= 0:
        raise ValueError("num must be a positive whole number, got %r" % (num,))

    size = len(seq)
    if size == 0:
        # Historical behaviour: an empty input yields no chunks at all.
        return []

    return [seq[i * size // count:(i + 1) * size // count] for i in range(count)]

In [36]:
def flatten(l):
    """Return one flat list holding the items of every element of ``l``.

    Only one level of nesting is removed; strings inside ``l`` are expanded
    into their individual characters.
    """
    flat = []
    for group in l:
        flat.extend(group)
    return flat

In [37]:
def load_data():
    """Load the reference data files from the ``data/`` directory.

    Files are opened explicitly as UTF-8 so that reading the Chinese text
    does not depend on the platform's locale encoding.
    NOTE(review): assumes the data files are UTF-8, matching the
    ``# encoding=utf-8`` header of this notebook - confirm.

    :return: An 8-tuple
        ``(rhymes, T2C, Dict, metric_names, P_PING_unflattened,
        Z_PING_unflattened, P_PING, Z_PING)`` where

        * ``rhymes`` is the parsed ``data/rhymebooks.json``,
        * ``T2C`` is the parsed ``data/TC2SC.json``,
        * ``Dict`` is the parsed ``data/kangxi.json``,
        * ``metric_names`` is the list of stripped lines of
          ``data/metric.name``,
        * ``P_PING_unflattened`` / ``Z_PING_unflattened`` are entries 0 and 1
          of ``rhymes['平水韵']``,
        * ``P_PING`` / ``Z_PING`` are those two entries flattened by one level.
    """
    def _read_json(path):
        # Small helper so every JSON file is opened the same way.
        with open(path, encoding="utf-8") as handle:
            return json.load(handle)

    rhymes = _read_json("data/rhymebooks.json")
    T2C = _read_json("data/TC2SC.json")
    Dict = _read_json("data/kangxi.json")
    with open("data/metric.name", encoding="utf-8") as handle:
        metric_names = [line.strip() for line in handle]

    P_PING_unflattened = rhymes['平水韵'][0]
    Z_PING_unflattened = rhymes['平水韵'][1]
    P_PING = flatten(P_PING_unflattened)
    Z_PING = flatten(Z_PING_unflattened)

    return rhymes, T2C, Dict, metric_names, P_PING_unflattened, Z_PING_unflattened, P_PING, Z_PING

In [57]:
def preprocess(filename):
    """Read one poem file and derive its characters and tone marks.

    Layout read from the file: line 1 is the title, line 2 the signature,
    every following non-empty line is a verse line. The characters
    ``，`` ``。`` ``,`` ``.`` and spaces are removed from every line.

    Each verse character gets a tone mark: ``'1'`` if it appears in
    ``P_PING`` only, ``'1/0'`` if it appears in both ``P_PING`` and
    ``Z_PING``, ``'0'`` otherwise.

    :param filename: Path of the poem file (read as UTF-8).
    :return: ``(tokens, tones, title, signed, verse, yan, isJue)`` where
        ``tokens`` are the verse characters, ``tones`` the parallel tone
        marks, ``verse`` the cleaned verse lines, ``yan`` the length of the
        first verse line and ``isJue`` is True when there are exactly four
        verse lines.
    :raises ValueError: If the file has no verse lines after the two
        header lines.
    """
    _, _, _, _, _, _, P_PING, Z_PING = load_data()
    # Sets make the per-character membership tests O(1); the lists hold
    # single-character strings, so they are hashable.
    ping_chars = frozenset(P_PING)
    ze_chars = frozenset(Z_PING)

    with open(filename, encoding="utf-8") as op:
        lines = [re.sub(r'[，。, .]', "", line.strip()) for line in op]
    if len(lines) < 3:
        raise ValueError(
            "%s: expected a title line, a signature line and verse lines" % filename)

    title = lines[0]
    signed = lines[1]
    # Blank lines (e.g. a trailing newline at the end of the file) are not
    # verse lines; counting them used to misclassify a quatrain as "律".
    verse = [line for line in lines[2:] if line]
    if not verse:
        raise ValueError("%s: no verse lines found" % filename)

    yan = len(verse[0])
    isJue = len(verse) == 4
    tokens = [token for token in flatten(verse) if token]

    tones = []
    for t in tokens:
        if t in ping_chars:
            tones.append('1/0' if t in ze_chars else '1')
        else:
            tones.append('0')
    return tokens, tones, title, signed, verse, yan, isJue

In [39]:
def xml_write(root, filepath):
    """Write ``root`` to ``filepath`` as UTF-8 XML with one XML declaration.

    The tree is round-tripped through ``minidom`` and serialized without
    extra indentation or newlines.

    :param root: ``xml.etree.ElementTree.Element`` to serialize.
    :param filepath: Destination path; the file is overwritten.
    """
    rough_string = ET.tostring(root, 'utf-8')
    reparsed = minidom.parseString(rough_string)
    # Serialize the document element only: Document.toprettyxml() emits its
    # own '<?xml version="1.0" ?>' declaration, which combined with the
    # explicit header below produced two declarations (not well-formed XML).
    body = reparsed.documentElement.toprettyxml(indent='', newl="")
    with open(filepath, 'w', encoding='utf-8') as handle:
        handle.write('<?xml version="1.0" encoding="UTF-8"?>')
        handle.write(body)

In [78]:
def metricsAnalytics(tokens, tones, yan, isJue):
    """Name a poem's metrical type and compare its tones with the template.

    The type name is assembled from four parts and used as the key into
    ``data/m.json`` (read as UTF-8):

    * ``'五'`` when ``yan == 5``, otherwise ``'七'``;
    * ``'绝'`` when ``isJue`` is true, otherwise ``'律'``;
    * ``'平起'`` when the second tone is ``'1'``, otherwise ``'仄起'``;
    * ``'首句入韵'`` when the last tone of the first line is ``'1'``,
      otherwise ``'首句不入韵'``.

    :param tokens: Verse characters; accepted for interface compatibility,
        not used by the computation.
    :param tones: Tone marks (``'1'``, ``'0'`` or ``'1/0'``), one per
        character.
    :param yan: Number of characters per verse line.
    :param isJue: True for a four-line poem.
    :return: ``(met_type, met, met_real, rhyme)``. ``met`` is the template
        as a list of ``'+'`` (template ``'1'``) / ``'-'`` symbols;
        ``met_real`` follows the template where the actual tone matches it
        or is ``'1/0'``, and otherwise shows the actual tone; ``rhyme`` is
        the rhyme-scheme string for the poem.
    :raises ValueError: If ``tones`` is longer than the selected template.
    :raises KeyError: If ``data/m.json`` has no entry for the derived type.
    """
    # Only the template file is needed here; the other reference data files
    # used to be re-read on every call without any of their contents being
    # used.
    with open('data/m.json', encoding='utf-8') as json_file:
        metric = json.load(json_file)

    met_type = '五' if yan == 5 else '七'
    met_type += '绝' if isJue else '律'
    met_type += '平起' if tones[1] == '1' else '仄起'

    if tones[yan - 1] == '1':
        met_type += '首句入韵'
        rhyme = "aaba" if isJue else "aabababa"
    else:
        met_type += '首句不入韵'
        rhyme = "abcb" if isJue else "abcbabcb"

    pattern = metric[met_type]
    if len(tones) > len(pattern):
        raise ValueError(
            "poem has %d tones but template %s only has %d positions"
            % (len(tones), met_type, len(pattern)))

    met = ['+' if m == '1' else '-' for m in pattern]

    met_real = []
    for k, v in enumerate(tones):
        if v == pattern[k] or v == '1/0':
            # Matching or ambiguous tone: report the template symbol.
            met_real.append(met[k])
        else:
            met_real.append('+' if v == '1' else '-')
    return met_type, met, met_real, rhyme

In [79]:
# Build the tone-pattern templates from the data/*.m files and store them in
# data/m.json, keyed by "<verse type><opening tone><first-line rhyme>".
filelist = ["5jue", "5lv", "7jue", "7lv"]
verse_type = ["五绝", "五律", "七绝", "七律"]  # parallel to filelist
begin_type = ["仄起", "平起"]
begin_rhyme = ["首句不入韵", "首句入韵"]
metrics = {}
head = ""
for file, head in zip(filelist, verse_type):
    # Explicit UTF-8: the template files contain Chinese characters and must
    # not depend on the platform's locale encoding.
    with open("data/" + file + ".m", encoding="utf-8") as op:
        lines = op.readlines()

    # Each template occupies a block of lines whose first line is skipped
    # (presumably a heading - TODO confirm): 1 + 4 lines in "jue" files,
    # 1 + 8 lines otherwise.
    block_len = 5 if file[1:] == "jue" else 9
    m_list = [re.sub(r'[，。, .]', "", line.strip())
              for ind, line in enumerate(lines) if ind % block_len != 0]

    chunked_m = chunkIt(m_list, 4)
    if len(chunked_m) < 4:
        raise ValueError("data/%s.m does not contain four templates" % file)

    # "平" becomes '1', every other character becomes '0'; the lines of one
    # template are concatenated into a single string.
    patterns = [
        ''.join('1' if char == "平" else '0' for sent in chunk for char in sent)
        for chunk in chunked_m
    ]

    # Templates appear in the file in the order given by
    # begin_type x begin_rhyme.
    names = [head + t + r for t in begin_type for r in begin_rhyme]
    for name, pattern in zip(names, patterns):
        metrics[name] = pattern

# metrics
obj = json.dumps(metrics, indent=4, sort_keys=True, ensure_ascii=False)
print(obj)
with open('data/m.json', 'w', encoding='utf-8') as wr:
    wr.write(obj)

{
    "七律仄起首句不入韵": "00111001100011110011000110010011100110001111001100011001",
    "七律仄起首句入韵": "00110011100011110011000110010011100110001111001100011001",
    "七律平起首句不入韵": "11001100011001001110011000111100110001100100111001100011",
    "七律平起首句入韵": "11000110011001001110011000111100110001100100111001100011",
    "七绝仄起首句不入韵": "0011100110001111001100011001",
    "七绝仄起首句入韵": "0011001110001111001100011001",
    "七绝平起首句不入韵": "1100110001100100111001100011",
    "七绝平起首句入韵": "1100011001100100111001100011",
    "五律仄起首句不入韵": "0011011001111000001100110110011110000011",
    "五律仄起首句入韵": "0001111001111000001100110110011110000011",
    "五律平起首句不入韵": "1110000011001101100111100000110011011001",
    "五律平起首句入韵": "1100100011001101100111100000110011011001",
    "五绝仄起首句不入韵": "00110110011110000011",
    "五绝仄起首句入韵": "00011110011110000011",
    "五绝平起首句不入韵": "11100000110011011001",
    "五绝平起首句入韵": "11001000110011011001"
}


In [89]:

def build_tei_object(filenames, metrics):
    """Build a TEI document for a list of poem files and save it.

    For every file the poem is read with ``preprocess`` and analysed with
    ``metricsAnalytics``; each verse line becomes an ``<l>`` element whose
    last character is wrapped in a ``<rhyme>`` child. The pretty-printed
    document is written to ``output.xml`` (UTF-8) in the current directory
    and also printed.

    :param filenames: Iterable of poem file paths.
    :param metrics: Not used by this function; kept so existing calls keep
        working.
    :return: The root ``TEI`` ``ElementTree`` element.
    """
    #  Following https://teibyexample.org/examples/TBED04v00.htm
    TEI = ET.Element("TEI", {"xmlns": "http://www.tei-c.org/ns/1.0"})

    teiHeader = ET.SubElement(TEI, "teiHeader")
    fileDesc = ET.SubElement(teiHeader, "fileDesc")
    titleStmt = ET.SubElement(fileDesc, "titleStmt")
    top_title = ET.SubElement(titleStmt, "title")
    top_title.text = "格律诗"
    publicationStmt = ET.SubElement(fileDesc, "publicationStmt")
    top_p = ET.SubElement(publicationStmt, "p")
    top_p.text = "Text Technology"
    sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
    top_p = ET.SubElement(sourceDesc, "p")
    top_p.text = "-DFW-"

    encodingDesc = ET.SubElement(teiHeader, "encodingDesc")
    # Raw string: "\+" and "\-" are invalid escape sequences in a normal
    # string literal (deprecated; a SyntaxWarning on recent Python). The
    # resulting attribute value is unchanged.
    metDecl = ET.SubElement(encodingDesc, "metDecl", {"pattern": r"(\+|\-)+"})
    # Level tone. The spelling of the element texts below is kept exactly
    # as previously emitted.
    metSym = ET.SubElement(metDecl, "metSym", {"value": "+"})
    metSym.text = "metrical promimence"
    # Oblique tone.
    metSym = ET.SubElement(metDecl, "metSym", {"value": "-"})
    metSym.text = "metrical non-promimence"

    text = ET.SubElement(TEI, "text")
    body = ET.SubElement(text, "body")
    top_lg = ET.SubElement(body, "lg")

    for fn in filenames:
        lg = ET.SubElement(top_lg, "lg", {"type": "poem"})
        Tokens, Tones, Title, Signed, Verse, Yan, isJue = preprocess(fn)
        Met_type, Met, Met_real, Rhyme = metricsAnalytics(Tokens, Tones, Yan, isJue)

        head = ET.SubElement(lg, "head")
        title = ET.SubElement(head, "title")
        title.text = Title

        llg = ET.SubElement(lg, "lg", {"type": Met_type, "rhyme": Rhyme})
        for i, line in enumerate(Verse):
            # Template and actual symbols belonging to verse line i.
            met = Met[Yan * i:Yan * (i + 1)]
            real_met = Met_real[Yan * i:Yan * (i + 1)]

            attrs = {"met": ''.join(met)}
            if met != real_met:
                # Only record the actual pattern when it deviates.
                attrs["real"] = ''.join(real_met)
            attrs["rhyme"] = Rhyme[i]

            l = ET.SubElement(llg, "l", attrs)
            l.text = line[:-1]
            rrhyme = ET.SubElement(l, "rhyme")
            rrhyme.text = line[-1]

        signed = ET.SubElement(lg, "signed")
        signed.text = Signed

    rough_string = ET.tostring(TEI, 'utf-8')
    reparsed = minidom.parseString(rough_string)
    # Pretty-print once and reuse the result for both the file and stdout.
    pretty = reparsed.toprettyxml(indent="\t")
    # The declaration produced by minidom names no encoding, so XML readers
    # assume UTF-8; write the file as UTF-8 regardless of the locale.
    with open('output.xml', "w", encoding="utf-8") as output:
        output.write(pretty)
    print(pretty)

    return TEI


In [90]:
# Analyse the nine sample poems data/verse1.raw ... data/verse9.raw.
root = "data/verse"
build_tei_object([root + str(number) + ".raw" for number in range(1, 10)], metrics)

<?xml version="1.0" ?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
	<teiHeader>
		<fileDesc>
			<titleStmt>
				<title>格律诗</title>
			</titleStmt>
			<publicationStmt>
				<p>Text Technology</p>
			</publicationStmt>
			<sourceDesc>
				<p>-DFW-</p>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<metDecl pattern="(\+|\-)+">
				<metSym value="+">metrical promimence</metSym>
				<metSym value="-">metrical non-promimence</metSym>
			</metDecl>
		</encodingDesc>
	</teiHeader>
	<text>
		<body>
			<lg>
				<lg type="poem">
					<head>
						<title>登鹳雀楼</title>
					</head>
					<lg rhyme="abcb" type="五绝仄起首句不入韵">
						<l met="--++-" rhyme="a">
							白日依山
							<rhyme>尽</rhyme>
						</l>
						<l met="++--+" rhyme="b">
							黄河入海
							<rhyme>流</rhyme>
						</l>
						<l met="+++--" real="-++--" rhyme="c">
							欲穷千里
							<rhyme>目</rhyme>
						</l>
						<l met="---++" rhyme="b">
							更上一层
							<rhyme>楼</rhyme>
						</l>
					</lg>
					<signed>王之涣</signed>
				</lg>
				<lg

<Element 'TEI' at 0x11ba99278>