<a href="https://colab.research.google.com/github/ByungjunKim/munzip/blob/main/%5BColab%5D%ED%95%9C%EA%B5%AD%EA%B3%A0%EC%A0%84%EC%A2%85%ED%95%A9DB_XML_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 한국고전종합DB XML Parsing
한국문집총간 XML : https://www.data.go.kr/data/3074298/fileData.do  
고전원문 XML : https://www.data.go.kr/data/15022432/fileData.do


In [None]:
# Install the required package (mpire provides the WorkerPool used for parallel parsing below)
!pip install -U mpire

In [None]:
import pandas as pd
import numpy as np
import lxml
# import xml.etree.ElementTree as ET
import lxml.etree as et
from tqdm import tqdm
import numpy as np
import re
from natsort import natsorted
import glob
import multiprocessing
from mpire import WorkerPool

### 문집총간 파일 github에서 가져오기(clone)

In [None]:
# Clone the repository; the XML files are then read from the ./munzip folder
!git clone https://github.com/ByungjunKim/munzip

### 파일 목록 정리

In [None]:
# Sample run: only 계원필경집 and 고운집 are handled
# Collect every XML file one folder level below ./munzip/한국문집총간, in natural sort order
file_list = natsorted(glob.glob('./munzip/한국문집총간/*/*.xml'))
file_list

In [None]:
print(len(file_list))
print(len(file_list)-2) # number of XML files that contain body text; the 2 is hard-coded (presumably the two metadata files of the sample — verify when the file set changes)

In [None]:
# Body-text XML files: those whose file name matches ITKC_MO_<...>_<digits>.xml.
# The pattern is a raw string so that \d and \. reach the regex engine unchanged
# instead of being treated as (invalid) Python string escapes.
file_list_text = [f for f in file_list if re.match(r'ITKC_MO_.+_\d+\.xml', f.split('/')[-1])]
len(file_list_text)

In [None]:
# Metadata XML files: every entry of file_list that is not a body-text file
file_list_meta = natsorted(list(set(file_list) - set(file_list_text)))
len(file_list_meta)

### 파싱 테스트 (메타정보)

In [None]:
file_list_meta[0]

In [None]:
tree = et.parse(file_list_meta[0])

In [None]:
# id attribute of the first 레벨1 element
tree.find('.//레벨1').attrib['id']

In [None]:
# Book titles (Chinese-character / Hangul): 제목 entries whose type is 한자서명 or 한글서명
[t.text for t in tree.find('.//레벨1//제목정보').findall('제목') if t.attrib['type'] in ['한자서명','한글서명']]

In [None]:
# Author: Hangul name, Chinese-character name, birth year text,
# and the 서기년 attribute of the birth (생년) and death (몰년) elements
print(tree.find('.//레벨1//저자//한글성명').text)
print(tree.find('.//레벨1//저자//한자성명').text)
print(tree.find('.//레벨1//저자//생년').text)
print(tree.find('.//레벨1//저자//생년').attrib['서기년'])
print(tree.find('.//레벨1//저자//몰년').attrib['서기년'])

In [None]:
tree.find('.//레벨1//저자//생년').text

In [None]:
# Publication year (text of the 원문간행년 element)
tree.find('.//레벨1//원문간행년').text

### 실전 XML 파싱 (메타정보)

In [None]:
def get_Meta(xml):
    """Parse one metadata XML file into a flat record.

    Args:
        xml: Source handed to ``et.parse`` (the notebook passes file paths).

    Returns:
        dict with the keys ``id``, ``한자서명``, ``한글서명``, ``한글성명``,
        ``한자성명``, ``생년``, ``생년_서기``, ``몰년``, ``몰년_서기`` and
        ``원문간행년`` (in that order). Every value except ``id`` is ``None``
        when the corresponding element or attribute is absent.

    Raises:
        AttributeError: if the document has no 레벨1 element.
        KeyError: if the 레벨1 element has no ``id`` attribute.
    """
    tree = et.parse(xml)

    def text_of(path):
        # Text of the first element matching *path*, or None when there is none.
        node = tree.find(path)
        return node.text if node is not None else None

    def attr_of(path, name):
        # Attribute *name* of the first element matching *path*, or None.
        node = tree.find(path)
        return node.get(name) if node is not None else None

    data = {'id': tree.find('.//레벨1').attrib['id']}

    # Look titles up by their type attribute rather than by position, so the
    # result does not depend on the order (or presence) of the 제목 entries.
    # The first entry of each type wins.
    titles = {}
    title_info = tree.find('.//레벨1//제목정보')
    if title_info is not None:
        for title in title_info.findall('제목'):
            titles.setdefault(title.get('type'), title.text)
    data['한자서명'] = titles.get('한자서명')
    data['한글서명'] = titles.get('한글서명')

    data['한글성명'] = text_of('.//레벨1//저자//한글성명')
    data['한자성명'] = text_of('.//레벨1//저자//한자성명')

    # Birth / death year: both the element text and its 서기년 attribute are
    # optional, so a missing element yields None for both fields.
    data['생년'] = text_of('.//레벨1//저자//생년')
    data['생년_서기'] = attr_of('.//레벨1//저자//생년', '서기년')
    data['몰년'] = text_of('.//레벨1//저자//몰년')
    data['몰년_서기'] = attr_of('.//레벨1//저자//몰년', '서기년')

    data['원문간행년'] = text_of('.//레벨1//원문간행년')

    return data

In [None]:
get_Meta(file_list_meta[0])

In [None]:
# Use several CPU cores via multiprocessing (note: may not work on Windows)
# One worker per CPU core; get_Meta is applied to every metadata file
with WorkerPool(n_jobs=multiprocessing.cpu_count()) as pool:
    meta_list = pool.map(get_Meta, file_list_meta, progress_bar=True)

In [None]:
meta = pd.DataFrame(meta_list)
meta

### 파싱 테스트 (본문)

In [None]:
tree = et.parse(file_list_text[0])

In [None]:
# id attribute of the first 레벨1 element
tree.find('.//레벨1').attrib['id']

In [None]:
tree.find('.//레벨4').attrib['DCI']

In [None]:
# id attribute of the first 레벨4 element
tree.find('.//레벨4').attrib['id']

In [None]:
# Linked items (연계항목: image, translation, collated/punctuated text) under 레벨4
tree.findall('.//레벨4//연계항목')

In [None]:
tree.findall('.//레벨4//연계항목')[0].attrib['type']

In [None]:
tree.findall('.//레벨4//연계항목')[1].attrib['type']

In [None]:
# Per-paragraph author: Hangul name and Chinese-character name of the first 레벨4
print(tree.find('.//레벨4//저자//한글성명').text)
print(tree.find('.//레벨4//저자//한자성명').text)

In [None]:
tree.findall('.//레벨4//연계항목')[1].attrib['연계시작']

In [None]:
''.join(tree.find('.//레벨4//제목정보//제목').itertext())

In [None]:
# Volume title: all text inside the first 제목 under 레벨2's 제목정보
''.join(tree.find('.//레벨2//제목정보//제목').itertext())

In [None]:
[''.join(t.itertext()) for t in tree.findall('.//레벨4//제목정보//제목')]

In [None]:
tree.find('.//레벨4//분류항목').findall('.//분류내용')

In [None]:
classification = [t.findall('.//분류내용') for t in tree.findall('.//레벨4//분류항목') if t.attrib['type']=='문체분류']
classification

In [None]:
print([c.text for l in classification for c in l if c.attrib['type']=='대분류'])
print([c.text for l in classification for c in l if c.attrib['type']=='중분류'])

In [None]:
tree.findall('.//레벨4//내용')

In [None]:
tree.findall('.//레벨4//내용')[0].find('단락').text

In [None]:
tree.findall('.//레벨4//내용')[0].findtext('단락')

In [None]:
tree.findall('.//레벨4//내용')[0].attrib

In [None]:
''.join(tree.findall('.//레벨4//내용')[0].find('단락').itertext())

In [None]:
# Remove line breaks (\n) from the joined paragraph text
re.sub('\n','',''.join(tree.findall('.//레벨4//내용')[0].find('단락').itertext()))

### 실전 XML 파싱 (본문)

In [None]:
def _subtree_text(elem, skip_tag=None):
    """Concatenate the text of *elem*'s subtree, leaving out ``skip_tag`` elements.

    Only the skipped element's own content is dropped; the text that follows
    it (its tail) is kept.
    """
    parts = [elem.text or '']
    for child in elem:
        # Comments / processing instructions carry a non-string tag; only
        # their tail is treated as document text.
        if isinstance(child.tag, str) and child.tag != skip_tag:
            parts.append(_subtree_text(child, skip_tag))
        parts.append(child.tail or '')
    return ''.join(parts)


def _flatten(text):
    """Trim surrounding whitespace, then delete every newline."""
    return text.strip().replace('\n', '')


def _optional_text(node, path):
    """Text of the first element under *node* matching *path*, or None if absent."""
    found = node.find(path)
    return found.text if found is not None else None


def _class_label(class_item, level):
    """Text of the first child of *class_item* whose type attribute equals *level*, else None."""
    for entry in class_item:
        if entry.get('type') == level:
            return entry.text
    return None


def get_Text(xml):
    """Parse one body-text XML file into per-paragraph lists.

    Paragraph units are the 레벨4 elements; when the file has none, the
    레벨3 elements are used instead.

    Args:
        xml: Source handed to ``et.parse`` (the notebook passes file paths).

    Returns:
        dict with the scalar keys ``id`` (레벨1 id) and ``권차제목`` (volume
        title) and the list-valued keys ``tid``, ``dci``,
        ``단락저자_한글성명``, ``단락저자_한자성명``, ``문체_대분류``,
        ``문체_중분류``, ``단락제목``, ``단락내용`` and ``단락내용_원주삭제``.
        Author and classification entries are ``None`` where a paragraph has
        no such value. If any paragraph has no 분류항목 element at all, the
        file path is printed and both classification lists are empty.

    Raises:
        AttributeError / KeyError: if a required element or attribute
            (레벨1 id, volume title, paragraph id/DCI/title, 내용) is missing.
    """
    tree = et.parse(xml)
    doc_id = tree.find('.//레벨1').attrib['id']

    # Volume title
    vol_title = ''.join(tree.find('.//레벨2//제목정보//제목').itertext())

    # Paragraph units live at 레벨4, or at 레벨3 when the file has no 레벨4.
    level = '레벨4' if tree.find('.//레벨4') is not None else '레벨3'
    units = tree.findall(f'.//{level}')
    bodies = tree.findall(f'.//{level}//본문정보')

    # Item id and DCI of every paragraph
    tid = [unit.attrib['id'] for unit in units]
    dci = [unit.attrib['DCI'] for unit in units]

    # Paragraph author (optional)
    author_ko = [_optional_text(unit, './/저자//한글성명') for unit in units]
    author_han = [_optional_text(unit, './/저자//한자성명') for unit in units]

    # Paragraph title
    para_title = [''.join(unit.find('.//제목정보//제목').itertext()) for unit in units]

    # Classification: exactly one entry per paragraph so the lists stay
    # aligned with tid (a paragraph without a matching entry gets None).
    class_items = [unit.find('.//분류항목') for unit in units]
    if any(item is None for item in class_items):
        # Same fallback as before: report the file and leave both lists empty.
        print(xml)
        main_class = []
        middle_class = []
    else:
        main_class = [_class_label(item, '대분류') for item in class_items]
        middle_class = [_class_label(item, '중분류') for item in class_items]

    # Body text including annotations (원주)
    content = [_flatten(''.join(body.find('내용').itertext())) for body in bodies]

    # Body text without annotations. Removing an element from an lxml tree
    # also discards the text that follows it (its tail), so the annotation-free
    # text is built by traversal instead of by mutating the tree.
    content_wo = [_flatten(_subtree_text(body.find('내용'), '원주')) for body in bodies]

    return {
        'id': doc_id,
        '권차제목': vol_title,
        'tid': tid,
        'dci': dci,
        '단락저자_한글성명': author_ko,
        '단락저자_한자성명': author_han,
        '문체_대분류': main_class,
        '문체_중분류': middle_class,
        '단락제목': para_title,
        '단락내용': content,
        '단락내용_원주삭제': content_wo,
    }

In [None]:
# One worker per CPU core; get_Text is applied to every body-text file
with WorkerPool(n_jobs=multiprocessing.cpu_count()) as pool:
    text_list = pool.map(get_Text, file_list_text, progress_bar=True)

In [None]:
df = pd.DataFrame(text_list)
df

In [None]:
# Show rows whose id is empty (the filter that would drop them is left commented out)
print(df[df['id'].map(lambda x:len(x)==0)])
# df = df[df['id'].map(lambda x:len(x)!=0)]
# Show, then drop, rows whose main-class (문체_대분류) list is empty
print(df[df['문체_대분류'].map(lambda x:len(x)==0)])
df = df[df['문체_대분류'].map(lambda x:len(x)!=0)].reset_index(drop=True)

In [None]:
df[df['tid'].str.len()!=df['문체_대분류'].str.len()]

In [None]:
# Unpack the list-valued columns (several paragraphs per file) so each paragraph gets its own row;
# every column from the third one onward is exploded together
df_explode = df.explode(df.columns[2:].tolist(),ignore_index=True)
df_explode

In [None]:
# Inspect rows that share a dci value (all occurrences are shown)
df_explode[df_explode.duplicated(subset=['dci'],keep=False)]

In [None]:
# Drop rows with a duplicated dci, keeping the first occurrence
df_explode = df_explode.drop_duplicates(subset=['dci'],keep='first').reset_index(drop=True)

In [None]:
# Combine with the metadata table. The join key is named explicitly so the
# merge cannot silently widen to other columns the two tables might share.
df_explode = pd.merge(meta, df_explode, how='inner', on='id')

In [None]:
df_explode

In [None]:
# Save as CSV (without the index column)
df_explode.to_csv('230825_한국문집총간.csv', index=None)
# Save as Excel (without the index column)
df_explode.to_excel('230825_한국문집총간.xlsx', index=None)