# PDF Reader v02

## Imports

In [1]:
# %load_ext lab_black

In [2]:
import sys

# Put the parent directory on the import path — presumably so that the local
# `types_` module imported below resolves; confirm against the repo layout.
sys.path.append("..")

In [3]:
import os
import re
import numpy as np
import pandas as pd
import spacy
import kss

from glob import glob
from pdfminer.high_level import extract_text
from types_ import *

# ignore warning
import warnings

warnings.filterwarnings(action="ignore")

In [4]:
from konlpy.tag import Kkma

In [5]:
# Load the spaCy "en_core_web_sm" pipeline once at module level; pdf_to_text
# calls it to split English text into sentences.
nlp = spacy.load("en_core_web_sm")

## 01. Data Load

In [6]:
# Directory that is searched for the input PDF files.
dir_path = "../data/ko/raw"
# glob() returns matches in arbitrary, filesystem-dependent order; sort them so
# that positional picks such as file_list[1] select the same file on every run.
file_list = sorted(glob(f"{dir_path}/*.pdf"))

## 02. Preprocessing

In [12]:
def clean_text(text, remove_num=True):
    """Strip boilerplate from a piece of article text.

    The following are removed, in this order: line/page breaks (replaced by a
    space), e-mail addresses, short bracketed asides, phone numbers, web
    addresses, reporter bylines, Hanja characters, decorative symbols, quote
    characters and (optionally) digits. Whitespace runs are then collapsed to
    single spaces and the ends are trimmed.

    Args:
        text: The text to clean, as a ``str``.
        remove_num: When true (the default), every run of ASCII digits is
            deleted as well.

    Returns:
        The cleaned text.
    """
    # Line breaks, vertical tabs and form feeds become plain spaces.
    text = re.sub(r"[\n\v\f]", " ", text)

    # E-mail addresses, then any dangling "local-part@" that is left over.
    text = re.sub(r"([\w\d.]+@[\w\d.]+)", "", text)
    text = re.sub(r"([\w\d.]+@)", "", text)

    # Bracketed asides: <...>, (...), [...], and 【...】 whose content is made
    # only of the characters allowed by `bracket_body`.
    bracket_body = r"[\w\s\d‘’=/·~:&,`]+"
    bracket_pairs = (("<", ">"), (r"\(", r"\)"), (r"\[", r"\]"), ("【", "】"))
    for opener, closer in bracket_pairs:
        text = re.sub(opener + bracket_body + closer, "", text)

    # Phone numbers, with and without an area code.
    text = re.sub(r"(\d{2,3})-(\d{3,4}-\d{4})", "", text)
    text = re.sub(r"(\d{3,4}-\d{4})", "", text)

    # Web addresses. The match stops at the next whitespace: line breaks were
    # already turned into spaces above, so a greedy ".+" here would delete
    # everything that follows the first "www" in the text.
    text = re.sub(r"www\.\S+", "", text)
    # Bare host names ending in .com / .co.kr / .go.kr. Dots are escaped so
    # that ordinary words (e.g. "welcome") are left alone, and the lookahead
    # keeps longer suffixes such as ".community" from being cut in half.
    text = re.sub(
        r"[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*\.(?:com|co\.kr|go\.kr)(?![A-Za-z])",
        "",
        text,
    )

    # Reporter bylines: "/name=agency ..." style credits and "<name> 기자".
    text = re.sub(r"/\w+[=·\w@]+\w+\s[=·\w@]+", "", text)
    text = re.sub(r"\w{2,4}\s기자", "", text)

    # Hanja. The last range covers the whole CJK Compatibility Ideographs
    # block (U+F900-U+FAFF) rather than only its first code point.
    text = re.sub("[\u2E80-\u2EFF\u3400-\u4DBF\u4E00-\u9FBF\uF900-\uFAFF]+", "", text)

    # Decorative bullets and other special symbols.
    text = re.sub("[◇#/▶▲◆■●△①②③★○◎▽=▷☞◀ⓒ□?㈜♠☎]", "", text)

    # Straight and curly quotation marks.
    text = re.sub("[\"'”“‘’]", "", text)

    if remove_num:
        # Optional: drop every run of digits.
        text = re.sub(r"[0-9]+", "", text)

    # Collapse whitespace runs left behind by the removals above.
    return " ".join(text.split())

## 03. Extract Text from PDF

In [15]:
def pdf_to_text(file_path: str, language: str) -> List[str]:
    """Extract a PDF's text and return its cleaned, length-filtered sentences.

    Args:
        file_path: Path of the PDF handed to pdfminer's ``extract_text``.
        language: ``'en'`` segments sentences with the module-level spaCy
            pipeline ``nlp``; any other value splits the text on ``'. '``.

    Returns:
        The sentences after ``clean_text``. For ``'en'`` only sentences longer
        than the mean cleaned length are kept; otherwise only sentences whose
        length lies strictly between the mean and the 0.75 quantile are kept.
    """
    raw = extract_text(file_path)

    if language == 'en':
        # spaCy decides the sentence boundaries; each span is cleaned afterwards.
        cleaned = [clean_text(span.text) for span in nlp(raw).sents]
        lengths = [len(item) for item in cleaned]
        mean_len = np.mean(lengths)
        return [item for item, size in zip(cleaned, lengths) if size > mean_len]

    # Every other language: naive split on a period followed by a space.
    cleaned = [clean_text(piece) for piece in raw.split('. ')]
    lengths = [len(item) for item in cleaned]
    upper_len = np.quantile(lengths, q=0.75)
    mean_len = np.mean(lengths)
    # Keep the band between the mean and the 75th percentile (both exclusive).
    return [
        item
        for item, size in zip(cleaned, lengths)
        if mean_len < size < upper_len
    ]

In [16]:
%%time

# Try the extraction on a single PDF (the second entry of file_list), going
# through pdf_to_text's non-English branch.
sample = file_list[1]
sents = pdf_to_text(sample, language='ko')

Wall time: 17.5 s


In [9]:
# Number of sentences that pdf_to_text returned for the sample PDF.
len(sents)

281

In [8]:
# Toy input (the integers 0..99) for checking how np.quantile behaves.
a = list(range(100))

In [10]:
# 0.75 quantile of the toy list — the same call pdf_to_text uses for its upper
# sentence-length cutoff.
np.quantile(a, q=0.75)

74.25