# PDF Reader v02

## imports

In [1]:
%load_ext lab_black

In [6]:
import sys

sys.path.append("..")

In [11]:
import os
import re
import numpy as np
import pandas as pd
import spacy

from glob import glob
from pdfminer.high_level import extract_text
from types_ import *

# ignore warning
import warnings

warnings.filterwarnings(action="ignore")

In [8]:
nlp = spacy.load("en_core_web_sm")

## 01. Data Load

In [21]:
dir_path = "../data/en"
file_list = glob(f"{dir_path}/*.pdf")

## 02. Preprocessing

In [16]:
def clean_text(text):
    """기사 내용 전처리 함수
    Args:
        - text: str 형태의 텍스트
    Return:
        - text: 전처리된 텍스트"""
    # Common
    # 개행문자 제거
    text = re.sub("\n", " ", text)
    text = re.sub("\v", " ", text)
    text = re.sub("\f", " ", text)
    # E-mail 제거#
    text = re.sub("([\w\d.]+@[\w\d.]+)", "", text)
    text = re.sub("([\w\d.]+@)", "", text)
    # 괄호 안 제거#
    text = re.sub("<[\w\s\d‘’=/·~:&,`]+>", "", text)
    text = re.sub("\([\w\s\d‘’=/·~:&,`]+\)", "", text)
    text = re.sub("\[[\w\s\d‘’=/·~:&,`]+\]", "", text)
    text = re.sub("【[\w\s\d‘’=/·~:&,`]+】", "", text)
    # 전화번호 제거#
    text = re.sub("(\d{2,3})-(\d{3,4}-\d{4})", "", text)  # 전화번호
    text = re.sub("(\d{3,4}-\d{4})", "", text)  # 전화번호
    # 홈페이지 주소 제거#
    text = re.sub("(www.\w.+)", "", text)
    text = re.sub("(.\w+.com)", "", text)
    text = re.sub("(.\w+.co.kr)", "", text)
    text = re.sub("(.\w+.go.kr)", "", text)
    # 기자 이름 제거#
    text = re.sub("/\w+[=·\w@]+\w+\s[=·\w@]+", "", text)
    text = re.sub("\w{2,4}\s기자", "", text)
    # 한자 제거#
    text = re.sub("[\u2E80-\u2EFF\u3400-\u4DBF\u4E00-\u9FBF\uF900]+", "", text)
    # 특수기호 제거#
    text = re.sub("[◇#/▶▲◆■●△①②③★○◎▽=▷☞◀ⓒ□?㈜♠☎]", "", text)
    # 따옴표 제거#
    text = re.sub("[\"'”“‘’]", "", text)
    # 2안_숫자제거#
    # text = regex.sub('[0-9]+',"",text)
    text = " ".join(text.split())
    return text

## 03. Extract Text from PDF

In [17]:
def pdf_to_text(file_path: str) -> List[str]:
    # 1. pdf -> text
    text = extract_text(file_path)
    # 2. sentence split
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    # 3. text cleaning
    sentences = [clean_text(sent) for sent in sentences]
    avg_sent_len = np.mean([len(sent) for sent in sentences])

    # 3. extract sentences
    sentences = [sent for sent in sentences if len(sent) > avg_sent_len]
    return sentences

In [28]:
%%time

sample = file_list[2]
sents = pdf_to_text(sample)

CPU times: user 1.16 s, sys: 2.27 ms, total: 1.16 s
Wall time: 1.16 s


In [29]:
len(sents)

182