# PDF Reader v01

## imports

In [14]:
%load_ext lab_black

In [75]:
import os
import re
import numpy as np
import pandas as pd
import spacy

from glob import glob
from pdfminer.high_level import extract_text

# ignore warning
import warnings

warnings.filterwarnings(action="ignore")

In [40]:
nlp = spacy.load("en_core_web_sm")

# doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")

# for token in doc:
#     print(token, token.lemma, token.lemma_, token.pos_)

## 01. Data Load

In [19]:
dir_path = "../data/en"
file_list = glob(f"{dir_path}/*.pdf")

## 02. Preprocessing

In [104]:
def clean_text(text):
    """기사 내용 전처리 함수
    Args:
        - text: str 형태의 텍스트
    Return:
        - text: 전처리된 텍스트"""
    # Common
    # 개행문자 제거
    text = re.sub("\n", " ", text)
    text = re.sub("\v", " ", text)
    text = re.sub("\f", " ", text)
    # E-mail 제거#
    text = re.sub("([\w\d.]+@[\w\d.]+)", "", text)
    text = re.sub("([\w\d.]+@)", "", text)
    # 괄호 안 제거#
    text = re.sub("<[\w\s\d‘’=/·~:&,`]+>", "", text)
    text = re.sub("\([\w\s\d‘’=/·~:&,`]+\)", "", text)
    text = re.sub("\[[\w\s\d‘’=/·~:&,`]+\]", "", text)
    text = re.sub("【[\w\s\d‘’=/·~:&,`]+】", "", text)
    # 전화번호 제거#
    text = re.sub("(\d{2,3})-(\d{3,4}-\d{4})", "", text)  # 전화번호
    text = re.sub("(\d{3,4}-\d{4})", "", text)  # 전화번호
    # 홈페이지 주소 제거#
    text = re.sub("(www.\w.+)", "", text)
    text = re.sub("(.\w+.com)", "", text)
    text = re.sub("(.\w+.co.kr)", "", text)
    text = re.sub("(.\w+.go.kr)", "", text)
    # 기자 이름 제거#
    text = re.sub("/\w+[=·\w@]+\w+\s[=·\w@]+", "", text)
    text = re.sub("\w{2,4}\s기자", "", text)
    # 한자 제거#
    text = re.sub("[\u2E80-\u2EFF\u3400-\u4DBF\u4E00-\u9FBF\uF900]+", "", text)
    # 특수기호 제거#
    text = re.sub("[◇#/▶▲◆■●△①②③★○◎▽=▷☞◀ⓒ□?㈜♠☎]", "", text)
    # 따옴표 제거#
    text = re.sub("[\"'”“‘’]", "", text)
    # 2안_숫자제거#
    # text = regex.sub('[0-9]+',"",text)
    text = " ".join(text.split())
    return text

## 03. Extract Text from PDF

### 1) Sentence Split & Text Cleaning

In [105]:
# sentences = []
# doc = nlp(text)
# for sentence in doc.sents:
#     sentences.append(sentence.text)

# sentences = [clean_text(sent) for sent in sentences]
# sent_lens = [len(sent) for sent in sentences]

# avg_len = np.mean(sent_lens)
# sentences2 = [sent for sent in sentences if len(sent) > avg_len]
# # sentences2

### 2) create function

In [106]:
def get_sentences(text):
    # 1. sentence split
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    # 2. text cleaning
    sentences = [clean_text(sent) for sent in sentences]
    avg_sent_len = np.mean([len(sent) for sent in sentences])

    # 3. extract sentences
    sentences = [sent for sent in sentences if len(sent) > avg_sent_len]
    return sentences

In [107]:
sample = file_list[5]
text = extract_text(sample)

In [108]:
sents = get_sentences(text)

In [110]:
sents[:5]

['The 5G era New horizons for advanced electronics andpanies Contents Executive summary Introduction',
 'A comprehensive view of the 5G value proposition Archetypes of 5G IoT use cases',
 'Companies poised to win Myth versus reality in 5G IoT value capture 2',
 'Private-network opportunities 4 Opportunities for advancedpanies and industrials',
 'The 5G IoT module market The 5Gponent market 5 Strategic implications for advancedpanies and industrials']