# Data Preprocessing

## Imports

In [1]:
%load_ext lab_black

In [2]:
import sys

sys.path.append("..")

In [3]:
import glob
import os
import pickle
import platform  # os check
import re
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm

from utils import get_tokenizer
from utils.types_ import *

## 1. Data Load

- 전체 데이터 한번에 묶어서 처리하기

In [4]:
# Collect every pickled article file, one sub-directory per keyword.
data_paths = sorted(glob.glob("../data/origins/*/*.txt"))
data_dict = defaultdict(list)

for data_path in tqdm(data_paths):
    # The parent directory name is the keyword. os.path is used instead of
    # splitting on "/" because glob returns OS-native separators, so the
    # split picks the wrong component on Windows.
    keyword = os.path.basename(os.path.dirname(data_path))
    with open(data_path, "rb") as fp:
        # NOTE(review): despite the .txt suffix these files are pickles;
        # only load files produced by this project (pickle can execute code).
        data_dict[keyword].extend(pickle.load(fp))

100%|██████████| 10/10 [00:00<00:00, 37.50it/s]


In [5]:
# data_dict['탄핵'][-1]

In [6]:
def data_merge(dir_path: str):
    """Load every pickled article file under ``dir_path`` grouped by keyword.

    Files are matched with ``{dir_path}/*/*.txt`` and read in sorted path
    order; the name of each file's parent directory is used as the keyword.

    Args:
        - dir_path: root directory containing one sub-directory per keyword
    Return:
        - data_dict: defaultdict(list) mapping keyword -> concatenated
          contents of all files in that keyword's directory
    """
    data_paths = sorted(glob.glob(f"{dir_path}/*/*.txt"))
    data_dict = defaultdict(list)

    for data_path in tqdm(data_paths):
        # glob returns OS-native separators, so splitting on "/" picks the
        # wrong component on Windows; os.path handles both separators.
        keyword = os.path.basename(os.path.dirname(data_path))
        with open(data_path, "rb") as fp:
            # NOTE(review): the files are pickles despite the .txt suffix;
            # only load data produced by this project (pickle can run code).
            data_dict[keyword].extend(pickle.load(fp))

    return data_dict

In [7]:
# Root directory whose sub-directories (one per keyword) hold the pickled articles.
dir_path = "../data/origins"

# Load all articles under dir_path, grouped by keyword.
data_dict = data_merge(dir_path)

100%|██████████| 10/10 [00:00<00:00, 36.03it/s]


## 2. Data Preprocessing

- 제목, 기사에 대하여 불필요한 텍스트 정제 진행

In [8]:
def clean_text(text):
    """Strip boilerplate from a news title or article body.

    Removes, in order: line breaks, e-mail addresses, bracketed asides,
    phone numbers, web addresses, reporter bylines, Hanja, decorative
    symbols and quotation marks, then collapses runs of whitespace.

    Args:
        - text: raw text as a str
    Return:
        - text: cleaned text with single spaces and no leading/trailing space
    """
    # Common: flatten line breaks so every later pattern sees a single line.
    text = re.sub("\n", " ", text)
    # E-mail addresses (complete ones first, then dangling "user@" fragments).
    text = re.sub(r"([\w\d.]+@[\w\d.]+)", "", text)
    text = re.sub(r"([\w\d.]+@)", "", text)
    # Bracketed asides whose content is plain words and light punctuation.
    text = re.sub(r"<[\w\s\d‘’=/·~:&,`]+>", "", text)
    text = re.sub(r"\([\w\s\d‘’=/·~:&,`]+\)", "", text)
    text = re.sub(r"\[[\w\s\d‘’=/·~:&,`]+\]", "", text)
    text = re.sub(r"【[\w\s\d‘’=/·~:&,`]+】", "", text)
    # Any remaining (...) / [...] groups. Matching innermost pairs and
    # repeating handles nesting without the greedy ".*" that used to delete
    # everything between the first opening and the last closing bracket.
    for pattern in (r"\([^()]*\)", r"\[[^\[\]]*\]"):
        removed = 1
        while removed:
            text, removed = re.subn(pattern, "", text)

    # Phone numbers, with and without an area code.
    text = re.sub(r"(\d{2,3})-(\d{3,4}-\d{4})", "", text)
    text = re.sub(r"(\d{3,4}-\d{4})", "", text)
    # Web addresses. "[!-~]" is printable ASCII without the space, so a match
    # stops at whitespace or at Korean text glued to the URL instead of
    # swallowing the rest of the article.
    text = re.sub(r"https?://[!-~]+", "", text)
    text = re.sub(r"www\.[!-~]+", "", text)
    # Bare domains ending in .com / .co.kr / .go.kr, plus an optional path.
    # Dots are escaped so ordinary words such as "welcome" are left intact.
    text = re.sub(
        r"[A-Za-z0-9.-]+\.(?:com|co\.kr|go\.kr)(?![A-Za-z0-9])(?:/[!-~]*)?",
        "",
        text,
    )
    # Reporter bylines.
    text = re.sub(r"/\w+[=·\w@]+\w+\s[=·\w@]+", "", text)
    text = re.sub(r"\w{2,4}\s기자", "", text)
    # Hanja (CJK ideographs).
    # NOTE(review): "\uF900" is a single code point; the CJK Compatibility
    # Ideographs block spans U+F900-U+FAFF - confirm whether a range was meant.
    text = re.sub("[\u2E80-\u2EFF\u3400-\u4DBF\u4E00-\u9FBF\uF900]+", "", text)
    # Decorative symbols and bullets (note: "?" is removed as well).
    text = re.sub("[◇#/▶▲◆■●△①②③★○◎▽=▷☞◀ⓒ□?㈜♠☎]", "", text)
    # Quotation marks.
    text = re.sub("[\"'”“‘’]", "", text)

    # Collapse the whitespace left behind by the removals above.
    text = " ".join(text.split())
    return text

In [9]:
# Clean the title and body of every article from the four listed newspapers
# and collect them as (keyword, press, category, title, content) tuples.
clean_data = []
for key, articles in data_dict.items():
    for article in tqdm(articles):
        press, cat, title, content = article
        # Articles from any other outlet are skipped untouched.
        if press not in ["조선일보", "동아일보", "경향신문", "한겨레"]:
            continue

        title = clean_text(title)
        content = clean_text(content)
        clean_data.append((key, press, cat, title, content))

100%|██████████| 8018/8018 [00:03<00:00, 2475.75it/s]
100%|██████████| 3343/3343 [00:01<00:00, 3058.51it/s]
100%|██████████| 1756/1756 [00:00<00:00, 2977.08it/s]
100%|██████████| 9007/9007 [00:04<00:00, 2206.66it/s]
100%|██████████| 1392/1392 [00:00<00:00, 2542.62it/s]


In [10]:
# with open('../data/total_cleaned_data.txt', 'wb') as fp:
#     pickle.dump(clean_data, fp)

In [11]:
def data_clean(data_dict: Dict, press_list: List[str]) -> List[Tuple]:
    """Clean the articles of the selected outlets and flatten them into one list.

    Args:
        - data_dict: mapping of keyword -> iterable of
          (press, category, title, content) records
        - press_list: outlets to keep; records from other outlets are dropped
    Return:
        - list of (keyword, press, category, cleaned title, cleaned content)
    """
    cleaned = []
    for keyword, records in data_dict.items():
        progress = tqdm(records, desc=f"{keyword} data_clean")
        for press, cat, title, content in progress:
            # Guard clause: ignore outlets that were not requested.
            if press not in press_list:
                continue
            cleaned.append(
                (keyword, press, cat, clean_text(title), clean_text(content))
            )

    return cleaned

In [12]:
# Only articles published by these outlets are kept by data_clean.
press_list = ["조선일보", "동아일보", "경향신문", "한겨레"]
clean_data = data_clean(data_dict, press_list)

남북회담 data_clean: 100%|██████████| 8018/8018 [00:03<00:00, 2540.31it/s]
드루킹 data_clean: 100%|██████████| 3343/3343 [00:01<00:00, 3004.17it/s]
조국 data_clean: 100%|██████████| 1756/1756 [00:00<00:00, 2867.22it/s]
탄핵 data_clean: 100%|██████████| 9007/9007 [00:04<00:00, 2201.69it/s]
필리버스터 data_clean: 100%|██████████| 1392/1392 [00:00<00:00, 2552.92it/s]


## 3. Text Tokenization

In [32]:
def get_tokens(
    data: List[Tuple],
    save_dir: str,
    stopwords_path: str,
    tokenizer_name: str = "mecab",
) -> None:
    """Tokenize cleaned articles and pickle three views of the result.

    For every record the title and content are tokenized, stopwords are
    dropped, and three lists are written to ``save_dir``:

    - ``nouns_total_data.txt``: nouns only
    - ``tokenized_total_data.txt``: all morphemes
    - ``tokenized_pos_total_data.txt``: morphemes formatted as ``word_pos``

    Each saved item is ``(keyword, press, category, title_tokens,
    content_tokens)``.

    Args:
        - data: list of (keyword, press, category, title, content) tuples
        - save_dir: output directory (created if it does not exist)
        - stopwords_path: UTF-8 text file with one stopword per line
        - tokenizer_name: name passed to ``get_tokenizer``; the returned
          object must provide ``nouns(text)`` and ``pos(text)``
    Return:
        - None (results are written to disk)
    """
    with open(stopwords_path, "r", encoding="utf-8") as f:
        # A set keeps the per-token membership test O(1); with a list every
        # token of every article scanned the whole stopword file.
        stopwords = set(f.read().split("\n"))

    # Create the output directory up front so a missing folder fails (or is
    # fixed) before the long tokenization pass instead of after it.
    os.makedirs(save_dir, exist_ok=True)

    tokenizer = get_tokenizer(tokenizer_name)
    nouns_data, token_data, token_pos_data = [], [], []
    for keyword, press, category, title, content in tqdm(data):
        # Nouns, with stopwords removed.
        title_nouns = [w for w in tokenizer.nouns(title) if w not in stopwords]
        content_nouns = [w for w in tokenizer.nouns(content) if w not in stopwords]

        # (word, pos) pairs, filtered once and reused for both derived views.
        title_tokens = [
            (w, pos) for w, pos in tokenizer.pos(title) if w not in stopwords
        ]
        content_tokens = [
            (w, pos) for w, pos in tokenizer.pos(content) if w not in stopwords
        ]

        title_morphs = [w for w, _ in title_tokens]
        content_morphs = [w for w, _ in content_tokens]
        title_tags = [f"{w}_{pos}" for w, pos in title_tokens]
        content_tags = [f"{w}_{pos}" for w, pos in content_tokens]

        nouns_data.append((keyword, press, category, title_nouns, content_nouns))
        token_data.append((keyword, press, category, title_morphs, content_morphs))
        token_pos_data.append((keyword, press, category, title_tags, content_tags))

    # Save the three token views (pickle format despite the .txt suffix).
    with open(f"{save_dir}/nouns_total_data.txt", "wb") as fp:
        pickle.dump(nouns_data, fp)

    with open(f"{save_dir}/tokenized_pos_total_data.txt", "wb") as fp:
        pickle.dump(token_pos_data, fp)

    with open(f"{save_dir}/tokenized_total_data.txt", "wb") as fp:
        pickle.dump(token_data, fp)

In [33]:
# Output directory for the pickled token lists and the stopword file to apply.
save_dir = "../data/tokenized"
stopwords_path = "../data/stopwords/stopwords_kr.txt"
# Tokenize the cleaned articles with the default tokenizer and write the results.
get_tokens(clean_data, save_dir, stopwords_path)

100%|██████████| 16804/16804 [03:55<00:00, 71.44it/s] 
