In [1]:
import json

from bs4 import BeautifulSoup
from tqdm import tqdm
from pathlib import Path

In [2]:
def load(file):
    """Return the entire content of `file`, decoded as UTF-8 text."""
    with open(file, mode='r', encoding="utf-8") as handle:
        contents = handle.read()
    return contents
        
def save(file, string):
    """Write `string` to `file` as UTF-8 text, replacing any previous content."""
    with open(file, mode='w', encoding="utf-8") as handle:
        handle.write(string)
        
def load_jsonl(file):
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        file: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        list: The decoded objects, in file order. Blank and whitespace-only
        lines are ignored.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file, 'r', encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        # A blank line (typically a stray trailing newline) carries no record;
        # passing it to json.loads would raise, so it is skipped instead.
        return [json.loads(line) for line in stripped if line]

def save_jsonl(file, obj_list):
    """Write objects to `file` in JSON Lines format, one JSON document per line.

    Args:
        file: Destination path, handed to `save` (which overwrites the file).
        obj_list: Iterable of JSON-serialisable objects.

    Every object is serialised before anything is written. Non-ASCII
    characters are written as-is rather than as ``\\uXXXX`` escapes
    (``ensure_ascii=False``).
    """
    # Each record carries its own newline terminator. Joining with '\n' and
    # appending a final '\n' produced a lone blank line for an empty
    # `obj_list`, and a blank line is not a decodable JSON record.
    jstrings = [json.dumps(obj, ensure_ascii=False) + '\n' for obj in obj_list]
    save(file, ''.join(jstrings))

## Extract Data

In [None]:
def extract_data(html_art):
    """Build a plain-text article record from one crawled page.

    Args:
        html_art: Mapping with the keys ``"id"``, ``"metadata"`` (a JSON
            string; element ``[0]`` of the decoded value is used as the
            metadata object) and ``"article_part"`` (an HTML fragment).

    Returns:
        dict | None: A record with ``RTI_id``, ``text`` (the headline, a blank
        line, then the ``<p>`` texts joined by newlines), ``url``,
        ``headline``, ``articleSection`` and ``datePublished``. ``None`` when
        the fragment has no ``<article>`` element or its paragraphs hold no
        text.

    Raises:
        KeyError: If an expected key is missing from `html_art` or from the
            metadata object.
        json.JSONDecodeError: If ``html_art["metadata"]`` is not valid JSON.
    """
    metadata = json.loads(html_art["metadata"])[0]
    article_part = BeautifulSoup(html_art["article_part"], 'html.parser')

    # find() returns None when the tag is absent. Treat such a page like one
    # without text instead of failing with AttributeError on None.find_all.
    article = article_part.find("article")
    if article is None:
        return None

    paras = article.find_all("p")
    art = '\n'.join(p.text for p in paras)
    # Paragraphs holding only whitespace leave nothing but separators behind,
    # so test the stripped text rather than the raw join.
    if not art.strip():
        return None
    text = f"{metadata['headline']}\n\n{art}"

    return {
        "RTI_id": html_art["id"],
        "text": text,
        "url": metadata["url"],
        "headline": metadata["headline"],
        "articleSection": metadata["articleSection"],
        "datePublished": metadata["datePublished"],
    }

In [27]:
# Read the raw crawl file; load_jsonl returns one decoded object per line.
# NOTE(review): absolute, machine-specific path. The batch-extraction cell
# further down assigns the same value to `input_file` again.
input_file = r"D:\winuser\Downloads\crawler\data\raw\articles_500-1000.jsonl"
html_arts = load_jsonl(input_file)

JSONDecodeError: Expecting value: line 2 column 1 (char 1)

In [21]:
# Spot-check: run the extractor on the second crawled page and print the
# result beneath an empty line.
art = extract_data(html_arts[1])
print("", art, sep="\n")

{'@context': 'http://schema.org', '@type': 'NewsArticle', 'thumbnailUrl': 'https://en-static.rti.org.tw/assets/thumbnails/2017/01/19/20170119000078M.jpg', 'url': 'https://en.rti.org.tw/news/view/id/61581', 'mainEntityOfPage': 'https://en.rti.org.tw/news/view/id/61581', 'headline': 'Former premier meets US congressmen at Capitol', 'articleSection': 'Taiwan News', 'datePublished': '2017-01-19T00:00:00+08:00', 'dateModified': '2017-01-19T00:00:00+08:00', 'keywords': '', 'image': {'@type': 'ImageObject', 'contentUrl': 'https://en-static.rti.org.tw/assets/thumbnails/2017/01/19/20170119000078M.jpg', 'url': 'https://en-static.rti.org.tw/assets/thumbnails/2017/01/19/20170119000078M.jpg', 'name': 'Former premier Yu Shyi-kun', 'height': 451, 'width': 800}, 'author': {'@type': 'Person', 'name': 'Editor'}, 'publisher': {'@type': 'Organization', 'name': '中央廣播電臺 RTI Radio Taiwan International', 'email': 'rtiwebt@rti.org.tw', 'url': 'https://www.rti.org.tw', 'sameAs': '', 'logo': {'@type': 'ImageObje

In [26]:
# Batch step: extract every crawled page of one raw file and store the usable
# records under the same file name inside the output directory.
input_file = r"D:\winuser\Downloads\crawler\data\raw\articles_500-1000.jsonl"
out_path = r"D:\winuser\Downloads\crawler\data\extracted"

html_arts = load_jsonl(input_file)
arts = []
for html_art in tqdm(html_arts):
    art = extract_data(html_art)
    if art is None:
        # extract_data returns None for pages without article text.
        continue
    arts.append(art)

# Create the output directory (and any missing parents) before writing.
out_path = Path(out_path)
out_path.mkdir(exist_ok=True, parents=True)
file_name = Path(input_file).name

save_jsonl(out_path / file_name, arts)

100%|██████████| 5460/5460 [00:22<00:00, 247.34it/s]


## Merge data files

In [49]:
# Merge every extracted file into one JSON Lines file, keeping a single record
# per RTI_id.
datapath = r"D:\winuser\Downloads\crawler\data\extracted"
# NOTE(review): "RTL" may be a typo for "RTI" (the record key is "RTI_id").
# Left unchanged because renaming would change where the output is written.
out_file = r"D:\winuser\Downloads\crawler\data\RTL_Taiwan_EN_news_(partial).jsonl"

# Only *.jsonl files are merged: a bare "*" pattern also yields sub-directories
# and unrelated files, which load_jsonl cannot read. Sorting fixes the
# processing order, which glob() does not guarantee, so both the record that
# wins on a duplicate RTI_id and the output order are reproducible.
files = sorted(Path(datapath).glob("*.jsonl"))
data = {}
for data_file in files:
    # A record from a later file replaces an earlier one with the same RTI_id.
    data.update((art["RTI_id"], art) for art in load_jsonl(data_file))

save_jsonl(out_file, data.values())