In [1]:
import json
import pycld2 as cld2

from bs4 import BeautifulSoup
from tqdm import tqdm
from pathlib import Path

In [2]:
def load(file):
    """Return the entire content of `file`, read as UTF-8 text."""
    with open(file, mode='r', encoding="utf-8") as handle:
        content = handle.read()
    return content
        
def save(file, string):
    """Write `string` to `file` as UTF-8 text, replacing any existing content."""
    with open(file, mode='w', encoding="utf-8") as handle:
        handle.write(string)
        
def load_jsonl(file):
    """Read a JSON Lines file and return its records as a list.

    `file` is opened as UTF-8 text and every non-blank line is parsed with
    ``json.loads``. Whitespace-only lines (for example a trailing empty line)
    are skipped: handing an empty string to ``json.loads`` raises
    ``json.JSONDecodeError`` and would abort the whole load.

    Returns:
        list: one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(file, 'r', encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return [json.loads(line) for line in stripped_lines if line]

def save_jsonl(file, obj_list):
    """Write `obj_list` to `file` in JSON Lines format (UTF-8).

    Each object is serialized with ``json.dumps(..., ensure_ascii=False)`` and
    written on its own line, terminated by a newline. An empty `obj_list`
    yields an empty file rather than a file holding one blank line, so the
    output never contains a line that is not a JSON document.

    Args:
        file: path of the file to (over)write.
        obj_list: iterable of JSON-serializable objects.
    """
    # Serialize everything before opening the file, so a serialization error
    # does not leave a truncated file behind.
    jstrings = [json.dumps(obj, ensure_ascii=False) for obj in obj_list]
    with open(file, 'w', encoding="utf-8") as f:
        f.writelines(f"{jstring}\n" for jstring in jstrings)

## Extract Data

In [76]:
def _split_headline(headline):
    """Split a bilingual headline into ``(en_headline, zh_headline)``.

    The text covered by the first language chunk reported by cld2 is taken as
    the English headline; everything after that chunk is the Chinese headline.

    cld2 reports each chunk as ``(byte offset, byte length, name, code)``
    measured on the UTF-8 encoding of the text, so the slicing is done on the
    encoded bytes. Using those numbers as ``str`` indices only gives the right
    answer when the leading chunk starts at 0 and is pure ASCII.
    """
    _, _, _, vectors = cld2.detect(headline, returnVectors=True)
    if not vectors:
        # No chunk information to split on: keep the headline whole instead
        # of failing on vectors[0].
        return headline.strip(), ""

    raw = headline.encode("utf-8")
    offset, length = vectors[0][0], vectors[0][1]
    split_at = offset + length
    # errors="ignore" drops a partial character instead of raising, should a
    # chunk boundary ever fall inside a multi-byte sequence.
    en_headline = raw[offset:split_at].decode("utf-8", errors="ignore").strip()
    zh_headline = raw[split_at:].decode("utf-8", errors="ignore").strip()
    return en_headline, zh_headline


def extract_data(html_art):
    """Turn one raw crawled article into an aligned English/Chinese record.

    Args:
        html_art: mapping with the keys ``"id"``, ``"headline"``,
            ``"metadata"`` (a JSON string providing ``"url"``,
            ``"datePublished"`` and optionally ``"keywords"``) and
            ``"article_part"`` (the article HTML).

    Returns:
        dict | None: a record with the keys ``PTS_id``, ``en_text``,
        ``zh_text``, ``description``, ``url``, ``headline``, ``keywords`` and
        ``datePublished``; ``None`` when the article has no ``<p>`` elements
        or when either the English or the Chinese body comes out empty.
    """
    metadata = json.loads(html_art["metadata"])
    article_part = BeautifulSoup(html_art["article_part"], 'html.parser')

    # extract headlines
    headline = html_art["headline"]
    en_headline, zh_headline = _split_headline(headline)

    # extract articles
    paras = article_part.find_all("p")
    if not paras:
        # Nothing to extract; treated like any other unusable article.
        return None
    description = paras[0].get_text(separator='\n')

    en_text = []
    zh_text = []
    collecting_en = True
    for p in paras[1:]:
        # A paragraph holding only a non-breaking space marks the switch from
        # the English body to the Chinese body; it is not part of either.
        if p.text == '\xa0':
            collecting_en = False
            continue

        text = p.get_text(separator='\n')
        if collecting_en:
            en_text.append(text)
        else:
            zh_text.append(text)

    en_text = '\n'.join(en_text)
    zh_text = '\n'.join(zh_text)

    # Articles without a separate body in both languages are discarded.
    if not en_text or not zh_text:
        return None

    en_text = f"{en_headline}\n\n{en_text}"
    zh_text = f"{zh_headline}\n\n{zh_text}"

    return {
        "PTS_id": html_art["id"],
        "en_text": en_text,
        "zh_text": zh_text,
        "description": description,
        "url": metadata["url"],
        "headline": html_art["headline"],
        "keywords": metadata.get("keywords") or [],
        "datePublished": metadata["datePublished"],
    }

In [7]:
# Read the raw crawled articles (one JSON object per line) into `html_arts`.
input_file = "/home/zchen/crawler/data/PTS/raw/articles_0-100.jsonl"
html_arts = load_jsonl(input_file)

In [69]:
# Spot-check the extractor on a single raw record; a blank line is printed
# before the resulting record.
art = extract_data(html_arts[4])
print(f"\n{art}")

President Departs for Central America 小英訪瓜地馬拉.貝里斯  行前發表談話
((0, 38, 'Unknown', 'un'), (38, 51, 'ChineseT', 'zh-Hant'))
President Departs for Central America
小英訪瓜地馬拉.貝里斯  行前發表談話
<p>Tsai Ing-wen, President: “Freedom, democracy, and sustainability are the values of Taiwan that we want to share with our good friends from all over the world. Leading a country out is a president's most important responsibility. During this visit, we will convey Taiwan's three viewpoints to the world. The first is that Taiwan will share the value of freedom and openness with the world. We are a maritime nation and the world's Taiwan so we will absolutely not isolate ourselves in a corner of the Taiwan Strait. We also hope to find a bigger space for Taiwan as well as business opportunities during this visit.”<br/><br/><br/><br/><br/>總統蔡英文表示：「自由、民主、永續，是我們想和全世界的好朋友分享的台灣價值，帶領國家走出去就是總統最重要的任務，那這次出訪我們將會向世界傳達台灣的三個主張，第一，台灣會和世界共享自由開放的價值，我們是海洋國家，是世界的台灣，所以我們絕對不會把自己封閉在台灣海峽的一角，我們也期待在出訪的時候，為台灣尋找更大的空間，以及商機。」<br/><br/> </p>
Ts

In [77]:
# Run the extractor over one raw crawl file and store the usable records
# under the same file name inside the "extracted" directory.
input_file = "/home/zchen/crawler/data/PTS/raw/articles_0-100.jsonl"
out_path = Path("/home/zchen/crawler/data/PTS/extracted")

html_arts = load_jsonl(input_file)

# extract_data returns None for articles it cannot use; those are dropped.
arts = []
for html_art in tqdm(html_arts):
    art = extract_data(html_art)
    if art is None:
        continue
    arts.append(art)

out_path.mkdir(parents=True, exist_ok=True)
file_name = Path(input_file).name
save_jsonl(out_path / file_name, arts)

100%|██████████| 900/900 [00:00<00:00, 1524.15it/s]


## Merge data files

In [49]:
# Merge every extracted file into one dataset, keeping one record per PTS_id.
datapath = r"D:\winuser\Downloads\crawler\data\extracted"
out_file = r"D:\winuser\Downloads\crawler\data\RTL_Taiwan_EN_news_(partial).jsonl"

# Sort the inputs so the merge is reproducible: the order in which glob()
# yields entries is not guaranteed, and both the output order and which
# duplicate wins depend on the order the files are read in. Directories are
# skipped because they cannot be opened as JSONL files.
files = sorted(path for path in Path(datapath).glob("*") if path.is_file())

data = {}
for f in files:
    # Records from later files replace earlier ones that share a PTS_id.
    data.update((art["PTS_id"], art) for art in load_jsonl(f))

save_jsonl(out_file, data.values())