In [1]:
import os
import re
import json

from zipfile import ZipFile
from tqdm import tqdm
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
class Parse:
    """Static helpers that extract fields from hrefs, filenames and author strings."""

    # Captures the digits that follow "/id/" in an href.
    id_pattern = re.compile(r'\/id\/([0-9]+)')
    # Captures the digits that follow "sequence=" in an href.
    sequence_pattern = re.compile(r'sequence=([0-9]+)')
    # A lowercase ASCII letter directly followed by an uppercase one, i.e. the
    # seam where two names were glued together ("...SmithJane..."). Each side is
    # its own group so both letters can be kept when the separator is inserted.
    author_pattern = re.compile(r'([a-z])([A-Z])')

    @staticmethod
    def ID(href):
        """Return the digits after "/id/" in ``href`` as a string.

        Raises:
            AttributeError: if ``href`` contains no "/id/<digits>" part
                (``search`` returns None).
        """
        return Parse.id_pattern.search(href).group(1)

    @staticmethod
    def authors(author):
        """Insert "; " at every lowercase->uppercase seam in ``author``.

        Example: "John SmithJane Doe" -> "John Smith; Jane Doe".

        The separator is inserted with ``sub`` rather than splitting on the
        pattern: ``split`` with a capturing group drops the two seam letters
        from the names and returns them as an extra list item.
        """
        return Parse.author_pattern.sub(r'\1; \2', author)

    @staticmethod
    def html(html):
        """Return ``html`` unchanged (pass-through; no parsing or cleaning is applied)."""
        return html

    @staticmethod
    def sequence(href):
        """Return the number after "sequence=" in ``href`` as an int.

        Raises:
            AttributeError: if ``href`` contains no "sequence=<digits>" part.
        """
        return int(Parse.sequence_pattern.search(href).group(1))

    @staticmethod
    def split_filename(filename):
        """Split "<ID>-<sequence>" into ``(ID, int(sequence))``.

        Raises:
            ValueError: if ``filename`` does not contain exactly one "-" or the
                part after it is not an integer.
        """
        ID, sequence = filename.split('-')
        return ID, int(sequence)

    @staticmethod
    def find_sequence(sequence, hrefs):
        """Return the 1-based position of the first href whose "sequence=" value equals ``sequence``.

        If no href matches, ``sequence`` is returned unchanged.
        """
        for position, href in enumerate(hrefs, start=1):
            if sequence == Parse.sequence(href):
                return position
        return sequence
        

def load_metadata(metadata_path):
    """Load a JSON metadata file and index its entries by ID.

    Args:
        metadata_path: Path to a JSON file whose top level is iterable and
            yields entries that each have an "href" key containing
            "/id/<digits>".

    Returns:
        dict mapping the ID string extracted by ``Parse.ID`` to the entry.
        If several entries share an ID, the last one wins.

    Raises:
        AttributeError: if an entry's href has no "/id/<digits>" part
            (propagated from ``Parse.ID``).
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, so the same file loads identically on every OS.
    with open(metadata_path, "r", encoding="utf-8") as file:
        database = json.load(file)
    # index entries with id
    return {Parse.ID(entry["href"]): entry for entry in database}

def dataset(zipfile, metadata):
    """Build one row per HTML member of an archive, joined with its metadata.

    Args:
        zipfile: An open archive exposing ``namelist()`` and ``read(name)``.
        metadata: dict keyed by ID string (as produced by ``load_metadata``);
            each value must provide "title", "author", "href", "citation",
            "date" and a "files" list whose items have "name", "size",
            "filetype" and "href".

    Returns:
        list of 11-tuples:
        (member name without extension, decoded HTML text, title, authors,
        href, citation, date, file name, file size, file type, file href).

    Raises:
        KeyError: if a member's ID is not present in ``metadata``.
        ValueError: if a member name is not of the form "<ID>-<sequence>".
        IndexError: if no entry in "files" can be associated with the member
            and its sequence number is past the end of the list.
    """
    rows = []
    html_extension = re.compile(r'\.html?')
    for filename in tqdm(zipfile.namelist()):
        name, ext = os.path.splitext(filename)

        # Keep only ".htm" / ".html". fullmatch anchors the end as well, so
        # look-alike extensions such as ".htmlx" are rejected.
        if html_extension.fullmatch(ext) is None:
            continue

        # recover news id and file index
        ID, sequence = Parse.split_filename(name)
        # properties with news id
        properties = metadata[ID]
        attachments = properties["files"]

        # First try `sequence` as a 1-based index into the files list. If it is
        # out of range, or the entry there carries a different "sequence="
        # value in its href, look the entry up by that value instead.
        if (sequence > len(attachments)
                or sequence != Parse.sequence(attachments[sequence - 1]["href"])):
            sequence = Parse.find_sequence(sequence, [entry["href"] for entry in attachments])
        # NOTE(review): when the lookup finds nothing, `sequence` is left as it
        # was, so the index below either raises IndexError or picks an entry
        # whose href does not match - confirm this is acceptable.
        file_properties = attachments[sequence - 1]

        # append tuple with the necessary information
        rows.append((
            name,
            zipfile.read(filename).decode('utf-8', 'replace'),
            properties["title"],
            Parse.authors(properties["author"]),
            properties["href"],
            properties["citation"],
            properties["date"],
            file_properties["name"],
            file_properties["size"],
            file_properties["filetype"],
            file_properties["href"],
        ))

    return rows



In [3]:
# (zip archive, JSON metadata) filename pairs, one per source, looked up
# under the "data" directory by the loading loop.
archieves = [
    ('globo-2020-7.zip', 'globo-2020-7.json'),
    ('estado-de-sao-paulo-2020-7.zip', 'estado-de-sao-paulo-2020-7.json'),
    ('valor-economico-2020-7.zip', 'valor-economico-2020-7.json'),
]

In [4]:
# Build one DataFrame per (archive, metadata) pair and collect them in `dfs`.
cwd = os.getcwd()
dfs = []
for archive in archieves:
    zip_name, json_name = archive
    archive_path = os.path.join(cwd, 'data', zip_name)
    metadata_path = os.path.join(cwd, 'data', json_name)
    metadata = load_metadata(metadata_path)
    # Open the archive in a context manager so its file handle is closed as
    # soon as the members have been read, even if dataset() raises.
    with ZipFile(archive_path) as zipfile:
        # create list with files and information
        files = dataset(zipfile, metadata)

    df = pd.DataFrame(
        files,
        columns=[
            "id", "html", "title", "author", "href", "citation", "date",
            "filename", "filesize", "filetype", "filehref",
        ],
    )
    dfs.append(df)

100%|██████████| 89601/89601 [00:04<00:00, 18477.81it/s]
100%|██████████| 111415/111415 [00:06<00:00, 18047.70it/s]
100%|██████████| 59095/59095 [00:03<00:00, 16079.33it/s]


In [5]:
# Stack the per-archive frames into a single table with a fresh 0..n-1 index.
news = pd.concat(dfs, ignore_index=True)

In [6]:
# Write the combined table to "news.csv" in the working directory, without the index column.
news.to_csv(path_or_buf="news.csv", index=False)