In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import logging
from tqdm import tqdm 
from tqdm import trange
import json
import os
import time
import random

In [2]:
from urllib3.exceptions import TimeoutError, NewConnectionError, MaxRetryError, ConnectionError

In [3]:
# Load the table of links to scrape; the first CSV column is used as the index.
links_df = pd.read_csv("links_all.csv", index_col=0)

In [4]:
# Extract the LINK column as an array; the scraping loops iterate over it and
# treat each entry as a URL string.
links = links_df.LINK.values

In [9]:
# Directory that receives the scraped JSON documents (one file per link).
scrap_path = "./scrap/"

In [22]:
# Scrape the <p> elements of every link into <scrap_path>/<name>.json.
# Links whose JSON file already exists are skipped, so this cell can be
# re-run to resume an interrupted crawl.
REQUEST_TIMEOUT = 30  # seconds; requests.get() has no timeout by default
FAILED_LINKS_FILE = "connection_error_links.txt"

# open() below fails if the output directory is missing.
os.makedirs(scrap_path, exist_ok=True)

for url in links:
    # File name without its extension, e.g. ".../A107-21.htm" -> "A107-21".
    name = url.split("/")[-1].split(".")[0]
    out_path = os.path.join(scrap_path, name + ".json")
    if os.path.isfile(out_path):
        print("File {0} exists. Continue.".format(out_path))
        continue

    print("Processing link (" + str(url) + ")...")
    try:
        url_request = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException:
        # A single unreachable link must not abort the whole crawl: record it
        # and move on. The link is NOT retried in this pass; re-running the
        # cell picks it up again because no JSON file was written for it.
        with open(FAILED_LINKS_FILE, "a") as f:
            f.write(url + "\n")
        print("Connection Error, link: {0}".format(url))
        print("Waiting 60 seconds for retry.")
        time.sleep(60)
        continue

    if not url_request.ok:
        # Do not store an HTTP error page as if it were the document.
        with open(FAILED_LINKS_FILE, "a") as f:
            f.write(url + "\n")
        print("HTTP {0} for link: {1}. Skipping.".format(url_request.status_code, url))
        time.sleep(random.uniform(0.8, 2.5))
        continue

    url_souped = BeautifulSoup(url_request.content, 'lxml')
    p_list = []
    for p in url_souped.find_all('p'):
        if p.text.strip() == "":
            continue  # ignore paragraphs that contain only whitespace
        # NOTE(review): BeautifulSoup returns "class" as a list of strings,
        # while the fallback is the plain string "no_class"; consumers of the
        # JSON must handle both shapes.
        p_list.append({
            "text": p.text,
            "class": p.attrs["class"] if "class" in p.attrs else "no_class",
        })

    # Dictionary holding the document's paragraphs plus its name and source link.
    doc = {"name": name, "paragraphs": p_list, "link": url}

    print("Saving as:", name + ".json")
    # Write to a temporary file and rename it into place, so an interrupted
    # run never leaves a truncated .json that later runs would skip.
    tmp_path = out_path + ".tmp"
    with open(tmp_path, "w") as f:
        json.dump(doc, f)
    os.replace(tmp_path, out_path)

    # Random pause between requests to avoid hammering the server.
    rand_sleep = random.uniform(0.8, 2.5)
    print("Safety sleep [{0}]...".format(rand_sleep))
    time.sleep(rand_sleep)


File ./scrap/A107-21.json exists. Continue.
File ./scrap/A106-21.json exists. Continue.
File ./scrap/A092-21.json exists. Continue.
File ./scrap/A091-21.json exists. Continue.
File ./scrap/A090-21.json exists. Continue.
File ./scrap/A065-21.json exists. Continue.
File ./scrap/A054-21.json exists. Continue.
File ./scrap/A053-21.json exists. Continue.
File ./scrap/A052-21.json exists. Continue.
File ./scrap/A051-21.json exists. Continue.
File ./scrap/A046-21.json exists. Continue.
File ./scrap/A041-21.json exists. Continue.
File ./scrap/A040-21.json exists. Continue.
File ./scrap/A038-21.json exists. Continue.
File ./scrap/A037-21.json exists. Continue.
File ./scrap/A035-21.json exists. Continue.
File ./scrap/A033-21.json exists. Continue.
File ./scrap/A032-21.json exists. Continue.
File ./scrap/A030-21.json exists. Continue.
File ./scrap/A029-21.json exists. Continue.
File ./scrap/A028-21.json exists. Continue.
File ./scrap/A027-21.json exists. Continue.
File ./scrap/A026-21.json exists

In [5]:
# Output directory used by the scraping cell below; this assignment replaces
# any value scrap_path held before.
scrap_path = "./scrappy/"

In [6]:
# Scrape the paragraphs and headings (h1-h5) of every link into
# <scrap_path>/<name>.json. Links whose JSON file already exists are skipped,
# so this cell can be re-run to resume an interrupted crawl.
REQUEST_TIMEOUT = 30  # seconds; requests.get() has no timeout by default
FAILED_LINKS_FILE = "connection_error_links.txt"

# open() below fails if the output directory is missing.
os.makedirs(scrap_path, exist_ok=True)

counter = 0  # number of links skipped so far because their file already exists
for url in links:
    try:
        # File name without its extension, e.g. ".../A423-20.htm" -> "A423-20".
        name = url.split("/")[-1].split(".")[0]
        out_path = os.path.join(scrap_path, name + ".json")
        if os.path.isfile(out_path):
            counter += 1
            continue

        print("{0} files detected. Skipping scrap.".format(counter))

        print("Processing link (" + str(url) + ")...")
        url_request = requests.get(url, timeout=REQUEST_TIMEOUT)
        if not url_request.ok:
            # Do not store an HTTP error page as if it were the document.
            with open(FAILED_LINKS_FILE, 'a') as f:
                f.write(url + "\n")
            print("HTTP {0} for link: {1}. Skipping.".format(url_request.status_code, url))
            time.sleep(random.uniform(0.8, 1.6))
            continue

        url_souped = BeautifulSoup(url_request.content, 'lxml')
        paragraphs = url_souped.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5'])
        p_list = []
        for i, p in enumerate(paragraphs):
            if p.text.strip() == "":
                continue  # ignore elements that contain only whitespace
            # p_index is the position among ALL matched elements, so skipped
            # empty elements leave gaps in the numbering.
            # NOTE(review): BeautifulSoup returns "class" as a list of strings,
            # while the fallback is the plain string "no_class"; consumers of
            # the JSON must handle both shapes.
            p_list.append({
                "p_index": i,
                "tag": p.name,
                "text": p.text,
                "class": p.attrs["class"] if "class" in p.attrs else "no_class",
            })

        # Dictionary holding the document's paragraphs plus its name and source link.
        doc = {"name": name, "paragraphs": p_list, "link": url}

        print("Saving as:", name + ".json")
        # Write to a temporary file and rename it into place, so an interrupted
        # run never leaves a truncated .json that later runs would skip.
        tmp_path = out_path + ".tmp"
        with open(tmp_path, "w") as f:
            json.dump(doc, f)
        os.replace(tmp_path, out_path)

        # Random pause between requests to avoid hammering the server.
        rand_sleep = random.uniform(0.8, 1.6)
        print("Safety sleep [{0}]...".format(rand_sleep))
        time.sleep(rand_sleep)
    except (requests.exceptions.RequestException,
            TimeoutError, NewConnectionError, MaxRetryError, ConnectionError):
        # requests wraps urllib3 failures in its own exception hierarchy, so
        # RequestException is what requests.get() actually raises; the urllib3
        # classes are kept for backward compatibility.
        # The failed link is recorded but NOT retried in this pass; re-running
        # the cell picks it up again because no JSON file was written for it.
        with open(FAILED_LINKS_FILE, 'a') as f:
            f.write(url + "\n")
        print("Connection Error, link: {0}".format(url))
        print("Waiting 60 seconds for retry.")
        time.sleep(60)


99 files detected. Skipping scrap.
Processing link (https://www.corteconstitucional.gov.co/Relatoria/autos/2020/A423-20.htm)...
Saving as: A423-20.json
Safety sleep [1.0162394532795949]...
199 files detected. Skipping scrap.
Processing link (https://www.corteconstitucional.gov.co/Relatoria/autos/2020/A311A-20.htm)...
Saving as: A311A-20.json
Safety sleep [1.2511692532580985]...
299 files detected. Skipping scrap.
Processing link (https://www.corteconstitucional.gov.co/Relatoria/autos/2020/A207-20.htm)...
Saving as: A207-20.json
Safety sleep [1.1740301569782692]...
399 files detected. Skipping scrap.
Processing link (https://www.corteconstitucional.gov.co/Relatoria/autos/2020/A098-20.htm)...
Saving as: A098-20.json
Safety sleep [1.2719692835125946]...
479 files detected. Skipping scrap.
Processing link (https://www.corteconstitucional.gov.co/Relatoria/autos/2019/A795-18.htm)...
Saving as: A795-18.json
Safety sleep [0.8362419289245584]...
479 files detected. Skipping scrap.
Processing li

In [7]:
# Display the output directory currently in effect.
scrap_path

'./scrappy/'