In [5]:
import bs4
import os
import requests

In [7]:
import hashlib
import json

def calculate_checksum(data):
    """Return the SHA-256 hex digest of a text string.

    Args:
        data: Text to hash; it is encoded as UTF-8 before hashing.

    Returns:
        The digest as a lowercase hexadecimal string.
    """
    encoded = data.encode('utf-8')
    return hashlib.sha256(encoded).hexdigest()

def load_and_check_data(file_path, new_data, encoding = 'utf-8'):
    """Store ``new_data`` in ``file_path`` unless the file already holds it.

    The content kind is chosen from the file extension: paths ending in
    ``.json`` (case-insensitive) are treated as JSON, everything else as
    HTML/plain text. JSON is compared in a canonical form (keys sorted), so
    key order and whitespace differences do not count as a change.

    Args:
        file_path: Path of the cache file to read and, if needed, (re)write.
        new_data: Freshly obtained content as a string (JSON text for
            ``.json`` paths).
        encoding: Text encoding used for both reading and writing the file.

    Returns:
        For JSON paths, the parsed JSON value (the existing one when nothing
        changed, otherwise the new one). For other paths, the text string
        (existing or new, likewise).

    Raises:
        json.JSONDecodeError: If ``new_data`` or the existing file content is
            not valid JSON for a ``.json`` path.
    """
    is_json = file_path.lower().endswith('.json')
    label = 'JSON' if is_json else 'HTML'

    # Parse once, and before any file is opened for writing: malformed input
    # must not leave a truncated/empty file behind.
    if is_json:
        new_data = json.loads(new_data)

    # newline='' disables newline translation so the text round-trips
    # byte-for-byte. Without it, content containing '\r\n' is read back as
    # '\n', the checksums never match and the file is rewritten on every run.
    try:
        with open(file_path, 'r', encoding=encoding, newline='') as file:
            existing_data = file.read()
    except FileNotFoundError:
        file_exists = False
    else:
        file_exists = True

    if file_exists:
        if is_json:
            existing_data = json.loads(existing_data)
            # Canonical serialisation makes the comparison order-insensitive.
            existing_checksum = calculate_checksum(json.dumps(existing_data, sort_keys=True))
            new_checksum = calculate_checksum(json.dumps(new_data, sort_keys=True))
        else:
            existing_checksum = calculate_checksum(existing_data)
            new_checksum = calculate_checksum(new_data)

        if existing_checksum == new_checksum:
            print(f"{label} content remains unchanged.")
            return existing_data

        print(f"{label} content has changed.")

    # Reached when the file is missing or its content differs.
    with open(file_path, 'w', encoding=encoding, newline='') as file:
        if is_json:
            json.dump(new_data, file, indent=4, sort_keys=True)
        else:
            file.write(new_data)

    if file_exists:
        print(f"{label} updated successfully.")
    else:
        print(f"File created with {label} content.")

    return new_data



In [8]:
ruler_url= "https://www.rulers.org/"
# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() prevents an HTTP error page from being parsed as the index.
rulerorg_html = requests.get(ruler_url, timeout=30)
rulerorg_html.raise_for_status()
# NOTE(review): no parser is named here, so BeautifulSoup uses whichever one is
# installed; pin one (e.g. "html.parser") if results must match across machines.
rulerorg_bs = bs4.BeautifulSoup(rulerorg_html.text)
# Walk the <center> anchors once; the absolute URLs are derived from the same
# relative hrefs so both lists stay aligned index-for-index.
rulerorg_html_name_list = [i.get('href') for i in rulerorg_bs.center.find_all('a', href=True)]
rulerorg_href_list = [ruler_url + name for name in rulerorg_html_name_list]
rulerorg_href_list

['https://www.rulers.org/rula1.html',
 'https://www.rulers.org/rula2.html',
 'https://www.rulers.org/rulb1.html',
 'https://www.rulers.org/rulb2.html',
 'https://www.rulers.org/rulc1.html',
 'https://www.rulers.org/rulc2.html',
 'https://www.rulers.org/rulc3.html',
 'https://www.rulers.org/rulc4.html',
 'https://www.rulers.org/ruld.html',
 'https://www.rulers.org/rule.html',
 'https://www.rulers.org/rulf.html',
 'https://www.rulers.org/rulg1.html',
 'https://www.rulers.org/rulg2.html',
 'https://www.rulers.org/rulh.html',
 'https://www.rulers.org/ruli.html',
 'https://www.rulers.org/ruljk.html',
 'https://www.rulers.org/rull.html',
 'https://www.rulers.org/rulm1.html',
 'https://www.rulers.org/rulm2.html',
 'https://www.rulers.org/ruln1.html',
 'https://www.rulers.org/ruln2.html',
 'https://www.rulers.org/rulp1.html',
 'https://www.rulers.org/rulp2.html',
 'https://www.rulers.org/rulqr.html',
 'https://www.rulers.org/ruls1.html',
 'https://www.rulers.org/ruls2.html',
 'https://www.rule

In [8]:
# Number of page URLs collected from the index page's <center> links.
len(rulerorg_href_list)

445

In [2]:
import re
def clean_html_string(input_text):
    """Strip HTML tags, tabs and line breaks from a string.

    Args:
        input_text: Raw HTML/text to clean.

    Returns:
        The text with every ``<...>`` span removed and all tab, newline and
        carriage-return characters deleted (nothing is inserted in their
        place, so adjacent words are joined).
    """
    # DOTALL lets '.' cross line breaks so tags that wrap over several lines
    # are removed whole instead of leaving fragments like '<AHREF="x">'.
    clean_html = re.sub(r'<.*?>', '', input_text, flags=re.DOTALL)
    # Remove tabs and newlines; '\r' is included so CRLF input does not leave
    # stray carriage returns behind.
    clean_text = clean_html.replace('\t', '').replace('\r', '').replace('\n', '')
    return clean_text

In [10]:
RULER_DOCUMENT_PATH = "./"
# Template for the per-document metadata; copied and filled in for each page.
ruler_metadata = {
    "type": None,
    "source": None,
    "watchlist": None,
    "doc_local_path": None,
}
ruler_html_document_data = []
# Defined up front so the final display cannot hit a NameError when the very
# first page fails.
clean_html = None
for url, name in zip(rulerorg_href_list, rulerorg_html_name_list):
    try:
        ruler_html_filename = name
        # NOTE(review): `name` is an href taken from the remote page and is used
        # as a local file name unchanged — confirm it can never contain path
        # separators or '..' before trusting it.
        ruler_html_filepath = os.path.join(RULER_DOCUMENT_PATH, ruler_html_filename)
        # Time out rather than hang, and refuse HTTP error pages so they never
        # overwrite a good local copy.
        content_html = requests.get(url, timeout=30)
        content_html.raise_for_status()
        new_html_content = load_and_check_data(ruler_html_filepath, content_html.text, encoding="ISO-8859-1")
        #TODO clean html
        clean_html = clean_html_string(new_html_content)
        ruler_html_content = clean_html
        _ruler_metadata = ruler_metadata.copy()
        _ruler_metadata['type'] = "Individual"
        _ruler_metadata['source'] = str(url)
        _ruler_metadata['watchlist'] = "PEP"
        _ruler_metadata['doc_local_path'] = str(ruler_html_filepath)
        ruler_html_document_data.append((ruler_html_content, _ruler_metadata))
    except Exception as exc:
        # Report which page failed and why, then stop. `Exception` (not a bare
        # except) leaves KeyboardInterrupt free to abort the long download loop,
        # and nothing here depends on variables that may not exist yet.
        print(f"Stopped at {url}: {exc!r}")
        break

clean_html

HTML content remains unchanged.


'Countries Ab-AmRulersCountries Ab-AmContents of this page:Afghanistan - Albania - Algeria - American SamoaAbkhazia: see under Georgia.Abu Dhabi: see under United Arab Emirates.Acre: see under Brazil.Aden: see under Yemen.AfghanistanNote: Before 1881 there were essentially four rulers\' capitals: Kabul, Herat, Kandahar, and Peshawar (the last now in Pakistan). All the rulers belong to the Abdali tribal group, whose name was changed to Dorrani on the accession of Ahmad Shah. They belong either to the Saddozay segment of the Popalzay clan (typically with the style padshah [king]) or to the Mohammadzay segment of the Barakzay clan (typically with the style amir, in full Amir al-Mo´menin [Leader of the Faithful]). The Mohammadzay also furnished the Saddozay kings frequently with top counselors, who served occasionally as regents, identified with the epithet Mohammadzay.KabulKings- Saddozay segment -   Jul 1747 - 16 Oct 1772  Ahmad Shah "Dorr-e Dorran"         (b. 1724 - d. 1772)16 Oct 1772

In [18]:
new_html_content

'<TITLE>Index E</TITLE>\n<META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">\n<A HREF="index.html">Rulers</A>\n<H1>Index E</H1>\n<B>Eagle, James P(hilip)</B> (b. Aug. 10, 1837, Maury county, Tenn. - d. Dec. 20, 1904, Little Rock, Ark.), governor of Arkansas (1889-93).<P>\n\n<B>Eagleburger, Lawrence (Sidney)</B> (b. Aug. 1, 1930, Milwaukee, Wis. - d. June 4, 2011, Charlottesville, Va.), U.S. secretary of state (1992-93). He was also ambassador to Yugoslavia (1977-81). He was the first career Foreign Service Officer to serve as secretary of state.<P>\n\n<B>Eagleton, Thomas (Francis)</B> (b. Sept. 4, 1929, St. Louis, Mo. - d. March 4, 2007, St. Louis), U.S. politician. He was a senator from Missouri (1968-87). The initial Democratic vice presidential nominee in 1972, he withdrew after reports of receiving electroshock therapy for clinical depression.<P>\n\n<A NAME="eanes"><TABLE ALIGN="Left"><TR><TD><IMG SRC="eanes.jpg" HEIGHT="100"><BR>Eanes</TR></TABLE>\n<B>Eanes, A