# Bookmarks Parser

This notebook parses an HTML bookmarks file exported from Google Chrome (or another Chromium-based browser such as Edge) and extracts the bookmarks from it. The code is written in Python and uses the BeautifulSoup library to parse the HTML file. It extracts the title and URL of each bookmark and prints them to the console.

## How to export bookmarks from Google Chrome or Edge

1. Open Google Chrome or Edge.
2. Click on the three dots in the top right corner of the browser window.
3. Click on the "Bookmarks" or "Favorites" option.
4. Depending on your browser, you should find an option to export bookmarks or favorites. You may first need to open the bookmark manager (or favorites manager); from there, the export option should be straightforward to find.
5. Choose the location where you want to save the exported bookmarks file and click "Save".

To be used in this example, the exported file should be in the same directory as the notebook and named `bookmarks.html`.


In [1]:
from bs4 import BeautifulSoup, Comment

def clean_html(html_file_path, output_file_path=None):
    """Extract the visible text of an HTML file.

    The file is parsed with BeautifulSoup's ``lxml`` parser, ``<script>`` and
    ``<style>`` elements and HTML comments are removed, and the remaining
    text is returned with each stripped text fragment on its own line.

    Args:
        html_file_path: Path of the HTML file to read (decoded as UTF-8).
        output_file_path: Optional path. When given and non-empty, the
            extracted text is also written to this file as UTF-8.

    Returns:
        The extracted text as a single string; an empty string when the
        document contains no text.
    """
    with open(html_file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'lxml')

    # Scripts and styles carry no readable content
    for element in soup(['script', 'style']):
        element.decompose()

    # Comments are text nodes too, so they must be removed explicitly
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # soup.body is None when the parser produced no <body> (e.g. an empty
    # file); fall back to the whole tree instead of failing on None.
    root = soup.body if soup.body is not None else soup
    text_content = root.get_text(separator='\n', strip=True)

    # Optional: write the cleaned content to an output file
    if output_file_path:
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(text_content)

    return text_content

# Example usage
# cleaned_content = clean_html('example.html', 'cleaned_content.txt')
# print(cleaned_content)


## Parse Bookmarks

This code reads the exported HTML file from a Chromium browser and extracts the title and URL of each bookmark. The code uses the BeautifulSoup library to parse the HTML file and extract the required information.

In [2]:
from bs4 import BeautifulSoup
from datetime import datetime
import requests
import os

# Path separators and other characters rejected in file names on common
# file systems; replaced with '_' when building download file names.
_UNSAFE_FILE_NAME_CHARS = '<>:"/\\|?*'


def _safe_file_name_part(title):
    """Return *title* with spaces, control and file-name-unsafe characters replaced by '_'."""
    return ''.join(
        '_' if ch == ' ' or ch in _UNSAFE_FILE_NAME_CHARS or ord(ch) < 32 else ch
        for ch in title
    )


def _download_bookmark_html(url, file_path, timeout=30):
    """Fetch *url* and save the response text to *file_path*; return True if saved."""
    try:
        # A timeout keeps one unresponsive site from stalling the whole run
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return False

    if response.status_code != 200:
        return False

    try:
        with open(file_path, 'w', encoding='utf-8') as html_file:
            html_file.write(response.text)
    except OSError as e:
        # A bad file name or full disk should not abort the whole parse
        print(f"Error saving {url} to {file_path}: {e}")
        return False
    return True


def parse_bookmarks(html_file_path, start_date, end_date, download_dir=None):
    """Parse bookmarks from an HTML file exported from a browser.

    Args:
        html_file_path: Path to the exported bookmarks HTML file (UTF-8).
        start_date: First day to include, as a ``'YYYY-MM-DD'`` string.
        end_date: Last day to include, as a ``'YYYY-MM-DD'`` string. The
            whole end day is included.
        download_dir: Optional existing directory. When given, the page of
            every selected bookmark is fetched and saved there as
            ``<YYYYmmddHHMMSS>_<title>.html``.

    Returns:
        A flat list, in document order, of two kinds of dicts:

        * bookmarks: ``{'title', 'url', 'add_date', 'icon'}`` where
          ``add_date`` is a naive local-time ``datetime``; an extra
          ``'html_file'`` key holds the saved path when the page was
          downloaded successfully.
        * folder markers: ``{'folder_name', 'bookmarks'}``, one per ``<h3>``
          heading. ``'bookmarks'`` is always an empty list; it is kept only
          for backward compatibility.

        Links without an ``add_date`` attribute, with an unparsable one, or
        outside the date range are skipped.

    Raises:
        ValueError: If ``start_date`` or ``end_date`` is not ``'YYYY-MM-DD'``.
    """
    with open(html_file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'lxml')

    # Compare calendar dates so that the end day is included in full
    first_day = datetime.strptime(start_date, '%Y-%m-%d').date()
    last_day = datetime.strptime(end_date, '%Y-%m-%d').date()

    bookmarks = []

    # soup.body is None when the parser produced no <body> (e.g. empty file)
    root = soup.body if soup.body is not None else soup

    # find_all() already walks every descendant, so nested folders are
    # covered by this single pass; recursing into each folder's <dl> again
    # would append the same bookmarks more than once.
    for item in root.find_all(['a', 'h3']):
        if item.name == 'h3':
            bookmarks.append({
                'folder_name': item.get_text(),
                'bookmarks': [],
            })
            continue

        add_date = item.get('add_date')
        if not add_date:
            continue
        try:
            bookmark_date = datetime.fromtimestamp(int(add_date))
        except (ValueError, OverflowError, OSError):
            # Non-numeric or out-of-range timestamp: skip this entry
            continue
        if not first_day <= bookmark_date.date() <= last_day:
            continue

        url = item.get('href')
        title = item.get_text()
        bookmark = {
            'title': title,
            'url': url,
            'add_date': bookmark_date,
            'icon': item.get('icon'),
        }

        # Fetch and save HTML content of the bookmark if download_dir is specified
        if download_dir:
            file_name = (
                f"{bookmark_date.strftime('%Y%m%d%H%M%S')}"
                f"_{_safe_file_name_part(title)}.html"
            )
            file_path = os.path.join(download_dir, file_name)
            if _download_bookmark_html(url, file_path):
                bookmark['html_file'] = file_path

        bookmarks.append(bookmark)

    return bookmarks


In [None]:
# Example usage.
# The download directory is created up front so that it already exists if
# the download variant below is enabled.
download_dir = 'downloaded_bookmarks_html'
os.makedirs(download_dir, exist_ok=True)

# Variant that also saves each selected bookmark's page into download_dir
# (left disabled; uncomment to run):
# bookmarks = parse_bookmarks('bookmarks.html', '2022-01-01', '2022-12-31', download_dir)
# for bookmark in bookmarks:
#     print(bookmark)

# Example usage without downloading HTML: parse 'bookmarks.html' for the
# 2022-01-01 .. 2022-12-31 range and print every returned entry.
bookmarks_without_download = parse_bookmarks('bookmarks.html', '2022-01-01', '2022-12-31')
for bookmark in bookmarks_without_download:
    print(bookmark)
