In [12]:
# Cell 1: Imports and folder setup
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Folder to save PDFs.
# The location can be overridden with the FARA_DOWNLOAD_DIR environment
# variable so the notebook is not tied to one machine's home directory; when
# the variable is unset (or empty) the original hard-coded location is used.
folder_path = os.environ.get("FARA_DOWNLOAD_DIR") or (
    "/Users/towcenter/Desktop/fara pakistan scrape_files/fara_files"
)
os.makedirs(folder_path, exist_ok=True)  # exist_ok: re-running the cell is harmless
print(f"Folder ready: {folder_path}")


Folder ready: /Users/towcenter/Desktop/fara pakistan scrape_files/fara_files


In [13]:
# Cell 2: Scrape first page and download PDFs
base_url = "https://search.justice.gov"
search_url = "https://search.justice.gov/search?query=%22pakistan%22&op=Search&affiliate=justice_fara&page=1"

# requests has no default timeout, so one is set explicitly to keep a stalled
# connection from hanging the cell; raise_for_status() stops here on an HTTP
# error instead of parsing an error page as if it were search results.
response = requests.get(search_url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Find all results: <div> elements whose class attribute is
# "content-block-item result".
results = soup.find_all("div", class_="content-block-item result")
print(f"Found {len(results)} results on this page.")

for result in results:
    # The first link inside a result block is taken as the document link.
    pdf_link_tag = result.find("a", href=True)
    if pdf_link_tag:
        # urljoin leaves absolute hrefs untouched and resolves relative ones
        # against base_url.
        pdf_url = urljoin(base_url, pdf_link_tag['href'])
        # The link text is used as the file name on disk.
        pdf_name = pdf_link_tag.text.strip()
        pdf_path = os.path.join(folder_path, pdf_name)

        # Download PDF; fail loudly on an HTTP error rather than saving the
        # error body under a .pdf name.
        r = requests.get(pdf_url, timeout=15)
        r.raise_for_status()
        with open(pdf_path, "wb") as f:
            f.write(r.content)
        print(f"Downloaded: {pdf_name}")



Found 20 results on this page.


SSLError: HTTPSConnectionPool(host='efile.fara.gov', port=443): Max retries exceeded with url: /docs/6972-Informational-Materials-20240726-2.pdf (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1032)')))

In [14]:
# Retry the download of pdf_url (left over from the loop above) with
# certificate verification switched off.
# NOTE(review): verify=False disables TLS certificate checks, so the server's
# identity is not authenticated -- prefer fixing the local CA bundle.
# A timeout is set so a stalled connection cannot hang the cell.
r = requests.get(pdf_url, verify=False, timeout=15)




In [15]:
# Cell 2 (retry): Scrape first page and download PDFs (ignore SSL warnings)
import urllib3

# Silence the InsecureRequestWarning that urllib3 emits for every request made
# with verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

base_url = "https://search.justice.gov"
search_url = "https://search.justice.gov/search?query=%22pakistan%22&op=Search&affiliate=justice_fara&page=1"

# NOTE(review): verify=False turns off TLS certificate verification (used here
# to get past the CERTIFICATE_VERIFY_FAILED error for efile.fara.gov). The
# downloads are therefore not authenticated -- prefer repairing the CA bundle.
# Timeouts and raise_for_status() keep the cell from hanging or from treating
# an HTTP error page as real content.
response = requests.get(search_url, verify=False, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# <div> elements whose class attribute is "content-block-item result".
results = soup.find_all("div", class_="content-block-item result")
print(f"Found {len(results)} results on this page.")

for result in results:
    # The first link inside a result block is taken as the document link.
    pdf_link_tag = result.find("a", href=True)
    if pdf_link_tag:
        pdf_url = urljoin(base_url, pdf_link_tag['href'])
        # The link text is used as the file name on disk.
        pdf_name = pdf_link_tag.text.strip()
        pdf_path = os.path.join(folder_path, pdf_name)

        r = requests.get(pdf_url, verify=False, timeout=15)
        r.raise_for_status()  # do not save an HTTP error body as a PDF
        with open(pdf_path, "wb") as f:
            f.write(r.content)
        print(f"Downloaded: {pdf_name}")


Found 20 results on this page.
Downloaded: 6972-Informational-Materials-20240726-2.pdf
Downloaded: 6972-Supplemental-Statement-20240726-6.pdf
Downloaded: 7270-Supplemental-Statement-20231228-1.pdf
Downloaded: 7361-Supplemental-Statement-20240828-1.pdf
Downloaded: 7361-Informational-Materials-20240321-2.pdf
Downloaded: 6807-Supplemental-Statement-20231030-10.pdf
Downloaded: 7361-Informational-Materials-20240321-1.pdf
Downloaded: 5870-Supplemental-Statement-20220330-30.pdf
Downloaded: 7361-Exhibit-AB-20240122-1.pdf
Downloaded: 7361-Exhibit-AB-20240312-2.pdf
Downloaded: 6972-Supplemental-Statement-20230725-4.pdf
Downloaded: 7270-Exhibit-AB-20230511-1.pdf
Downloaded: 6972-Exhibit-AB-20220809-3.pdf
Downloaded: 6972-Informational-Materials-20230725-1.pdf
Downloaded: 7267-Exhibit-AB-20241022-5.pdf
Downloaded: 7351-Registration-Statement-20231208-1.pdf
Downloaded: 7270-Registration-Statement-20230511-1.pdf
Downloaded: 6979-Exhibit-AB-20221019-2.pdf
Downloaded: 6682-Amendment-20221214-7.pdf
Dow

In [17]:
# Shell escape: run `open .` on the kernel's current working directory
# (presumably to browse it in the macOS file manager).
# NOTE(review): this targets the working directory, which is not necessarily
# folder_path -- confirm that is the directory meant to be inspected.
!open .


In [18]:
# Rebuild and display the destination path for the current pdf_name.
# NOTE(review): pdf_name is not defined in this cell; it is whatever value a
# previously executed download loop left in the kernel, so the output depends
# on execution order.
pdf_path = os.path.join(folder_path, pdf_name)
print(f"Saving PDF to: {pdf_path}")


Saving PDF to: /Users/towcenter/Desktop/fara pakistan scrape_files/fara_files/5870-Exhibit-AB-20240705-46.pdf


In [21]:
import re

# Function to sanitize filenames
def clean_filename(filename):
    """Return ``filename`` with characters unsafe in a file name replaced by "_".

    Replaced characters are the path separators and reserved characters
    ``\\ / : * ? " < > |`` plus the ASCII control characters (0x00-0x1F, which
    includes NUL, tabs and newlines). All other characters are kept, so an
    already-safe name is returned unchanged.

    Args:
        filename: Proposed file name (a single path component, not a path).

    Returns:
        The sanitized name; it has the same length as the input because each
        offending character is replaced one-for-one.
    """
    # The backslash must be escaped as "\\" inside the class: a lone "\/" only
    # matches "/", which would let backslashes through. Control characters are
    # included because a NUL byte makes open() raise ValueError.
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", filename)

# Cell 5: Loop over all pages with sanitized filenames
import time

base_url = "https://search.justice.gov"
total_pages = 55  # approximate total pages to cover 819 results

for page_num in range(1, total_pages + 1):
    search_url = f"https://search.justice.gov/search?query=%22pakistan%22&op=Search&affiliate=justice_fara&page={page_num}"
    print(f"\nScraping page {page_num}/{total_pages} ...")

    try:
        # NOTE(review): verify=False disables TLS certificate verification, so
        # responses are not authenticated -- prefer repairing the CA bundle.
        response = requests.get(search_url, verify=False, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # A page that fails to load is reported and skipped; the run continues.
        print(f"Failed to load page {page_num}: {e}")
        continue

    soup = BeautifulSoup(response.text, "html.parser")
    # <div> elements whose class attribute is "content-block-item result".
    results = soup.find_all("div", class_="content-block-item result")
    print(f"Found {len(results)} results on this page.")

    for result in results:
        # The first link inside a result block is taken as the document link.
        pdf_link_tag = result.find("a", href=True)
        if not pdf_link_tag:
            continue

        pdf_url = urljoin(base_url, pdf_link_tag['href'])
        pdf_name = clean_filename(pdf_link_tag.text.strip())
        if not pdf_name:
            # An empty name would make pdf_path point at the folder itself,
            # which always "exists" and would be misreported as a duplicate.
            print(f"Skipped link with no text: {pdf_url}")
            continue
        pdf_path = os.path.join(folder_path, pdf_name)

        # Files already on disk are not fetched again, so the cell can be
        # re-run to resume an interrupted scrape.
        if os.path.exists(pdf_path):
            print(f"Already exists: {pdf_name}")
            continue

        # Write to a temporary sibling first and rename on success, so an
        # interrupted write never leaves a truncated file under the final
        # name (which the exists-check above would then skip forever).
        partial_path = pdf_path + ".part"
        try:
            r = requests.get(pdf_url, verify=False, timeout=15)
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                f.write(r.content)
            os.replace(partial_path, pdf_path)
            print(f"Downloaded: {pdf_name}")
        except (requests.exceptions.RequestException, OSError) as e:
            # Network and filesystem errors both skip this one document
            # instead of aborting the whole run.
            try:
                os.remove(partial_path)
            except OSError:
                pass  # best-effort cleanup; usually nothing was written
            print(f"Skipped {pdf_name} due to error: {e}")

    time.sleep(1)  # pause between result pages to go easy on the server



Scraping page 1/55 ...
Found 20 results on this page.
Downloaded: 6972-Informational-Materials-20240726-2.pdf
Downloaded: 6972-Supplemental-Statement-20240726-6.pdf
Downloaded: 7270-Supplemental-Statement-20231228-1.pdf
Downloaded: 7361-Supplemental-Statement-20240828-1.pdf
Downloaded: 7361-Informational-Materials-20240321-2.pdf
Downloaded: 6807-Supplemental-Statement-20231030-10.pdf
Downloaded: 7361-Informational-Materials-20240321-1.pdf
Downloaded: 5870-Supplemental-Statement-20220330-30.pdf
Downloaded: 7361-Exhibit-AB-20240122-1.pdf
Downloaded: 7361-Exhibit-AB-20240312-2.pdf
Downloaded: 6972-Supplemental-Statement-20230725-4.pdf
Downloaded: 7270-Exhibit-AB-20230511-1.pdf
Downloaded: 6972-Exhibit-AB-20220809-3.pdf
Downloaded: 6972-Informational-Materials-20230725-1.pdf
Downloaded: 7267-Exhibit-AB-20241022-5.pdf
Downloaded: 7351-Registration-Statement-20231208-1.pdf
Downloaded: 7270-Registration-Statement-20230511-1.pdf
Downloaded: 6979-Exhibit-AB-20221019-2.pdf
Downloaded: 6682-Amen

  def object_was_parsed(


Found 20 results on this page.
Downloaded: 6672-Informational-Materials-20201029-525.pdf
Downloaded: 6911-Registration-Statement-20210120-2.pdf
Downloaded: 6869-Informational-Materials-20220104-416.pdf
Downloaded: 3718-Informational-Materials-20190923-5.pdf
Downloaded: 7246-Informational-Materials-20240725-66.pdf
Downloaded: 6912-Registration-Statement-20210121-2.pdf
Downloaded: 6845-Informational-Materials-20201122-83.pdf
Downloaded: 5666-Informational-Materials-20170330-47.pdf
Downloaded: 6869-Informational-Materials-20220511-518.pdf
Downloaded: 8_29_22 Monday
Downloaded: 7246-Informational-Materials-20240517-49.pdf
Downloaded: 6921-Registration-Statement-20210202-2.pdf
Downloaded: 6869-Informational-Materials-20211026-365.pdf
Downloaded: 6869-Informational-Materials-20210524-225.pdf
Downloaded: 6869-Informational-Materials-20200105-82.pdf
Downloaded: 6869-Informational-Materials-20210930-344.pdf
Downloaded: 3718-Informational-Materials-20190812-3.pdf
Downloaded: 6869-Informational-M

In [3]:
# Shell escape: print the kernel's current working directory.
!pwd


/Users/towcenter/Desktop/python/fara pakistan scrape_files
