In [10]:
import numpy as np
import pandas as pd
import requests
import os
from bs4 import BeautifulSoup
import re
import time
from urllib.parse import urljoin, urlparse

In [None]:
# # URL of the page you want to scrape
# url = 'https://www.example.com/user-agreement'

# # Make a request to get the page content
# response = requests.get(url)

# # Parse the HTML content of the page
# soup = BeautifulSoup(response.text, 'html.parser')

# # Identify the container of the user agreement section.
# # This is a placeholder; you'll need to inspect the webpage to find the right selector
# user_agreement_container = soup.find(id='userAgreementSection')

# # If the container was found, print its content
# if user_agreement_container:
#     print(user_agreement_container.text)
# else:
#     print('User agreement section not found.')

In [4]:
# === Load CSV ===
# The sites to crawl are read from the "URL" column of company_list.csv.
df = pd.read_csv("company_list.csv")
if "URL" not in df.columns:
    # Fail with an actionable message instead of a bare KeyError: 'URL'.
    raise KeyError(
        f'company_list.csv has no "URL" column (found: {list(df.columns)})'
    )
# Drop missing entries and duplicates so every site is visited at most once.
urls = df["URL"].dropna().unique()

# === Output Directory ===
# Scraped agreement texts are written here; exist_ok makes the cell re-runnable.
os.makedirs("agreements", exist_ok=True)

# === Keyword list (case-insensitive search) ===
# Phrases that mark a link text as pointing to a legal / agreement page.
keyword_list = [
    "privacy", "user agreement", "terms", "conditions",
    "data use", "policy", "legal", "disclaimer", "statement"
]
# Case-insensitive alternation over keyword_list. re.escape keeps every keyword
# literal, so adding one that contains regex metacharacters stays safe.
keyword_pattern = re.compile("|".join(map(re.escape, keyword_list)), re.IGNORECASE)

In [8]:
def is_relevant_link(text):
    """Tell whether `text` mentions any entry of `keyword_list`.

    `text` is lowercased first and each keyword is then looked up with a
    plain substring test.

    Returns:
        True if at least one keyword occurs in the lowercased text,
        False otherwise.
    """
    lowered = text.lower()
    for keyword in keyword_list:
        if keyword in lowered:
            return True
    return False

# Request headers carrying a desktop-Chrome User-Agent string.
HEADERS = {
    "User-Agent": " ".join((
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/91.0.4472.124 Safari/537.36",
    )),
}

In [15]:
def _fetch_soup(page_url):
    """Download `page_url` and parse it as HTML.

    Returns:
        A `(soup, status_code)` pair. `soup` is None whenever the server
        answered with anything other than HTTP 200.

    Raises:
        Whatever `requests.get` raises (timeouts, connection errors, ...);
        the caller decides how to report it.
    """
    response = requests.get(page_url, headers=HEADERS, timeout=10)
    if response.status_code != 200:
        return None, response.status_code
    return BeautifulSoup(response.text, 'html.parser'), response.status_code


def _resolve_http_url(base_url, href):
    """Resolve `href` against `base_url`; return None unless it is an http(s) URL.

    Filters out hrefs such as mailto:, tel: or javascript: that can never be
    fetched, as well as hrefs that urllib cannot parse at all.
    """
    try:
        resolved = urljoin(base_url, href)
        scheme = urlparse(resolved).scheme
    except ValueError:
        return None
    return resolved if scheme in ("http", "https") else None


def _largest_text_block(soup):
    """Return the longest non-empty text of any div/section/article/main tag.

    Returns None when the page has no such tag with text in it.
    """
    texts = (
        tag.get_text(separator=" ", strip=True)
        for tag in soup.find_all(['div', 'section', 'article', 'main'])
    )
    return max((text for text in texts if text), key=len, default=None)


file_index = 1
# One record per saved file, so each NNNN.txt can be traced back to its site.
saved_agreements = []

for i, url in enumerate(urls):
    print(f"[{i+1}/{len(urls)}] Checking {url}")
    try:
        soup, status = _fetch_soup(url)
        if soup is None:
            print(f" - Skipping (HTTP {status})")
        else:
            found = False
            tried = set()  # agreement URLs already attempted for this site

            for link in soup.find_all('a', href=True):
                link_text = link.get_text(strip=True).lower()
                if not is_relevant_link(link_text):
                    continue

                # Resolve absolute URL; skip non-http(s) targets and repeats
                # (the same legal link usually sits in both header and footer).
                agreement_url = _resolve_http_url(url, link['href'])
                if agreement_url is None or agreement_url in tried:
                    continue
                tried.add(agreement_url)
                print(f" - Found link: {link_text} → {agreement_url}")

                try:
                    # Fetch the agreement page
                    agreement_soup, agreement_status = _fetch_soup(agreement_url)
                    if agreement_soup is None:
                        print(f"   - Failed to fetch: HTTP {agreement_status}")
                        continue

                    # Choose the longest block of text (likely the full agreement)
                    main_text = _largest_text_block(agreement_soup)
                    if main_text is None:
                        continue

                    filename = f"agreements/{file_index:04d}.txt"
                    with open(filename, "w", encoding="utf-8") as f:
                        f.write(main_text)

                    print(f"   - Saved to {filename}")
                    saved_agreements.append(
                        {"site": url, "agreement_url": agreement_url, "file": filename}
                    )
                    file_index += 1
                    found = True
                    break  # Stop after one valid link found

                except Exception as e:
                    # Best effort: report and move on to the next candidate link.
                    print(f"   - Error fetching linked page: {e}")

            if not found:
                if tried:
                    print(" - Relevant links found, but none could be saved.")
                else:
                    print(" - No relevant agreement links found.")

    except Exception as e:
        print(f" - Error accessing {url}: {e}")

    # Polite pause; reached on every path, including non-200 homepages.
    time.sleep(1)

print("\n Finished all sites.")

[1/22] Checking https://bushelpowered.com/
 - Found link: privacy notice → https://bushelpowered.com/privacy-notice
   - Saved to agreements/0001.txt
[2/22] Checking https://www.cibotechnologies.com/
 - Found link: privacy policy → http://www.cibotechnologies.com/privacy-policy/
   - Saved to agreements/0002.txt
[3/22] Checking https://agrevolution.in/
 - No relevant agreement links found.
[4/22] Checking https://www.fbn.com/
 - Found link: terms of service → https://www.fbn.com/terms-of-service
   - Saved to agreements/0003.txt
[5/22] Checking https://www.indigoag.com/
 - Found link: privacy policy | → https://www.indigoag.com/privacy-policy?hsLang=en-us
   - Saved to agreements/0004.txt
[6/22] Checking https://producepay.com/
 - Skipping (HTTP 403)
[7/22] Checking https://agrivida.com/
 - No relevant agreement links found.
[8/22] Checking https://innovafeed.com/en/
 - Found link: legal notice → https://innovafeed.com/mentions-legales/
   - Saved to agreements/0005.txt
[9/22] Checking