In [None]:
import pandas as pd
import requests
import time
from urllib.parse import urlparse
from bs4 import BeautifulSoup

# --- UTILITY FUNCTIONS ---

def is_valid_url(url):
    """Return True when *url* is an absolute HTTP(S) URL that names a host.

    Parameters:
    - url: Value to check. Non-string values (for example NaN from an empty
      CSV cell, or None) are rejected instead of raising.

    Returns:
    - bool: True only if the scheme is http/https and a network location
      (host) is present.
    """
    if not isinstance(url, str):
        return False
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises ValueError on malformed input such as an
        # unbalanced IPv6 bracket ("http://[::1").
        return False
    # A scheme alone ("http://") is not fetchable, so require a netloc too.
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)

def get_domain_status(url):
    """Request *url* once and report its HTTP status and elapsed time.

    Parameters:
    - url (str): Address to fetch; the request is issued with timeout=5.

    Returns:
    - tuple: (status_code, load_time_sec) on success, where load_time_sec is
      the elapsed time in seconds rounded to 2 decimals, or
      (error_message, None) when the request raises.
    """
    try:
        # perf_counter is monotonic, so the measurement cannot be skewed by
        # wall-clock adjustments the way time.time() can.
        start = time.perf_counter()
        response = requests.get(url, timeout=5)
        load_time = round(time.perf_counter() - start, 2)
        return response.status_code, load_time
    except Exception as e:
        # Deliberately best-effort: any failure is stored as text in the
        # status slot so a batch run keeps going.
        return str(e), None

# Ordered (label, markers) pairs; the first label with a matching marker wins.
# Markers are lowercase because the page text is lowercased before matching.
_CMS_FINGERPRINTS = (
    ("WordPress", ("wp-content", "wordpress")),
    ("Shopify", ("shopify",)),
    ("Drupal", ("drupal",)),
    ("Joomla", ("joomla",)),
    ("Squarespace", ("squarespace",)),
    ("Wix", ("wix.com",)),
)


def detect_cms(url):
    """Guess the CMS behind *url* from substrings in the fetched page.

    This is a plain substring heuristic over the response body, compared
    case-insensitively so that spellings such as "Drupal" or "Joomla!" in a
    generator tag are recognised as well as lowercase asset paths.

    Parameters:
    - url (str): Address to fetch; the request is issued with timeout=5.

    Returns:
    - str: "WordPress", "Shopify", "Drupal", "Joomla", "Squarespace" or
      "Wix" for the first fingerprint found, "Unknown" when none match, or
      "Error" when the page could not be fetched.
    """
    try:
        response = requests.get(url, timeout=5)
        html = response.text.lower()
    except Exception:
        # Best-effort: a failed fetch is reported, not raised. Catching
        # Exception (not a bare except) lets Ctrl-C still stop the run.
        return "Error"

    for cms, markers in _CMS_FINGERPRINTS:
        if any(marker in html for marker in markers):
            return cms

    return "Unknown"

# --- MAIN FUNCTION ---

def filter_links(input_csv="actual_websites.csv", output_csv="filtered_websites.csv"):
    """Check every URL in *input_csv* and write the findings to *output_csv*.

    For each value in the 'website' column that passes is_valid_url, the
    HTTP status, load time and detected CMS are collected. Invalid values
    are reported on stdout and skipped.

    Parameters:
    - input_csv (str): Path to a CSV file with a 'website' column.
    - output_csv (str): Path of the CSV to write, with the columns
      website, status, cms and load_time_sec.

    Raises:
    - ValueError: If the input CSV has no 'website' column.
    """
    df = pd.read_csv(input_csv)

    if "website" not in df.columns:
        raise ValueError("Input CSV must contain a 'website' column.")

    print(f"[+] Processing {len(df)} websites for status, CMS, and speed...")

    results = []

    # Only the 'website' column is needed, so iterate it directly.
    for url in df["website"]:
        if not is_valid_url(url):
            print(f"[!] Skipping invalid URL: {url}")
            continue

        print(f"    → Checking: {url}")

        status, speed = get_domain_status(url)
        cms = detect_cms(url)

        results.append({
            "website": url,
            "status": status,
            "cms": cms,
            "load_time_sec": speed
        })

    # Fix the column list explicitly: with no results the frame would
    # otherwise have no columns and the CSV would be written without a
    # header row, which a later read of that file cannot use.
    result_df = pd.DataFrame(
        results, columns=["website", "status", "cms", "load_time_sec"]
    )
    result_df.to_csv(output_csv, index=False)

    print(f"[✓] Saved filtered results to {output_csv}")

# --- OPTIONAL: TESTING CLI ---
# Runs filter_links with its default CSV paths when this code is executed as
# the main module.
# NOTE(review): the "In [None]:" markers suggest a notebook export; if so,
# this guard is true inside a cell and the call runs whenever the cell is
# executed — confirm that is intended.
if __name__ == "__main__":
    filter_links()


In [None]:
import pandas as pd

def save_valid_websites(input_csv, output_csv, max_load_time):
    """
    Keep only the rows whose status is 200 and whose load time does not
    exceed max_load_time, and write them to output_csv.

    Parameters:
    - input_csv (str): Path to the CSV holding 'status' and 'load_time_sec'
      columns (e.g. filtered_websites.csv).
    - output_csv (str): Path of the CSV to write the selected rows to.
    - max_load_time (float): Largest acceptable load time, in seconds.
    """
    frame = pd.read_csv(input_csv)

    # Cells that are not numbers (such as error text in 'status') turn into
    # NaN here and therefore fail both comparisons below.
    for column in ("status", "load_time_sec"):
        frame[column] = pd.to_numeric(frame[column], errors="coerce")

    is_ok = frame["status"].eq(200)
    is_fast_enough = frame["load_time_sec"].le(max_load_time)
    selected = frame.loc[is_ok & is_fast_enough]

    selected.to_csv(output_csv, index=False)
    print(f"[✓] Saved {len(selected)} valid websites to '{output_csv}'")

# Example usage: read "filtered_websites.csv" (the default output path of
# filter_links) and keep the sites that answered 200 within 2.0 seconds,
# writing them to "selected_websites.csv".
# NOTE(review): the input file must already exist when this runs — confirm
# the filter_links step is always executed first.
if __name__ == "__main__":
    save_valid_websites("filtered_websites.csv", "selected_websites.csv", max_load_time=2.0)
