In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
from collections import deque
from requests.exceptions import RequestException
from pyspark.sql import SparkSession

In [None]:
from urllib.parse import urlparse, urljoin
from collections import deque
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
import time
from pyspark.sql import SparkSession

def normalize_url(url):
    """Return a canonical form of ``url`` so that equivalent links compare equal.

    Rules applied:
      * a missing scheme defaults to ``http``;
      * the network location is lower-cased and a leading ``www.`` on the
        host is removed;
      * an empty path, a path made only of slashes, ``/index.html`` and
        ``/index.htm`` all collapse to the root path ``/``;
      * trailing slashes are removed from every other path;
      * the query string is preserved and the fragment is discarded.

    Args:
        url: URL string to normalize.

    Returns:
        The normalized URL string.
    """
    parsed = urlparse(url)
    scheme = parsed.scheme or 'http'

    # Strip only a *leading* "www." label from the host.  A blanket
    # str.replace() would also mangle hosts such as "awww.example.com" or
    # "cdn.www.example.com".  Any "user:pass@" prefix is kept in front.
    userinfo, at_sign, host = parsed.netloc.lower().rpartition('@')
    if host.startswith('www.'):
        host = host[len('www.'):]
    netloc = f"{userinfo}{at_sign}{host}"

    path = parsed.path
    # strip('/') treats "", "/", "//", ... uniformly as the root so they all
    # end up with the same trailing-slash form.
    if not path.strip('/') or path.lower() in ('/index.html', '/index.htm'):
        normalized = f"{scheme}://{netloc}/"  # Always include trailing slash for root
    else:
        normalized = f"{scheme}://{netloc}{path.rstrip('/')}"

    if parsed.query:
        normalized = f"{normalized}?{parsed.query}"
    return normalized

def crawl_website_bfs(start_url, domain, output_file="edges.csv", max_pages=5000, sleep_time=0.5):
    """Breadth-first crawl of a single domain, saving the link graph as CSV.

    Starting at ``start_url``, each page is downloaded with ``requests``, its
    ``<a href>`` targets are extracted with BeautifulSoup, and every link that
    stays on ``domain`` is recorded as a ``(src, dst)`` edge of normalized
    URLs.  When the crawl ends, the edges are written through Spark to
    ``output_file``.

    Args:
        start_url: URL where the crawl begins.
        domain: Host that links must belong to; compared case-insensitively
            with a leading ``www.`` ignored.
        output_file: Path passed to ``DataFrame.write.csv`` (written in
            ``overwrite`` mode with a header).
        max_pages: Upper bound on the number of URLs visited.  URLs whose
            fetch fails or is skipped still count towards this limit.
        sleep_time: Seconds to pause after every attempted request,
            including ones that are skipped or raise.

    Returns:
        The Spark DataFrame of edges (columns ``src`` and ``dst``), or
        ``None`` when no edges were collected.

        NOTE(review): the Spark session is stopped before this function
        returns, so actions on the returned DataFrame will presumably fail;
        confirm whether any caller needs a live DataFrame.
    """
    def _strip_www(host):
        # Remove only a leading "www." label; replace() would also hit hosts
        # that merely contain the substring (e.g. "awww.example.com").
        host = host.lower()
        return host[len('www.'):] if host.startswith('www.') else host

    # Normalize domain for consistent comparison
    domain = _strip_www(domain)

    # Initialize Spark session up front so a broken Spark setup fails before
    # the (potentially long) crawl rather than after it.
    spark = SparkSession.builder.appName("WebCrawler").getOrCreate()
    visited = set()
    edges = set()
    first_url = normalize_url(start_url)
    queue = deque([first_url])
    # Every URL ever queued.  Membership tests on a deque are linear scans,
    # which made link de-duplication quadratic on large sites.
    enqueued = {first_url}

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # File types that are never HTML pages.  Built once as a tuple so it can
    # be handed directly to str.endswith().
    static_extensions = (
        '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.ico', '.svg', '.webp',
        '.css', '.js',
        '.mp3', '.wav', '.ogg', '.aac',
        '.mp4', '.avi', '.mov', '.wmv', '.flv', '.mkv',
        '.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.odt', '.ods', '.odp',
        '.zip', '.rar', '.tar', '.gz', '.7z',
        '.xml', '.json', '.txt', '.csv', '.md',
        '.eot', '.woff', '.woff2', '.ttf', '.otf',
    )

    def is_valid_url(url, domain):
        """Return True for http(s) URLs on ``domain`` that do not look like static files."""
        try:
            parsed = urlparse(url)
        except ValueError:
            return False
        if parsed.scheme not in ('http', 'https'):
            return False
        if _strip_www(parsed.netloc) != domain:
            return False
        # Test the path only: checking the whole URL lets "file.pdf?dl=1"
        # through and wrongly rejects "/page?name=a.txt".
        return not parsed.path.lower().endswith(static_extensions)

    def fetch_links(page_url):
        """Download ``page_url`` and return its normalized same-domain link targets.

        Returns an empty list when the page is skipped (403/404, non-HTML
        content, or an HTML parse error).  Request errors propagate.
        """
        response = requests.get(page_url, headers=headers, timeout=10)
        if response.status_code in (403, 404):
            print(f"Skipping {response.status_code} error at {page_url}")
            return []
        response.raise_for_status()
        content_type = response.headers.get('content-type', '')
        if 'text/html' not in content_type.lower():
            print(f"Skipping non-HTML content at {page_url}: {content_type}")
            return []
        try:
            soup = BeautifulSoup(response.text, 'html.parser')
        except Exception as e:
            print(f"Error parsing HTML at {page_url}: {e}")
            return []

        targets = []
        for link in soup.find_all('a', href=True):
            try:
                absolute_url = urljoin(page_url, link['href'])
                if not is_valid_url(absolute_url, domain):
                    continue
                targets.append(normalize_url(absolute_url))
            except ValueError:
                # A single malformed href (e.g. a broken IPv6 literal) must
                # not discard the remaining links on the page.
                continue
        return targets

    print(f"Starting BFS crawl from {start_url}")

    while queue and len(visited) < max_pages:
        url = queue.popleft()
        normalized_url = normalize_url(url)  # Ensure URL is normalized

        if normalized_url in visited:
            continue
        visited.add(normalized_url)
        print(f"Crawling: {normalized_url} (Visited: {len(visited)}/{max_pages}, Queue: {len(queue)})")

        try:
            for target in fetch_links(normalized_url):
                if target != normalized_url:  # ignore self-links
                    edges.add((normalized_url, target))
                if target not in visited and target not in enqueued:
                    enqueued.add(target)
                    queue.append(target)
        except RequestException as e:
            print(f"Error crawling {normalized_url}: {e}")
        except Exception as e:
            print(f"Unexpected error at {normalized_url}: {e}")

        # Runs after every attempted request, including skipped pages, so
        # a run of 404s cannot hammer the server without any delay.
        time.sleep(sleep_time)

    # Save edges to CSV
    try:
        if edges:
            edge_list = list(edges)
            spark_df = spark.createDataFrame(edge_list, ["src", "dst"])
            spark_df = spark_df.dropDuplicates(["src", "dst"])
            spark_df = spark_df.filter(spark_df.src != spark_df.dst)

            # Coalesce to a single partition for a single CSV file
            spark_df = spark_df.coalesce(1)
            spark_df.write.csv(output_file, header=True, mode="overwrite")
            print(f"Saved {spark_df.count()} edges to {output_file}")
            print("\nSample of collected edges:")
            spark_df.show(5, truncate=False)
            return spark_df
        else:
            print("No edges collected.")
            return None
    finally:
        # Stop Spark session
        spark.stop()

# Example usage: crawl it.tdtu.edu.vn and write the collected edges to edges.csv.
if __name__ == "__main__":
    domain = "it.tdtu.edu.vn"
    start_url = "https://it.tdtu.edu.vn"
    df = crawl_website_bfs(
        start_url,
        domain,
        output_file="edges.csv",
        max_pages=50000,
        sleep_time=0.3,
    )

Starting BFS crawl from https://it.tdtu.edu.vn
Crawling: https://it.tdtu.edu.vn/ (Visited: 1/50000, Queue: 0)
Crawling: https://it.tdtu.edu.vn/en (Visited: 2/50000, Queue: 40)
Crawling: https://it.tdtu.edu.vn/giao-vien (Visited: 3/50000, Queue: 61)
Crawling: https://it.tdtu.edu.vn/gioi-thieu (Visited: 4/50000, Queue: 64)
Crawling: https://it.tdtu.edu.vn/giao-duc (Visited: 5/50000, Queue: 84)
Crawling: https://it.tdtu.edu.vn/khoa-hoc-cong-nghe (Visited: 6/50000, Queue: 86)
Crawling: https://it.tdtu.edu.vn/tin-tuc-khoa (Visited: 7/50000, Queue: 90)
Crawling: https://it.tdtu.edu.vn/doanh-nghiep (Visited: 8/50000, Queue: 101)
Crawling: https://it.tdtu.edu.vn/tuyen-sinh (Visited: 9/50000, Queue: 103)
Crawling: https://it.tdtu.edu.vn/vien-chuc (Visited: 10/50000, Queue: 109)
Skipping 404 error at https://it.tdtu.edu.vn/vien-chuc
Crawling: https://it.tdtu.edu.vn/sinh-vien (Visited: 11/50000, Queue: 108)
Skipping 404 error at https://it.tdtu.edu.vn/sinh-vien
Crawling: https://it.tdtu.edu.vn/ic

In [None]:
import csv

# Names of the input and output files
input_file = 'part-00000-2971e2b7-3e1b-43e0-a718-dee5982a55bb-c000.csv'
output_file = 'filtered_https_output.csv'


def _row_mentions_https(line):
    """Return True when ``line`` is a two-column CSV row with 'https' in either column.

    The line is parsed with the ``csv`` module rather than ``str.split(',')``
    so that a quoted field containing a comma (e.g. a URL whose query string
    has commas) is still read as one column instead of being rejected.
    Rows that do not have exactly two columns, or that the CSV parser cannot
    read, are reported as non-matching.
    """
    try:
        fields = next(csv.reader([line.strip()]), [])
    except csv.Error:
        return False
    if len(fields) != 2:
        # Skip malformed rows (cannot be split into src,dst)
        return False
    src, dst = fields
    return 'https' in src or 'https' in dst


# Open the input file for reading and the output file for writing
with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
    # Read the input line by line
    for line in infile:
        # Copy the original line unchanged when src or dst contains 'https'.
        # A "src,dst" header row contains no 'https', so it is not copied.
        if _row_mentions_https(line):
            outfile.write(line)

print(f"Đã lọc xong các dòng chứa https vào file {output_file}")