In [1]:
import warnings
# Install a blanket "ignore" filter: no category/module is given, so it matches every warning.
warnings.filterwarnings("ignore")

In [2]:
from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode

# Normalize and Dedup URLs

Explanation

Normalization Function (normalize_url):

Scheme and Netloc: Convert the scheme (http, https) and netloc (domain) to lowercase to handle case insensitivity.
Path: Remove trailing slashes for consistency.
Query Parameters: Parse and sort the query parameters to ensure consistent ordering.
Rebuild URL: Combine the components back into a normalized URL.
De-duplication Function (dedup_urls):

Use a set to track seen URLs.
Iterate through the list of URLs and normalize each one; if the normalized form hasn't been seen before, add it to the set and keep the original URL in the result.

In [10]:
def normalize_url(url):
    """Return a canonical form of ``url`` for duplicate detection.

    Normalization steps:
      * the scheme is lower-cased;
      * the host/port part of the netloc is lower-cased, while an optional
        ``user:password@`` prefix is kept verbatim (credentials are
        case-sensitive);
      * trailing slashes are stripped from the path;
      * query parameters are parsed, sorted and re-encoded. Parameters with
        an empty value (``?a=``) are preserved, so URLs that differ only by
        such a parameter are not collapsed into one.

    The ``params`` and ``fragment`` components are passed through unchanged.

    Args:
        url: The URL string to normalize.

    Returns:
        The normalized URL as a string.
    """
    parsed_url = urlparse(url)

    scheme = parsed_url.scheme.lower()

    # Only the host (and port) is case-insensitive; leave any userinfo as-is.
    userinfo, at_sign, host_port = parsed_url.netloc.rpartition('@')
    netloc = userinfo + at_sign + host_port.lower()

    path = parsed_url.path.rstrip('/')

    # keep_blank_values=True: parse_qsl silently drops "key=" pairs otherwise.
    query_pairs = parse_qsl(parsed_url.query, keep_blank_values=True)
    query = urlencode(sorted(query_pairs))

    return urlunparse((scheme, netloc, path, parsed_url.params, query, parsed_url.fragment))

In [13]:
def dedup_urls(urls):
    """Drop URLs whose normalized form has already been encountered.

    Each URL is normalized with ``normalize_url``; the first URL producing a
    given normalized form is kept (in its original spelling) and later ones
    are skipped. Input order is preserved.

    Args:
        urls: Iterable of URL strings.

    Returns:
        List of the original URL strings that were kept.
    """
    kept = []
    seen_keys = set()
    for original in urls:
        key = normalize_url(original)
        if key in seen_keys:
            continue
        seen_keys.add(key)
        kept.append(original)
    return kept

# Testing

In [16]:
# Example usage
# Sample input: one path spelled with and without a trailing slash, with the
# query parameters in different orders, with a fragment, and under both http
# and https, plus one unrelated URL on a different host.
urls = [
    "https://example.com/path/?b=2&a=1",
    "http://example.com/path/?a=1&b=2",
    "https://example.com/path?b=2&a=1",
    "https://example.com/path/?a=1&b=2#fragment",
    "http://example.com/path",
    "http://example.com/path/",
    "https://example.com/path/",
    "https://example1.com/order/?checkout=true",
]

# dedup_urls keeps the first URL seen for each normalized form.
unique_urls = dedup_urls(urls)
print("Unique URLs:")
for url in unique_urls:
    print(url)

Unique URLs:
https://example.com/path/?b=2&a=1
http://example.com/path/?a=1&b=2
https://example.com/path/?a=1&b=2#fragment
http://example.com/path
https://example.com/path/
https://example1.com/order/?checkout=true


# Scalable Version

In [None]:
from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode
from pyspark import SparkContext, SparkConf

def normalize_url(url):
    """Return a canonical form of ``url`` for duplicate detection.

    Normalization steps:
      * the scheme is lower-cased;
      * the host/port part of the netloc is lower-cased, while an optional
        ``user:password@`` prefix is kept verbatim (credentials are
        case-sensitive);
      * trailing slashes are stripped from the path;
      * query parameters are parsed, sorted and re-encoded. Parameters with
        an empty value (``?a=``) are preserved, so URLs that differ only by
        such a parameter are not collapsed into one.

    The ``params`` and ``fragment`` components are passed through unchanged.

    Args:
        url: The URL string to normalize.

    Returns:
        The normalized URL as a string.
    """
    parsed_url = urlparse(url)
    scheme = parsed_url.scheme.lower()
    # Only the host (and port) is case-insensitive; leave any userinfo as-is.
    userinfo, at_sign, host_port = parsed_url.netloc.rpartition('@')
    netloc = userinfo + at_sign + host_port.lower()
    path = parsed_url.path.rstrip('/')
    # keep_blank_values=True: parse_qsl silently drops "key=" pairs otherwise.
    query_pairs = parse_qsl(parsed_url.query, keep_blank_values=True)
    query = urlencode(sorted(query_pairs))
    normalized_url = urlunparse((scheme, netloc, path, parsed_url.params, query, parsed_url.fragment))
    return normalized_url

def dedup_urls(urls):
    """Drop URLs whose normalized form has already been encountered.

    Each URL is normalized with ``normalize_url``; the first URL producing a
    given normalized form is kept (in its original spelling) and later ones
    are skipped. Input order is preserved.

    Args:
        urls: Iterable of URL strings.

    Returns:
        List of the original URL strings that were kept.
    """
    kept = []
    seen_keys = set()
    for original in urls:
        key = normalize_url(original)
        if key in seen_keys:
            continue
        seen_keys.add(key)
        kept.append(original)
    return kept

def dedup_urls_spark(urls_rdd):
    """De-duplicate an RDD of URLs, mirroring ``dedup_urls``.

    URLs are grouped by their ``normalize_url`` form. For each group the
    URL that appears first in the RDD is kept in its original spelling, and
    the survivors are returned in their original relative order.

    Args:
        urls_rdd: RDD of URL strings.

    Returns:
        List of the original URL strings that were kept, collected to the
        driver.
    """
    first_seen_rdd = (
        urls_rdd
        # Attach each URL's position so "first occurrence" is well defined.
        # Note: zipWithIndex may launch a Spark job on multi-partition RDDs.
        .zipWithIndex()
        .map(lambda url_and_pos: (normalize_url(url_and_pos[0]),
                                  (url_and_pos[1], url_and_pos[0])))
        # Tuples compare by position first, so min keeps the earliest URL.
        .reduceByKey(min)
        .values()
        # Restore input order among the kept URLs.
        .sortByKey()
        .values()
    )
    return first_seen_rdd.collect()

# Spark setup
# Use getOrCreate so this cell can be re-run in the same kernel: constructing
# SparkContext(...) directly raises if a context is already active. When an
# existing context is reused, `conf` is not applied to it.
conf = SparkConf().setAppName("URL Deduplication").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf=conf)

# Example usage with Spark
# Sample input: one path spelled with and without a trailing slash, with the
# query parameters in different orders, with a fragment, and under both http
# and https, plus one unrelated URL on a different host.
urls = [
    "https://example.com/path/?b=2&a=1",
    "http://example.com/path/?a=1&b=2",
    "https://example.com/path?b=2&a=1",
    "https://example.com/path/?a=1&b=2#fragment",
    "http://example.com/path",
    "http://example.com/path/",
    "https://example.com/path/",
    "https://example1.com/order/?checkout=true",
]

# Turn the local list into an RDD, then collect the de-duplicated result
# back to the driver as a plain Python list.
urls_rdd = sc.parallelize(urls)
unique_urls = dedup_urls_spark(urls_rdd)
print("Unique URLs:")
for url in unique_urls:
    print(url)
