In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, lower, trim
from pyspark.sql.types import StringType
import re

# Step 1: Create (or reuse) the Spark session for this job
spark = (
    SparkSession.builder
    .appName("CompanyNameJoin")
    .getOrCreate()
)

# Step 2: Read both input CSVs, treating the first row of each as column names
csv_reader_options = {"header": "true"}
abn_df = spark.read.options(**csv_reader_options).csv("abn_data.csv")
web_df = spark.read.options(**csv_reader_options).csv("common_crawl.csv")

# Step 3: Define UDF to extract domain name (company-like) from URL
def extract_company_name(url):
    """Return the first host label of a URL as a lower-cased, company-like name.

    An optional ``http://`` / ``https://`` scheme and a leading ``www.`` are
    removed case-insensitively; the text up to the first ``.`` or ``/`` is
    then returned in lower case.

    Args:
        url: URL or bare host name. Surrounding whitespace is ignored.

    Returns:
        The lower-cased label, or ``None`` when ``url`` is not a string or
        nothing is left once the scheme and ``www.`` prefix are removed.
    """
    # Null cells arrive as None; any non-text value has no name to extract.
    if not isinstance(url, str):
        return None

    # Strip the prefixes in separate, anchored steps so a scheme can never be
    # returned as the "name" (a single regex with optional groups can
    # backtrack and yield "https:" for inputs such as "https://").
    host = url.strip()
    host = re.sub(r'^https?://', '', host, count=1, flags=re.IGNORECASE)
    host = re.sub(r'^www\.', '', host, count=1, flags=re.IGNORECASE)

    label = re.match(r'[^./]+', host)
    return label.group(0).lower() if label else None

# Wrap the plain-Python extractor as a Spark UDF that yields a string column
extract_company_name_udf = udf(extract_company_name, returnType=StringType())

# Step 4: Derive the join key on each side
# Web side: company-like name extracted from the "url" column
url_column = col("url")
web_df = web_df.withColumn(
    "extracted_company_name",
    extract_company_name_udf(url_column),
)

# ABN side: entity name with surrounding whitespace trimmed, then lower-cased
trimmed_entity_name = trim(col("Entity Name"))
abn_df = abn_df.withColumn(
    "normalized_entity_name",
    lower(trimmed_entity_name),
)

# Step 5: Inner-join ABN rows to web rows whose derived name keys are equal
name_keys_match = (
    abn_df["normalized_entity_name"] == web_df["extracted_company_name"]
)
joined_df = abn_df.join(web_df, on=name_keys_match, how="inner")

# Step 6: Print a preview of selected columns without truncating cell values
preview_columns = [
    "ABN",
    "Entity Name",
    "Entity Type",
    "Entity Status",
    "url",
    "company_name",
    "industry",
    "Entity State",
]
joined_df.select(*preview_columns).show(truncate=False)

# Optional: write the full joined result as CSV with a header row,
# replacing any output already present at that path
overwrite_writer = joined_df.write.mode("overwrite")
overwrite_writer.csv("output.csv", header=True)
