In [17]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    input_file_name,
    col,
    count,
    regexp_extract,
    when,
    lit,
    countDistinct,
    regexp_replace,
    coalesce,
    sum,
)

# Start (or reuse) the Spark session for this notebook. The local
# postgresql-42.7.4 jar is registered through the "spark.jars" setting.
spark = SparkSession.builder.appName("EQ Data Processing").config(
    "spark.jars",
    "/Users/adhikram.m/personal/Study_Resources/Pyspark/postgresql-42.7.4.jar",
).getOrCreate()

# Absolute paths of the lead-list CSV exports to process. Every file lives in
# the same EQ directory, so the shared prefix is prepended to each base name.
file_names = [
    "/Users/adhikram.m/personal/Study_Resources/Pyspark/EQ/" + base_name
    for base_name in (
        "Amazon Leads CSV Outreach - Amazon Sellers - Amazon Leads CSV Outreach - Amazon Sellers.csv",
        "Brand Buddy 3K 12_02 - Fir Jeff Funnel - Brand Buddy 3K 12_02 - Fir Jeff Funnel.csv",
        "BrandBuddy More Data - Sheet1.csv",
        "CloseAmzLeadList - Prospects_2022-01-20_2116 - CloseAmzLeadList - Prospects_2022-01-20_2116.csv",
        "New List for Jeff Funnel Contacts from list 11_20 - New List for Jeff Funnel Contacts from list 11_20.csv",
        "New List for Jeff Funnel Contacts from list 11_20 (1) - New List for Jeff Funnel Contacts from list 11_20 (1).csv",
        "Outreach Lead Data - Amazon Sellers - Outreach Lead Data - Amazon Sellers.csv",
        "SkyViewPartners.xlsx - Amz Seller Directory - SkyViewPartners.xlsx - Amz Seller Directory.csv",
    )
]

# Read every lead file, reduce it to (filename, seller_id) and stack the
# results into `union_data`. When a row has no seller ID, the ID is recovered
# from the "seller=<ID>" fragment of its seller_url, if there is one.
union_data = None
for file in file_names:
    print(file)
    df_csv = (
        spark.read.option("header", "true")
        .csv(file)
        # Keep only the last path component of the source file as a label.
        .withColumn("filename", regexp_extract(input_file_name(), "[^/]+$", 0))
        # Some exports name the column "Seller ID"; this is a no-op otherwise.
        .withColumnRenamed("Seller ID", "seller_id")
    )
    if "seller_url" not in df_csv.columns:
        # Files without a URL column get an empty one so the expression below
        # can be applied uniformly.
        df_csv = df_csv.withColumn("seller_url", lit(""))

    # regexp_extract yields "" (not null) when the pattern does not match.
    # Turn that into null so rows without a recoverable ID are not counted as
    # a distinct seller "" by later aggregations.
    id_from_url = regexp_extract(col("seller_url"), "seller=([A-Z0-9]+)", 1)
    updated_df = df_csv.withColumn(
        "seller_id",
        coalesce(col("seller_id"), when(id_from_url != "", id_from_url)),
    ).select("filename", "seller_id")

    if union_data is None:
        union_data = updated_df
    else:
        # union() is positional; both sides were projected to the same two
        # columns in the same order above.
        union_data = union_data.union(updated_df)

# Per-file sanity check: total row count and number of distinct seller IDs.
# The alias must be applied to the countDistinct column itself; aliasing the
# aggregated DataFrame leaves the column with its auto-generated name.
union_data.groupBy("filename").agg(
    count("*").alias("count"),
    countDistinct("seller_id").alias("distinct_count"),
).show()
# Load the company status lookup; rows whose status is null are labelled
# "Empty" so they form their own bucket in the counts below.
status_reader = spark.read.option("header", "true")
df_table = status_reader.csv(
    "/Users/adhikram.m/personal/Study_Resources/Pyspark/EQ/company_status.csv"
)
df_table = df_table.withColumn("status", coalesce(col("status"), lit("Empty")))

# Show how many lookup rows fall under each status.
status_counts = df_table.groupBy("status").count()
status_counts.show(truncate=False)
# Attach a status to every lead row. The left join keeps leads that have no
# entry in the status table; their status is null.
joined_df = union_data.join(df_table, on="seller_id", how="left")

# One row per source file, one column per status value, cells holding the
# number of rows; combinations that never occur are reported as 0.
pivoted_df = (
    joined_df.groupBy("filename")
    .pivot("status")
    .agg(count("*"))
    .na.fill(0)
)
pivoted_df.show(truncate=False)

# Persist the row-level join as a single CSV part file and the pivot summary
# as a separate CSV dataset, both with a header row.
joined_df.repartition(1).write.option("header", True).csv(
    "/Users/adhikram.m/personal/Study_Resources/Pyspark/EQ/agg"
)
pivoted_df.write.option("header", True).csv(
    "/Users/adhikram.m/personal/Study_Resources/Pyspark/EQ/output"
)

/Users/adhikram.m/personal/Study_Resources/Pyspark/EQ/Amazon Leads CSV Outreach - Amazon Sellers - Amazon Leads CSV Outreach - Amazon Sellers.csv
/Users/adhikram.m/personal/Study_Resources/Pyspark/EQ/Brand Buddy 3K 12_02 - Fir Jeff Funnel - Brand Buddy 3K 12_02 - Fir Jeff Funnel.csv
/Users/adhikram.m/personal/Study_Resources/Pyspark/EQ/BrandBuddy More Data - Sheet1.csv
/Users/adhikram.m/personal/Study_Resources/Pyspark/EQ/CloseAmzLeadList - Prospects_2022-01-20_2116 - CloseAmzLeadList - Prospects_2022-01-20_2116.csv
/Users/adhikram.m/personal/Study_Resources/Pyspark/EQ/New List for Jeff Funnel Contacts from list 11_20 - New List for Jeff Funnel Contacts from list 11_20.csv
/Users/adhikram.m/personal/Study_Resources/Pyspark/EQ/New List for Jeff Funnel Contacts from list 11_20 (1) - New List for Jeff Funnel Contacts from list 11_20 (1).csv
/Users/adhikram.m/personal/Study_Resources/Pyspark/EQ/Outreach Lead Data - Amazon Sellers - Outreach Lead Data - Amazon Sellers.csv
/Users/adhikram.m/

                                                                                

+---------------------------------------------------------------------------------------------------------------------------------------------------------+----+----------------------------+----------------------------+-------------------+-----+-------------+---------------------------+-----------------+-----------------+-----+
|filename                                                                                                                                                 |null|Address / Phone Not Matching|Address Not Found on Website|Correct (Same Logo)|Empty|Final Correct|Fuzzy Score Below Threshold|Logo Not Matching|Manual Processing|Maybe|
+---------------------------------------------------------------------------------------------------------------------------------------------------------+----+----------------------------+----------------------------+-------------------+-----+-------------+---------------------------+-----------------+-----------------+-----+
|SkyViewPartn

                                                                                