In [None]:
# Read config from config file
from pyspark.sql import SparkSession

from pyspark.sql.functions import regexp_replace, split, udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType


# Start (or reuse) the Spark session used throughout this notebook.
spark = SparkSession.builder.appName("Stoploss").getOrCreate()

# Load the configuration CSV; its first row holds the column names.
configs = spark.read.csv("./data/StoplossConfigs.csv", header="true")

# Strip the '_AT:1' marker first and the '_AT1' marker second from site_name.
cleaned_site_name = regexp_replace(
    regexp_replace(configs["site_name"], '_AT:1', ''),
    '_AT1',
    '',
)

# Replace site_name with the cleaned value, then break it into its
# "`!`!`"-separated pieces in a new array column called "parts".
splitted_configs = (
    configs
    .withColumn("site_name", cleaned_site_name)
    .withColumn("parts", split("site_name", "`!`!`"))
)

# Add four new columns, each derived from the entries of "parts":
#   - adtag:    a part that consists only of digits
#   - country:  a part that is a single letter
#   - domain:   a part that contains a "."
#   - customer: any part that matches none of the rules above
def get_adtag(parts):
    """Return the first all-digit entry of *parts*, or '' when there is none.

    Args:
        parts: Sequence of strings to inspect. May be None or empty; when
            used as a Spark UDF, a null array column arrives as None.

    Returns:
        The first element for which ``str.isdigit()`` is true, else ''.
    """
    # A null/empty input has nothing to scan; iterating None would raise.
    if not parts:
        return ''
    return next((part for part in parts if part.isdigit()), '')

def get_country(parts):
    """Return the first single-letter entry of *parts*, or 'L' when there is none.

    Args:
        parts: Sequence of strings to inspect. May be None or empty; when
            used as a Spark UDF, a null array column arrives as None.

    Returns:
        The first element that is exactly one alphabetic character,
        otherwise the fallback value 'L'.
    """
    # A null/empty input gets the same fallback as "no match found";
    # iterating None would raise.
    if not parts:
        return 'L'
    return next(
        (part for part in parts if len(part) == 1 and part.isalpha()),
        'L',
    )

def get_domain(parts):
    """Return the first entry of *parts* that contains a '.', or '' if none does.

    Args:
        parts: Sequence of strings to inspect. May be None or empty; when
            used as a Spark UDF, a null array column arrives as None.

    Returns:
        The first element containing a '.' character, else ''.
    """
    # A null/empty input has nothing to scan; iterating None would raise.
    if not parts:
        return ''
    return next((part for part in parts if '.' in part), '')

def get_customer(parts):
    """Return the first entry of *parts* that is not an adtag, country or domain.

    An entry is skipped when it is all digits, is a single alphabetic
    character, or contains a '.'; the first entry that is none of these is
    returned.

    Args:
        parts: Sequence of strings to inspect. May be None or empty; when
            used as a Spark UDF, a null array column arrives as None.

    Returns:
        The first element matching none of the three rules above, else ''.
    """
    # A null/empty input has nothing to scan; iterating None would raise.
    if not parts:
        return ''
    for part in parts:
        is_all_digits = part.isdigit()
        is_single_letter = len(part) == 1 and part.isalpha()
        has_dot = '.' in part
        if not (is_all_digits or is_single_letter or has_dot):
            return part
    return ''

# Wrap each extractor function as a Spark UDF that yields a string column.
udf_get_adtag, udf_get_country, udf_get_domain, udf_get_customer = (
    udf(extractor, StringType())
    for extractor in (get_adtag, get_country, get_domain, get_customer)
)

# Derive one column per UDF from "parts", in this order, then discard the
# intermediate "parts" array.
final_configs = splitted_configs
for column_name, column_udf in (
    ("adtag", udf_get_adtag),
    ("country", udf_get_country),
    ("domain", udf_get_domain),
    ("customer", udf_get_customer),
):
    final_configs = final_configs.withColumn(column_name, column_udf("parts"))
final_configs = final_configs.drop("parts")

# Print the resulting rows without truncating long cell values.
final_configs.show(truncate=False)

In [None]:
# Load the ad tag reference list (header row supplies column names) and preview it.
adtags = spark.read.csv(
    "./data/Customer and Ad Tag List (Alpha + Beta) - Ad tag.csv",
    header="true",
)
adtags.show()

# Load the customer reference list the same way.
customers = spark.read.csv(
    "./data/Customer and Ad Tag List (Alpha + Beta) - Customer.csv",
    header="true",
)

# Right join on adtag == AdTagId: every row of the ad tag list is kept, with
# config columns filled in where a config matches. Written as CSV with a
# header after repartitioning to a single partition.
adtag_matchings = final_configs.join(
    adtags,
    on=final_configs["adtag"] == adtags["AdTagId"],
    how="right",
)
adtag_matchings.repartition(1).write.csv("./data/adtag_matchings", header="true")

# Same procedure for customers, matching customer == CMId.
customers_matchings = final_configs.join(
    customers,
    on=final_configs["customer"] == customers["CMId"],
    how="right",
)
customers_matchings.repartition(1).write.csv("./data/customers_matchings", header="true")
