In [23]:
# Read config from config file
from pyspark.sql import SparkSession

from pyspark.sql.functions import regexp_replace, split, udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType


# Start (or reuse) the Spark session used by this notebook
spark = (
    SparkSession.builder
    .appName("Stoploss")
    .getOrCreate()
)

# Load the stop-loss config rows; the first line of the CSV is treated as the header
configs = spark.read.option("header", "true").csv("./data/StoplossConfigs.csv")

# Strip the '_AT:1' and '_AT1' markers from site_name (in that order), then
# break the cleaned value on the "`!`!`" separator into an array column `parts`
splitted_configs = (
    configs
    .withColumn("site_name", regexp_replace(configs["site_name"], '_AT:1', ''))
    .withColumn("site_name", regexp_replace("site_name", '_AT1', ''))
    .withColumn("parts", split("site_name", "`!`!`"))
)

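# Derive four new columns from the split site_name parts:
#   adtag    - the first part made up entirely of digits ('' if none)
#   country  - the first part that is a single letter ('L' if none)
#   domain   - the first part containing a '.' ('' if none)
#   customer - the first part matching none of the above ('' if none)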
def get_adtag(parts):
    """Return the first all-digit part, or '' when there is none.

    Args:
        parts: Sequence of strings, or None. A null array column value
            reaches a Python UDF as None.

    Returns:
        The first element for which ``str.isdigit()`` is true, or '' if no
        element qualifies or ``parts`` is None or empty.
    """
    # `parts or ()` treats None like an empty list instead of raising TypeError
    for part in parts or ():
        if part.isdigit():
            return part
    return ''

def get_country(parts):
    """Return the first single-letter part, or 'L' when there is none.

    Args:
        parts: Sequence of strings, or None. A null array column value
            reaches a Python UDF as None.

    Returns:
        The first element that is exactly one alphabetic character, or the
        default 'L' if no element qualifies or ``parts`` is None or empty.
    """
    # `parts or ()` treats None like an empty list instead of raising TypeError
    for part in parts or ():
        if len(part) == 1 and part.isalpha():
            return part
    return 'L'

def get_domain(parts):
    """Return the first part containing a '.', or '' when there is none.

    Args:
        parts: Sequence of strings, or None. A null array column value
            reaches a Python UDF as None.

    Returns:
        The first element that contains a '.' character, or '' if no element
        qualifies or ``parts`` is None or empty.
    """
    # `parts or ()` treats None like an empty list instead of raising TypeError
    for part in parts or ():
        if '.' in part:
            return part
    return ''

def get_customer(parts):
    """Return the first part that is not an adtag, country or domain.

    A part counts as the customer when it is non-empty, is not all digits,
    is not a single letter, and contains no '.'.

    Args:
        parts: Sequence of strings, or None. A null array column value
            reaches a Python UDF as None.

    Returns:
        The first qualifying element, or '' if no element qualifies or
        ``parts`` is None or empty.
    """
    # `parts or ()` treats None like an empty list instead of raising TypeError
    for part in parts or ():
        # An empty token (e.g. from a leading or doubled separator) is not a
        # customer id; skip it so a real customer later in the list is found.
        if not part:
            continue
        is_adtag = part.isdigit()
        is_country = len(part) == 1 and part.isalpha()
        is_domain = '.' in part
        if not (is_adtag or is_country or is_domain):
            return part
    return ''

# Wrap each extractor as a Spark UDF that yields a string column
udf_get_adtag = udf(f=get_adtag, returnType=StringType())
udf_get_country = udf(f=get_country, returnType=StringType())
udf_get_domain = udf(f=get_domain, returnType=StringType())
udf_get_customer = udf(f=get_customer, returnType=StringType())

# Derive the four columns from `parts`, then discard the helper array column
final_configs = (
    splitted_configs
    .withColumn("adtag", udf_get_adtag("parts"))
    .withColumn("country", udf_get_country("parts"))
    .withColumn("domain", udf_get_domain("parts"))
    .withColumn("customer", udf_get_customer("parts"))
    .drop("parts")
)

# Print the resulting rows without truncating cell values
final_configs.show(truncate=False)

+------+---------+----------------+-----------+-----+-------------------+-------------------+--------+---------+---------+-------+------+---------+
|id    |site_name|property        |environment|value|creation_date      |updation_date      |admin_id|is_active|adtag    |country|domain|customer |
+------+---------+----------------+-----------+-----+-------------------+-------------------+--------+---------+---------+-------+------+---------+
|129177|8CU12LGKP|STOP_LOSS_CUTOFF|NULL       |1000 |2017-04-26 09:30:00|2023-06-27 16:03:01|17314   |1        |         |L      |      |8CU12LGKP|
|129178|285618735|STOP_LOSS_CUTOFF|NULL       |55.0 |2017-04-26 09:30:00|2022-11-22 14:19:05|15359   |1        |285618735|L      |      |         |
|129199|111826011|STOP_LOSS_CUTOFF|NULL       |13.2 |2017-04-26 12:21:00|2022-11-22 14:19:05|15359   |1        |111826011|L      |      |         |
|129418|168715112|STOP_LOSS_CUTOFF|NULL       |11.0 |2017-04-28 08:28:00|2022-11-22 14:19:05|15359   |1        |

                                                                                

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 58481)
Traceback (most recent call last):
  File "/Users/adhikram.m/.pyenv/versions/3.11.5/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Users/adhikram.m/.pyenv/versions/3.11.5/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/Users/adhikram.m/.pyenv/versions/3.11.5/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/Users/adhikram.m/.pyenv/versions/3.11.5/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/Users/adhikram.m/.pyenv/versions/3.11.5/envs/devenv/lib/python3.11/site-packages/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/Users/adhikram.m/.pyenv/versions/3.11.5/envs/devenv/lib/pyth

In [25]:
# Load the reference lists of ad tags and customers (first CSV line is the header)
adtags = spark.read.option("header", "true").csv("./data/Customer and Ad Tag List (Alpha + Beta) - Ad tag.csv")
adtags.show()
customers = spark.read.option("header", "true").csv("./data/Customer and Ad Tag List (Alpha + Beta) - Customer.csv")

# Right join: every ad tag from the reference list is kept, with config columns
# left null where no config row has a matching adtag.
adtag_matchings = final_configs.join(adtags, final_configs.adtag == adtags.AdTagId, "right")
# mode("overwrite") makes the cell re-runnable: the default save mode raises
# if the output directory already exists from a previous run.
adtag_matchings.repartition(1).write.mode("overwrite").option("header", "true").csv("./data/adtag_matchings")

# Same right join for customers, matching config `customer` against `CMId`
customers_matchings = final_configs.join(customers, final_configs.customer == customers.CMId, "right")
customers_matchings.repartition(1).write.mode("overwrite").option("header", "true").csv("./data/customers_matchings")


+--------------------+---------+
|              Ad_Tag|  AdTagId|
+--------------------+---------+
|Drugs_uid-300x250...|516804285|
|MSN_Appnexus_News...|643948221|
|MSN_Appenxus_News...|940058542|
|FoxNews_Mobile_30...|603248289|
|FoxNews_Mobile_30...|703244752|
|Foxnews_Mob_300x2...|863571015|
|Catchall_Adx_DE_D...|243818785|
|Outbrain_Mobile_3...|154561758|
|News UK_Mobile_GB...|238587860|
|Catchall_EU_DE_De...|457692309|
|MSN_Appnexus_Ente...|847244460|
|Mobile_300x250_UK...|355202542|
|FoxNews_Desktop_3...|374442364|
|Appnexus Finance_...|290788218|
|Taboola_300x250_M...|767547880|
|Ginsu_AdX_UK_NewS...|927339836|
|Fox News_Desktop_...|355124534|
|Foxnews_Desk_300x...|515371785|
|Finance_Desktop_9...|972930361|
|Fox News - EBDA_d...|263234313|
+--------------------+---------+
only showing top 20 rows



                                                                                

24/10/18 21:51:41 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 908080 ms exceeds timeout 120000 ms
24/10/18 21:51:41 WARN SparkContext: Killing executors is not supported by current scheduler.
24/10/18 21:51:44 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$