In [1]:
import findspark
findspark.init() 

In [2]:
import os
import time
import datetime
import pyspark.sql.functions as sf
from uuid import *
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import when
from pyspark.sql.functions import col
from pyspark.sql.types import *
from pyspark.sql.functions import lit
from pyspark import SparkConf, SparkContext
from uuid import * 
from uuid import UUID
import time_uuid 
from pyspark.sql import Row
from pyspark.sql.functions import udf
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.window import Window as W

In [3]:
# Build (or reuse) the Spark session; the Cassandra connector is requested
# through the spark.jars.packages setting.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", 'com.datastax.spark:spark-cassandra-connector_2.12:3.1.0')
    .getOrCreate()
)

In [4]:
# Load the `tracking` table of the `keyspace_de` keyspace from Cassandra.
df = (
    spark.read
    .format("org.apache.spark.sql.cassandra")
    .options(keyspace="keyspace_de", table="tracking")
    .load()
)

In [5]:
# List the distinct custom_track values present in the tracking table.
df.select('custom_track').distinct().show()

+-------------------+
|       custom_track|
+-------------------+
|         conversion|
|               null|
|interview_scheduled|
|              click|
|        unqualified|
|              alive|
|           redirect|
|          qualified|
|         superpixel|
+-------------------+



In [6]:
# Narrow df to the seven columns that the later processing selects.
# NOTE: this rebinds `df`, so the full-width frame is no longer reachable by name.
df = df.select('create_time','bid','campaign_id','custom_track','group_id','job_id','publisher_id')

In [7]:
def process_timeuuid(df):
    """Add a readable ``ts`` column decoded from the ``create_time`` TimeUUID.

    The conversion runs as a Spark UDF, so rows are decoded where they live
    instead of being collected to the driver, rebuilt into a second DataFrame
    and joined back. Compared with the collect-and-join approach this:

    * works on an empty input (no schema inference on an empty list),
    * yields ``ts = None`` for a null ``create_time`` instead of raising,
    * never multiplies rows when the same ``create_time`` occurs twice.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Must contain ``create_time`` (a TimeUUID rendered as a string) and the
        columns ``bid``, ``campaign_id``, ``custom_track``, ``group_id``,
        ``job_id`` and ``publisher_id``.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``create_time, ts, bid, campaign_id, custom_track, group_id,
        job_id, publisher_id``; ``ts`` is a ``'%Y-%m-%d %H:%M:%S'`` string.
    """
    def _timeuuid_to_ts(raw_uuid):
        # Null-safe: a missing create_time simply has no timestamp.
        if raw_uuid is None:
            return None
        moment = time_uuid.TimeUUID(bytes=UUID(raw_uuid).bytes).get_datetime()
        return moment.strftime('%Y-%m-%d %H:%M:%S')

    # NOTE: the UDF runs in the Python workers, so `time_uuid` must be
    # importable there as well (always true in local mode).
    to_ts = udf(_timeuuid_to_ts, StringType())

    return (
        df.withColumn('ts', to_ts(col('create_time')))
        .select('create_time', 'ts', 'bid', 'campaign_id', 'custom_track',
                'group_id', 'job_id', 'publisher_id')
    )

In [8]:
# Decode create_time into a readable `ts` column for the selected tracking columns.
processed_df = process_timeuuid(df)

In [9]:
# Preview the decoded rows (show() prints the first 20 rows by default).
processed_df.show()

+--------------------+-------------------+----+-----------+------------+--------+------+------------+
|         create_time|                 ts| bid|campaign_id|custom_track|group_id|job_id|publisher_id|
+--------------------+-------------------+----+-----------+------------+--------+------+------------+
|756bdd00-0d63-11e...|2022-07-27 04:20:34|   0|        222|        null|    null|  1532|           1|
|3fd449c0-0b5b-11e...|2022-07-24 14:16:45|   1|         48|        null|      34|   188|           1|
|3f796600-0bff-11e...|2022-07-25 09:50:42|   0|        222|        null|    null|  1533|           1|
|3cdd1b90-02d0-11e...|2022-07-13 17:21:31|null|       null|        null|    null|  null|        null|
|6140b730-0668-11e...|2022-07-18 07:08:09|null|       null|       click|    null|  null|        null|
|a0b45bd0-0c8d-11e...|2022-07-26 02:49:54|null|       null|       alive|    null|  null|        null|
|e1a01b60-0847-11e...|2022-07-20 16:20:33|null|       null|       click|    null| 

In [10]:
def calculating_clicks(processed_df):
    """Aggregate ``click`` events per job/group/campaign/publisher, date and hour.

    Rows whose ``custom_track`` equals ``'click'`` are kept, nulls in ``bid``
    and in the four id columns are replaced with 0, and the result is
    registered as the temp view ``clicks_data`` (side effect) before the
    hourly aggregation is run through Spark SQL.

    Returns a DataFrame with the grouping columns plus ``spend_hour``
    (sum of bid), ``bid_set`` (average bid rounded to 2 decimals) and
    ``clicks`` (event count).
    """
    zero_defaults = {'bid': 0, 'publisher_id': 0, 'job_id': 0, 'group_id': 0, 'campaign_id': 0}
    click_events = processed_df.filter(processed_df['custom_track'] == 'click').na.fill(zero_defaults)
    click_events.createOrReplaceTempView('clicks_data')
    hourly_query = """ with cte1 as (select create_time,bid,job_id,group_id,campaign_id,publisher_id,date(ts) as dates,hour(ts) as hours from clicks_data) 
    select job_id,group_id,campaign_id,publisher_id,dates,hours,sum(bid) as spend_hour, round(avg(bid),2) as bid_set,count(create_time) as clicks 
    from cte1 
    group by publisher_id,job_id,group_id,campaign_id,dates,hours"""
    return spark.sql(hourly_query)

In [11]:
def calculating_conversion(processed_df):
    """Aggregate ``conversion`` events per job/group/campaign/publisher, date and hour.

    Rows whose ``custom_track`` equals ``'conversion'`` are kept, nulls in
    ``bid`` and in the four id columns are replaced with 0, and the result is
    registered as the temp view ``conversion_data`` (side effect) before the
    hourly aggregation is run through Spark SQL.

    Returns a DataFrame with the grouping columns plus ``spend_hour``
    (sum of bid), ``bid_set`` (average bid rounded to 2 decimals) and
    ``conversion`` (event count).
    """
    zero_defaults = {'bid': 0, 'publisher_id': 0, 'job_id': 0, 'group_id': 0, 'campaign_id': 0}
    conversion_events = processed_df.filter(processed_df['custom_track'] == 'conversion').na.fill(zero_defaults)
    conversion_events.createOrReplaceTempView('conversion_data')
    hourly_query = """ with cte1 as (select create_time,bid,job_id,group_id,campaign_id,publisher_id,date(ts) as dates,hour(ts) as hours from conversion_data) 
    select job_id,group_id,campaign_id,publisher_id,dates,hours,sum(bid) as spend_hour, round(avg(bid),2) as bid_set,count(create_time) as conversion 
    from cte1 
    group by publisher_id,job_id,group_id,campaign_id,dates,hours"""
    return spark.sql(hourly_query)

In [12]:
def calculating_qualified(processed_df):
    """Aggregate ``qualified`` events per job/group/campaign/publisher, date and hour.

    Rows whose ``custom_track`` equals ``'qualified'`` are kept, nulls in
    ``bid`` and in the four id columns are replaced with 0, and the result is
    registered as the temp view ``qualified_data`` (side effect) before the
    hourly aggregation is run through Spark SQL.

    Returns a DataFrame with the grouping columns plus ``spend_hour``
    (sum of bid), ``bid_set`` (average bid rounded to 2 decimals) and
    ``qualified_application`` (event count).
    """
    zero_defaults = {'bid': 0, 'publisher_id': 0, 'job_id': 0, 'group_id': 0, 'campaign_id': 0}
    qualified_events = processed_df.filter(processed_df['custom_track'] == 'qualified').na.fill(zero_defaults)
    qualified_events.createOrReplaceTempView('qualified_data')
    hourly_query = """ with cte1 as (select create_time,bid,job_id,group_id,campaign_id,publisher_id,date(ts) as dates,hour(ts) as hours from qualified_data) 
    select job_id,group_id,campaign_id,publisher_id,dates,hours,sum(bid) as spend_hour, round(avg(bid),2) as bid_set,count(create_time) as qualified_application 
    from cte1 
    group by publisher_id,job_id,group_id,campaign_id,dates,hours"""
    return spark.sql(hourly_query)

In [13]:
def calculating_unqualified(processed_df):
    """Aggregate ``unqualified`` events per job/group/campaign/publisher, date and hour.

    Rows whose ``custom_track`` equals ``'unqualified'`` are kept, nulls in
    ``bid`` and in the four id columns are replaced with 0, and the result is
    registered as the temp view ``unqualified_data`` (side effect) before the
    hourly aggregation is run through Spark SQL.

    Returns a DataFrame with the grouping columns plus ``spend_hour``
    (sum of bid), ``bid_set`` (average bid rounded to 2 decimals) and
    ``disqualified_application`` (event count).
    """
    # Must match 'unqualified': filtering on 'qualified' here would make
    # disqualified_application a copy of qualified_application.
    unqualified_data = processed_df.filter(processed_df.custom_track == 'unqualified')
    unqualified_data = unqualified_data.na.fill(
        {'bid': 0, 'publisher_id': 0, 'job_id': 0, 'group_id': 0, 'campaign_id': 0}
    )
    unqualified_data.createOrReplaceTempView('unqualified_data')
    unqualified_output = spark.sql(""" with cte1 as (select create_time,bid,job_id,group_id,campaign_id,publisher_id,date(ts) as dates,hour(ts) as hours from unqualified_data) 
    select job_id,group_id,campaign_id,publisher_id,dates,hours,sum(bid) as spend_hour, round(avg(bid),2) as bid_set,count(create_time) as disqualified_application 
    from cte1 
    group by publisher_id,job_id,group_id,campaign_id,dates,hours""")
    return unqualified_output

In [14]:
def processing_cassandra_output(processed_df):
    """Combine the four per-event hourly aggregates into one wide frame.

    The click, conversion, qualified and unqualified aggregates are full-outer
    joined on ``job_id, group_id, campaign_id, publisher_id, dates, hours``.
    ``spend_hour`` and ``bid_set`` are taken from the click aggregate only;
    the other three aggregates contribute just their count column, so a key
    with no click rows has null ``spend_hour``/``bid_set``/``clicks``.

    Returns a DataFrame with the six key columns followed by ``spend_hour``,
    ``bid_set``, ``clicks``, ``conversion``, ``qualified_application`` and
    ``disqualified_application``.
    """
    join_keys = ['job_id', 'group_id', 'campaign_id', 'publisher_id', 'dates', 'hours']

    clicks_output = calculating_clicks(processed_df)
    # Project the other aggregates down to keys + count before joining, so
    # their own spend_hour / bid_set never reach the joined frame.
    conversion_counts = calculating_conversion(processed_df).select(*join_keys, 'conversion')
    qualified_counts = calculating_qualified(processed_df).select(*join_keys, 'qualified_application')
    unqualified_counts = calculating_unqualified(processed_df).select(*join_keys, 'disqualified_application')

    return (
        clicks_output
        .join(conversion_counts, join_keys, 'full')
        .join(qualified_counts, join_keys, 'full')
        .join(unqualified_counts, join_keys, 'full')
    )


In [15]:
# Build the combined hourly aggregate for all four event types.
cassandra_output = processing_cassandra_output(processed_df)

In [16]:
# Preview the combined aggregate (show() prints the first 20 rows by default).
cassandra_output.show()

+------+--------+-----------+------------+----------+-----+----------+-------+------+----------+---------------------+------------------------+
|job_id|group_id|campaign_id|publisher_id|     dates|hours|spend_hour|bid_set|clicks|conversion|qualified_application|disqualified_application|
+------+--------+-----------+------------+----------+-----+----------+-------+------+----------+---------------------+------------------------+
|     0|       0|          0|           0|2022-07-06|    9|         0|    0.0|     1|      null|                 null|                    null|
|     0|       0|          0|           0|2022-07-06|   15|      null|   null|  null|         2|                 null|                    null|
|     0|       0|          0|           0|2022-07-07|    2|         0|    0.0|     3|      null|                 null|                    null|
|     0|       0|          0|           0|2022-07-07|    3|         0|    0.0|     2|      null|                 null|                  

In [17]:
# Inspect the column types of the combined aggregate.
cassandra_output.printSchema()

root
 |-- job_id: integer (nullable = true)
 |-- group_id: integer (nullable = true)
 |-- campaign_id: integer (nullable = true)
 |-- publisher_id: integer (nullable = true)
 |-- dates: date (nullable = true)
 |-- hours: integer (nullable = true)
 |-- spend_hour: long (nullable = true)
 |-- bid_set: double (nullable = true)
 |-- clicks: long (nullable = true)
 |-- conversion: long (nullable = true)
 |-- qualified_application: long (nullable = true)
 |-- disqualified_application: long (nullable = true)



In [18]:
def processing_remain_column(cassandra_output):
    """Add the remaining warehouse columns and attach the company id.

    Steps:

    1. add constant columns ``impressions = 0`` and ``sources = 'Cassandra'``;
    2. read the ``company`` table from MySQL over JDBC;
    3. inner-join on ``publisher_id`` (rows without a matching company are
       dropped by the inner join);
    4. replace nulls in ``disqualified_application``,
       ``qualified_application`` and ``conversion`` with 0;
    5. rename the company table's ``id`` to ``company_id`` and add a constant
       ``id = 0`` column.

    Connection settings can be overridden with the environment variables
    ``MYSQL_HOST``, ``MYSQL_PORT``, ``MYSQL_DATABASE``, ``MYSQL_USER`` and
    ``MYSQL_PASSWORD``. The fallbacks are the previous hard-coded local
    development values; set the variables rather than editing credentials
    into the notebook.

    Parameters
    ----------
    cassandra_output : pyspark.sql.DataFrame
        Hourly aggregate containing ``publisher_id`` and the metric columns.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``job_id, dates, hours, disqualified_application,
        qualified_application, conversion, company_id, group_id, campaign_id,
        publisher_id, bid_set, clicks, impressions, spend_hour, sources, id``.
    """
    cassandra_output = (
        cassandra_output
        .withColumn('impressions', lit(0))
        .withColumn('sources', lit('Cassandra'))
    )

    # Read data from MySQL (settings from the environment, local defaults otherwise).
    host = os.environ.get('MYSQL_HOST', 'localhost')
    port = os.environ.get('MYSQL_PORT', '3306')
    db_name = os.environ.get('MYSQL_DATABASE', 'Data_Warehouse')
    user = os.environ.get('MYSQL_USER', 'root')
    password = os.environ.get('MYSQL_PASSWORD', '1')
    url = 'jdbc:mysql://' + host + ':' + port + '/' + db_name
    driver = "com.mysql.cj.jdbc.Driver"
    sql = """(select * from company) a"""
    mysql_df_company = (
        spark.read.format('jdbc')
        .options(url=url, driver=driver, dbtable=sql, user=user, password=password)
        .load()
    )

    # Drop the company-side publisher_id so the name stays unambiguous.
    cassandra_output_offcial = cassandra_output.join(
        mysql_df_company,
        cassandra_output.publisher_id == mysql_df_company.publisher_id,
        'inner',
    ).drop(mysql_df_company.publisher_id)

    cassandra_output_offcial = cassandra_output_offcial.select(
        'job_id', 'dates', 'hours', 'disqualified_application',
        'qualified_application', 'conversion', 'id', 'group_id', 'campaign_id',
        'publisher_id', 'bid_set', 'clicks', 'impressions', 'spend_hour', 'sources',
    )
    cassandra_output_offcial = cassandra_output_offcial.na.fill(
        {'disqualified_application': 0, 'qualified_application': 0, 'conversion': 0}
    )
    # `id` at this point is the company table's key; free the name for the
    # constant id column added last.
    cassandra_output_offcial = cassandra_output_offcial.withColumnRenamed("id", "company_id")
    cassandra_output_offcial = cassandra_output_offcial.withColumn('id', lit(0))
    return cassandra_output_offcial


In [19]:
# Attach company ids and the remaining constant columns to the aggregate.
cassandra_output_offcial = processing_remain_column(cassandra_output)

In [20]:
# Preview the final frame (show() prints the first 20 rows by default).
cassandra_output_offcial.show()

+------+----------+-----+------------------------+---------------------+----------+----------+--------+-----------+------------+-------+------+-----------+----------+---------+---+
|job_id|     dates|hours|disqualified_application|qualified_application|conversion|company_id|group_id|campaign_id|publisher_id|bid_set|clicks|impressions|spend_hour|  sources| id|
+------+----------+-----+------------------------+---------------------+----------+----------+--------+-----------+------------+-------+------+-----------+----------+---------+---+
|  1534|2022-07-27|    4|                       0|                    0|         0|        59|       0|        222|           1|    0.0|    11|          0|         0|Cassandra|  0|
|  1534|2022-07-26|    7|                       0|                    0|         0|        59|       0|        222|           1|    0.0|     5|          0|         0|Cassandra|  0|
|  1534|2022-07-26|    3|                       0|                    0|         0|        59| 

In [21]:
# Number of rows in the final frame.
cassandra_output_offcial.count()

62

In [74]:
def load_to_datawarehouse(cassandra_output_offcial):
    """Append the given DataFrame to the MySQL ``events`` table over JDBC.

    Connection settings can be overridden with the environment variables
    ``MYSQL_HOST``, ``MYSQL_PORT``, ``MYSQL_DATABASE``, ``MYSQL_USER`` and
    ``MYSQL_PASSWORD``. The fallbacks are the previous hard-coded local
    development values; set the variables rather than editing credentials
    into the notebook.

    Parameters
    ----------
    cassandra_output_offcial : pyspark.sql.DataFrame
        Frame to write; it is appended (``mode('append')``), so calling this
        twice writes the rows twice.

    Returns
    -------
    None
    """
    host = os.environ.get('MYSQL_HOST', 'localhost')
    port = os.environ.get('MYSQL_PORT', '3306')
    db_name = os.environ.get('MYSQL_DATABASE', 'Data_Warehouse')
    user = os.environ.get('MYSQL_USER', 'root')
    password = os.environ.get('MYSQL_PASSWORD', '1')
    url = 'jdbc:mysql://' + host + ':' + port + '/' + db_name
    driver = "com.mysql.cj.jdbc.Driver"
    (
        cassandra_output_offcial.write.format('jdbc')
        .option('url', url)
        .option('driver', driver)
        .option('dbtable', 'events')
        .option('user', user)
        .option('password', password)
        .mode('append')
        .save()
    )

In [None]:
# Append the final frame to the warehouse. load_to_datawarehouse has no return
# statement, so load_to_DW is always None.
load_to_DW = load_to_datawarehouse(cassandra_output_offcial)