In [40]:
import re
from datetime import datetime
from pyspark.sql import Window
from pyspark.sql.types import *
from pyspark.sql.functions import lit
import pyspark.sql.functions as functions
from pyspark.sql.functions import col
from pyspark import SparkContext, SparkConf, SQLContext

In [2]:
# Start the Spark context (local mode, using every available core).
conf = SparkConf().setMaster('local[*]')
# getOrCreate is a classmethod and must be called on the class itself.
# `SparkContext().getOrCreate(conf)` first constructs a context with default
# settings (so `conf` is never applied) and raises if a context is already
# active, which makes the cell fail when it is re-run.
sc = SparkContext.getOrCreate(conf)
sqlc = SQLContext(sc)

In [3]:
# Return a date string built from a proleptic Gregorian ordinal.
def str_date(input_date):
    """Format an ordinal day number as a 'DD-MM-YYYY' string.

    Args:
        input_date: Integer ordinal as accepted by ``datetime.fromordinal``.

    Returns:
        The formatted date, or the sentinel ``'01-01-1900'`` when the ordinal
        cannot be converted (wrong type, out of range, too large).
    """
    try:
        return datetime.fromordinal(input_date).strftime('%d-%m-%Y')
    except (TypeError, ValueError, OverflowError):
        # The sentinel uses the same dash-separated layout as the normal
        # path so every value returned by this function shares one format.
        return '01-01-1900'

In [4]:
# Parse one raw access-log line into typed fields.
def parseLog(data):
    """Read and parse a single access-log line.

    Args:
        data: One log line as a string.

    Returns:
        A tuple ``(host, ord_day, request, reply_code, reply_bytes)`` where
        ``ord_day`` is the ordinal of the request date. ``reply_bytes`` is 0
        when the size field is not a number (for example ``-``). When the
        line cannot be parsed the sentinel ``('', -1, '', -1, -1)`` is
        returned instead of raising.
    """
    # Raw string so `\[` and `\]` reach the regex engine unchanged without
    # triggering Python's invalid-escape-sequence warning.
    RE_MASK = r'(.*) - - \[(.*):(.*):(.*):(.*)\] "(.*)" ([0-9]*) ([0-9]*|-)'
    PARSE_FAILED = ('', -1, '', -1, -1)

    try:
        re_result = re.match(RE_MASK, data)
    except TypeError:
        # `data` is not a string (e.g. None or bytes).
        return PARSE_FAILED

    if re_result is None:
        # The line does not follow the expected layout.
        return PARSE_FAILED

    try:
        host = re_result.group(1)
        ord_day = datetime.strptime(re_result.group(2), '%d/%b/%Y').toordinal()
        req = re_result.group(6)
        reply_code = int(re_result.group(7))
    except ValueError:
        # Malformed date or empty status code.
        return PARSE_FAILED

    try:
        reply_bytes = int(re_result.group(8))
    except ValueError:
        # The size field is '-' or empty: treat it as zero bytes.
        reply_bytes = 0

    return host, ord_day, req, reply_code, reply_bytes

In [5]:
# Declaration of the data schema used for the parsed log records.
# Every field is nullable; one StructField per line for readability.
schema = StructType([
    StructField('host', StringType(), True),
    StructField('timestamp', IntegerType(), True),
    StructField('request', StringType(), True),
    StructField('http_code', IntegerType(), True),
    StructField('total_bytes', IntegerType(), True),
])

In [8]:
# Display the kernel's current working directory (returned as bytes).
# Presumably a sanity check for the relative '../data/...' paths read by
# the data-loading cells; the printed value is machine-specific.
import os
os.getcwdb()

b'/Users/denniscardoso/Projects/cred_test/creditas_engineer/notebook'

In [82]:
# Facebook media costs: rename the campaign columns to the shared names and
# add the creative columns that only exist on the Google side, plus a
# constant `source` tag.
facebook_ads = (
    sqlc.read.json('../data/datasets/facebook_ads_media_costs.jsonl')
    .select(
        col('clicks'),
        col('cost'),
        col('date'),
        col('facebook_campaign_id').alias('campaign_id'),
        col('facebook_campaign_name').alias('campaign_name'),
        col('impressions'),
    )
    .withColumn('ad_creative_id', lit(0))
    .withColumn('ad_creative_name', lit(None).cast(StringType()))
    .withColumn('source', lit('facebook'))
)

# Google media costs: same shared column names, creative columns come from
# the file itself, plus a constant `source` tag.
google_ads = (
    sqlc.read.json('../data/datasets/google_ads_media_costs.jsonl')
    .select(
        col('clicks'),
        col('cost'),
        col('date'),
        col('google_campaign_id').alias('campaign_id'),
        col('google_campaign_name').alias('campaign_name'),
        col('impressions'),
        col('ad_creative_id'),
        col('ad_creative_name'),
    )
    .withColumn('source', lit('google'))
)

# Merge Google and Facebook into a single DataFrame.
# `union` matches columns by position, not by name, so both frames above
# must keep their columns in the same order.
media_ads = google_ads.union(facebook_ads)

In [52]:
# Preview the first two Google rows (long cell values are truncated).
google_ads.show(n=2, truncate=True)

+------+-----+----------+-----------+--------------------+-----------+--------------+--------------------+------+
|clicks| cost|      date|campaign_id|       campaign_name|impressions|ad_creative_id|    ad_creative_name|source|
+------+-----+----------+-----------+--------------------+-----------+--------------+--------------------+------+
|3358.5|19.02|2018-10-01|       1001|emprestimo_garant...|     157767|         20001|homem_sorrindo_fu...|google|
|2071.0|30.62|2018-10-01|       1001|emprestimo_garant...|     176313|         20002|homem_sorrindo_fu...|google|
+------+-----+----------+-----------+--------------------+-----------+--------------+--------------------+------+
only showing top 2 rows



In [89]:
# Schema for the customer leads funnel file (all fields nullable).
# Note: this rebinds the module-level name `schema`.
schema = StructType([
    StructField("device_id", StringType(), True),
    StructField("lead_id", IntegerType(), True),
    StructField("registered_at", TimestampType(), True),
    StructField("credit_decision", StringType(), True),
    StructField("credit_decision_at", TimestampType(), True),
    StructField("signed_at", TimestampType(), True),
    StructField("revenue", FloatType(), True),
])

# Load the customer leads CSV (no header row) using the explicit schema.
customer_leads = sqlc.read.load(
    "../data/datasets/customer_leads_funnel.csv",
    format='com.databricks.spark.csv',
    schema=schema,
    header=False,
)

In [91]:
# Preview the first two leads, then print the column types.
customer_leads.show(n=2, truncate=True)
customer_leads.printSchema()

+----------+---------+-------------------+---------------+-------------------+-------------------+-------+
| device_id|  lead_id|      registered_at|credit_decision| credit_decision_at|          signed_at|revenue|
+----------+---------+-------------------+---------------+-------------------+-------------------+-------+
|1mJbSWeKdu|297280519|2018-10-01 01:32:37|              A|2018-10-04 07:40:37|2018-10-08 12:01:37| 215.43|
|06Of3vYqPw|507403293|2018-10-01 03:45:55|              A|2018-10-06 15:55:55|2018-10-09 09:58:55| 106.48|
+----------+---------+-------------------+---------------+-------------------+-------------------+-------+
only showing top 2 rows

root
 |-- device_id: string (nullable = true)
 |-- lead_id: integer (nullable = true)
 |-- registered_at: timestamp (nullable = true)
 |-- credit_decision: string (nullable = true)
 |-- credit_decision_at: timestamp (nullable = true)
 |-- signed_at: timestamp (nullable = true)
 |-- revenue: float (nullable = true)



In [32]:
# Preview the first two leads (long cell values are truncated).
customer_leads.show(n=2, truncate=True)

+----------+---------+-------------------+---+-------------------+-------------------+------+
|       _c0|      _c1|                _c2|_c3|                _c4|                _c5|   _c6|
+----------+---------+-------------------+---+-------------------+-------------------+------+
|1mJbSWeKdu|297280519|2018-10-01 01:32:37|  A|2018-10-04 07:40:37|2018-10-08 12:01:37|215.43|
|06Of3vYqPw|507403293|2018-10-01 03:45:55|  A|2018-10-06 15:55:55|2018-10-09 09:58:55|106.48|
+----------+---------+-------------------+---+-------------------+-------------------+------+
only showing top 2 rows



In [78]:
# Show one merged media row belonging to campaign 1002.
media_ads.filter(col('campaign_id') == 1002).show(n=1, truncate=True)

+------+-----+----------+-----------+--------------------+-----------+--------------+--------------------+------+
|clicks| cost|      date|campaign_id|       campaign_name|impressions|ad_creative_id|    ad_creative_name|source|
+------+-----+----------+-----------+--------------------+-----------+--------------+--------------------+------+
|4545.0|16.19|2018-10-01|       1002|creditas|auto|nat...|     101916|         20001|homem_sorrindo_fu...|google|
+------+-----+----------+-----------+--------------------+-----------+--------------+--------------------+------+
only showing top 1 row



In [85]:
# Print the column names and types of the customer leads DataFrame.
# NOTE(review): the output recorded under this cell lists _c0.._c6 with a
# double last column, which does not match the named schema declared in the
# load cell; it looks like it came from an earlier run. Re-run to refresh.
customer_leads.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- _c2: timestamp (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: timestamp (nullable = true)
 |-- _c5: timestamp (nullable = true)
 |-- _c6: double (nullable = true)

