In [None]:
import re
from datetime import datetime
from pyspark.sql import Window
from pyspark.sql.types import *
from pyspark.sql.functions import lit
import pyspark.sql.functions as functions
from pyspark.sql.functions import col
from pyspark import SparkContext, SparkConf, SQLContext

In [None]:
# Start the Spark context in local mode
conf = SparkConf().setMaster('local[*]')
# getOrCreate is a classmethod and must be called on the class itself:
# instantiating SparkContext() first would create a context with default
# settings, after which `conf` is ignored (and re-running the cell would
# attempt to create a second context).
sc = SparkContext.getOrCreate(conf)
sqlc = SQLContext(sc)

In [None]:
# Function that returns a string built from a date given as an ordinal
def str_date(input_date):
    """Format an ordinal day number as a ``dd-mm-YYYY`` string.

    ``input_date`` is passed to ``datetime.fromordinal``. If the conversion
    or the formatting fails for any reason (for example a non-integer or an
    out-of-range value), the placeholder ``'01/01/1900'`` is returned.

    Note: the placeholder is written with slashes while regular results use
    dashes.
    """
    fallback = '01/01/1900'
    try:
        as_datetime = datetime.fromordinal(input_date)
        formatted = as_datetime.strftime('%d-%m-%Y')
    except Exception:
        return fallback
    return formatted

In [347]:
# Function to parse data from pageviews file

# Compiled once at import time. Written as a raw string so that `\[` and `\|`
# reach the regex engine without triggering invalid-escape warnings.
_LOG_LINE_RE = re.compile(r'(.*) - \[(.*)\] "(.*)" ([0-9]*) (.*) \| (.*): (.*) \| (.*): (http://www.facebook.com|http://google.com.br)')

_FACEBOOK_REFERER = 'http://www.facebook.com'

# Value returned for lines that cannot be parsed. It has fewer fields than a
# successful result; it is kept as-is so existing checks on position 1
# (`x[1] != -1`) keep working.
_PARSE_FAILED = ('', -1, '', -1, -1)


def _extract_int(pattern, text):
    """Return capture group 1 of the first `pattern` match in `text` as an int, or None."""
    found = re.search(pattern, text)
    if found is None:
        return None
    try:
        return int(found.group(1))
    except ValueError:
        # The patterns allow an empty digit run (e.g. "campaign_id="), which
        # means the id is missing rather than that the whole line is invalid.
        return None


def parseLog(data):
    """Read and parse one line of pageview log data.

    Parameters
    ----------
    data : str
        A single log line. Only lines whose referer is
        ``http://www.facebook.com`` or ``http://google.com.br`` match.

    Returns
    -------
    tuple
        On success, the 7-tuple ``(host, dateview, page_url, device_id,
        referer, campaign_id, ad_creative_id)`` where ``dateview`` is a
        ``datetime`` and the two ids are ``int`` or ``None`` when absent from
        the URL. ``ad_creative_id`` is always ``None`` for Facebook referers.
        On failure (no match, unparseable timestamp, non-string input), the
        sentinel ``('', -1, '', -1, -1)``.
    """
    try:
        parsed = _LOG_LINE_RE.match(data)
        if parsed is None:
            return _PARSE_FAILED
        dateview = datetime.strptime(parsed.group(2), "%Y-%m-%d %H:%M:%S")
    except (TypeError, ValueError):
        return _PARSE_FAILED

    # Groups 3, 4, 6 and 8 are matched by the pattern but not used.
    host = parsed.group(1)
    page_url = parsed.group(5)
    device_id = parsed.group(7)
    referer = parsed.group(9)

    if referer == _FACEBOOK_REFERER:
        campaign_id = _extract_int('campaign_id=?([0-9]*)', page_url)
        ad_creative_id = None
    else:
        # The pattern's last group only admits the Google referer here.
        campaign_id = _extract_int('campaign_id=([0-9]*)', page_url)
        ad_creative_id = _extract_int('ad_creative_id=([0-9]*)', page_url)

    return host, dateview, page_url, device_id, referer, campaign_id, ad_creative_id

In [None]:
# Data schema used for the Pageview DataFrame.
# Fields are listed in column order and are all nullable. Note that
# campaign_id is declared as a string while ad_creative_id is an integer.
pageview_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('host', StringType()),
        ('dateview', TimestampType()),
        ('page_url', StringType()),
        ('device_id', StringType()),
        ('referer', StringType()),
        ('campaign_id', StringType()),
        ('ad_creative_id', IntegerType()),
    )
])

In [None]:
# Facebook media costs: rename the campaign columns and append the creative
# columns (constant 0 / null) plus a 'source' tag.
facebook_ads = (
    sqlc.read.json('../data/datasets/facebook_ads_media_costs.jsonl')
    .select(
        col('clicks'),
        col('cost'),
        col('date'),
        col('facebook_campaign_id').alias('campaign_id'),
        col('facebook_campaign_name').alias('campaign_name'),
        col('impressions'),
    )
    .withColumn('ad_creative_id', lit(0))
    .withColumn('ad_creative_name', lit(None).cast(StringType()))
    .withColumn('source', lit('facebook'))
)

# Google media costs: rename the campaign columns and add a 'source' tag.
google_ads = (
    sqlc.read.json('../data/datasets/google_ads_media_costs.jsonl')
    .select(
        col('clicks'),
        col('cost'),
        col('date'),
        col('google_campaign_id').alias('campaign_id'),
        col('google_campaign_name').alias('campaign_name'),
        col('impressions'),
        col('ad_creative_id'),
        col('ad_creative_name'),
    )
    .withColumn('source', lit('google'))
)

# Merge Google and Facebook into a single DataFrame. Both frames list their
# columns in the same order, which matters because union() pairs columns by
# position, not by name.
media_ads = google_ads.union(facebook_ads)

In [None]:
# Schema for the customer leads funnel file (all fields nullable)
schema = StructType([
    StructField("device_id", StringType(), True),
    StructField("lead_id", IntegerType(), True),
    StructField("registered_at", TimestampType(), True),
    StructField("credit_decision", StringType(), True),
    StructField("credit_decision_at", TimestampType(), True),
    StructField("signed_at", TimestampType(), True),
    StructField("revenue", FloatType(), True),
])

# Customer leads DataFrame: header-less CSV read with the explicit schema above
customer_leads = sqlc.read.load(
    "../data/datasets/customer_leads_funnel.csv",
    format='com.databricks.spark.csv',
    header=False,
    schema=schema,
)

In [348]:
# Load the raw pageview log and parse it line by line. parseLog only matches
# lines whose referer is Google or Facebook and marks every other line with
# -1 in position 1, so the filter keeps just the successfully parsed rows.
pageviews_raw = sc.textFile('../data/datasets/pageview.txt')
pageviews_data = (
    pageviews_raw
    .map(parseLog)
    .filter(lambda row: row[1] != -1)
)
pageviews_data.take(20)

[('169.252.180.145',
  datetime.datetime(2018, 10, 1, 0, 0),
  'http://www.creditas.com.br/emprestimo?campaign_id=3004',
  'g7DDoCqp9V',
  'http://www.facebook.com',
  3004,
  None),
 ('172.168.121.39',
  datetime.datetime(2018, 10, 1, 0, 0, 31),
  'http://www.creditas.com.br/emprestimo',
  'g7DDoCqp9V',
  'http://www.facebook.com',
  None,
  None),
 ('172.7.71.202',
  datetime.datetime(2018, 10, 1, 0, 5),
  'http://www.creditas.com.br/conversion?campaign_id=3002',
  '51h1W7wrS8',
  'http://www.facebook.com',
  3002,
  None),
 ('192.166.174.68',
  datetime.datetime(2018, 10, 1, 0, 5, 17),
  'http://www.creditas.com.br/sobre',
  '51h1W7wrS8',
  'http://www.facebook.com',
  None,
  None),
 ('169.242.90.99',
  datetime.datetime(2018, 10, 1, 0, 5, 54),
  'http://www.bankfacil.com.br/emprestimo-com-garantia',
  '51h1W7wrS8',
  'http://www.facebook.com',
  None,
  None),
 ('203.0.120.90',
  datetime.datetime(2018, 10, 1, 0, 5, 59),
  'http://www.creditas.com.br/conversion',
  '51h1W7wrS8',
 

In [349]:
# Build the pageviews DataFrame from the parsed rows and preview the first
# 10 rows without truncating long column values.
pageviews = sqlc.createDataFrame(pageviews_data, schema=pageview_schema)
pageviews.show(n=10, truncate=False)

+---------------+-------------------+---------------------------------------------------------------------------+----------+-----------------------+-----------+--------------+
|host           |dateview           |page_url                                                                   |device_id |referer                |campaign_id|ad_creative_id|
+---------------+-------------------+---------------------------------------------------------------------------+----------+-----------------------+-----------+--------------+
|169.252.180.145|2018-10-01 00:00:00|http://www.creditas.com.br/emprestimo?campaign_id=3004                     |g7DDoCqp9V|http://www.facebook.com|3004       |null          |
|172.168.121.39 |2018-10-01 00:00:31|http://www.creditas.com.br/emprestimo                                      |g7DDoCqp9V|http://www.facebook.com|null       |null          |
|172.7.71.202   |2018-10-01 00:05:00|http://www.creditas.com.br/conversion?campaign_id=3002                     |51h1W7w

In [None]:
# Scratch check of the log-line pattern against a single sample line.
# NOTE(review): `test3` is not defined in any cell visible here; a sample log
# line must be assigned to it before this cell can run on a fresh kernel.
# The pattern is a raw string so `\[` and `\|` are not treated as (invalid)
# string escape sequences.
RE_MASK = r'(.*) - \[(.*)\] "(.*)" ([0-9]*) (.*) \| (.*): (.*) \| (.*): (http://www.facebook.com|http://google.com.br)'

re_result = re.compile(RE_MASK).match(test3)

re_result.groups()

# Group 2 holds the timestamp text; confirm it parses to a datetime.
data = datetime.strptime(re_result.group(2),"%Y-%m-%d %H:%M:%S")
print(type(data))