In [14]:
import re
from datetime import datetime
from pyspark.sql import Window
from pyspark.sql.types import *
from pyspark.sql.functions import lit
import pyspark.sql.functions as functions
from pyspark.sql.functions import col
from pyspark import SparkContext, SparkConf, SQLContext

In [3]:
# Iniciar spark context
conf = SparkConf().setMaster('local[*]')
sc = SparkContext().getOrCreate(conf)
sqlc =  SQLContext(sc)

In [None]:
#Função que retorna String a partir de uma data (ordinal)
def str_date(input_date):
        try:
            return datetime.fromordinal(input_date).strftime('%d-%m-%Y')
        except Exception as e:
            return '01/01/1900'

In [223]:
# Função para fazer parse dos dados de entrada
def parseLog(data):
        ''' Read and parse log data '''
        RE_MASK = '(.*) - \[(.*)\] "(.*)" ([0-9]*) (.*) \| (.*): (.*) \| (.*): (http://www.facebook.com|http://google.com.br)'

        try:
            re_result = re.compile(RE_MASK).match(data)
            host = re_result.group(1)
            dateview = datetime.strptime(re_result.group(2), "%Y-%m-%d %H:%M:%S").date()
            page_url = re_result.group(5)
            device_id = str(re_result.group(7))
            referer = str(re_result.group(9))
            
            try:
                if referer=="http://www.facebook.com":
                    campaign_id = re.search('campaign_id=?([0-9]*)', re_result.group(5)).group(1)
                elif referer=="http://google.com.br":
                    campaign_id = re.search('campaign_id=([0-9]*)', re_result.group(5)).group(1)
            
            except AttributeError as e:
                campaign_id = 0

            try:
                if referer=="http://www.facebook.com":
                    ad_creative_id=0
                elif referer=="http://google.com.br":
                    ad_creative_id = re.search('ad_creative_id=([0-9]*)', re_result.group(5)).group(1)
        
            except AttributeError as e:
                ad_creative_id = 0
        
            return host, dateview, page_url, device_id, referer, campaign_id, ad_creative_id
        
        except Exception as e:
            return '', -1, '', -1, -1

In [224]:
RE_MASK = '(.*) - \[(.*)\] "(.*)" ([0-9]*) (.*) \| (.*): (.*) \| (.*): (http://www.facebook.com|http://google.com.br)'
m = re.compile(RE_MASK).match(test5)

print(m.groups())

# 169.252.180.145 - [2018-10-01 00:00:00] "GET / HTTP/1.1" 200 http://www.creditas.com.br/emprestimo?campaign_id=3004 | device_id: g7DDoCqp9V | referer: http://www.facebook.com

('100.43.243.32', '2018-10-01 02:45:40', 'GET / HTTP/1.1', '200', 'http://www.creditas.com.br', 'device_id', '6TZNq4IEXL', 'referer', 'http://google.com.br')


In [227]:
test2 = '172.168.121.39 - [2018-10-01 00:00:31] "GET / HTTP/1.1" 200 http://www.creditas.com.br/emprestimo | device_id: g7DDoCqp9V | referer: http://www.facebook.com'
test3 = '198.220.64.132 - [2018-10-01 00:20:00] "GET / HTTP/1.1" 200 http://www.creditas.com.br/conversion?ad_creative_id=20005&campaign_id=1004 | device_id: etkkhNRp99 | referer: http://google.com.br'
test4 = '203.3.193.9 - [2018-10-01 02:30:07] "GET / HTTP/1.1" 200 http://www.creditas.com.br/emprestimo | device_id: 0ORagCcwX1 | referer: http://google.com.br'
test5 = '100.43.243.32 - [2018-10-01 02:45:40] "GET / HTTP/1.1" 200 http://www.creditas.com.br | device_id: 6TZNq4IEXL | referer: http://google.com.br'
test6 = '169.250.30.255 - [2018-10-01 03:00:52] "GET / HTTP/1.1" 200 http://www.bankfacil.com.br/emprestimo-com-garantia | device_id: wGN1Jw6o5S | referer: https://teixeira.com/'

result = parseLog(test6)
for i in result:
    print(i)


-1

-1
-1


In [151]:
RE_MASK = '(.*) - \[(.*)\] "(.*)" ([0-9]*) (.*) \| (.*): (.*) \| (.*): (.*)'
m = re.compile(RE_MASK).match(test4)
#campaign = re.search('campaign_id=([0-9]*)', m.group(5)).group(1)


for i in range (1,10):
    print(m.group(i))

203.3.193.9
2018-10-01 02:30:07
GET / HTTP/1.1
200
http://www.creditas.com.br/emprestimo
device_id
0ORagCcwX1
referer
http://google.com.br


In [4]:
# Declaração do Schema de dados a ser utilizado
schema = StructType([StructField('host',StringType(), True),StructField('timestamp',IntegerType(), True),StructField('request',StringType(), True),StructField('http_code',IntegerType(), True), StructField('total_bytes',IntegerType(), True)])

In [5]:
# Create dataframes - Facebook
facebook_ads = sqlc.read.json('../data/datasets/facebook_ads_media_costs.jsonl').select(col('clicks'), col('cost'), col('date'), col('facebook_campaign_id').alias('campaign_id'), col('facebook_campaign_name').alias('campaign_name'), col('impressions'))
facebook_ads= facebook_ads.withColumn('ad_creative_id', lit(0)).withColumn('ad_creative_name', lit(None).cast(StringType())).withColumn('source', lit(str('facebook')))

# Create dataframes - google
google_ads = sqlc.read.json('../data/datasets/google_ads_media_costs.jsonl').select(col('clicks'), col('cost'), col('date'), col('google_campaign_id').alias('campaign_id'), col('google_campaign_name').alias('campaign_name'), col('impressions'), col('ad_creative_id'), col('ad_creative_name'))
google_ads = google_ads.withColumn('source', lit(str('google')))

# Merge Google and Facebook into a unique Dataframe
media_ads = google_ads.union(facebook_ads)

In [6]:
google_ads.select('*').show(2, True)

+------+-----+----------+-----------+--------------------+-----------+--------------+--------------------+------+
|clicks| cost|      date|campaign_id|       campaign_name|impressions|ad_creative_id|    ad_creative_name|source|
+------+-----+----------+-----------+--------------------+-----------+--------------+--------------------+------+
|3358.5|19.02|2018-10-01|       1001|emprestimo_garant...|     157767|         20001|homem_sorrindo_fu...|google|
|2071.0|30.62|2018-10-01|       1001|emprestimo_garant...|     176313|         20002|homem_sorrindo_fu...|google|
+------+-----+----------+-----------+--------------------+-----------+--------------+--------------------+------+
only showing top 2 rows



In [7]:
# Create Schema for Customer Leads
schema = StructType([StructField("device_id", StringType(), True), StructField("lead_id", IntegerType(), True), StructField("registered_at", TimestampType(), True), StructField("credit_decision", StringType(), True), StructField("credit_decision_at", TimestampType(), True), StructField("signed_at", TimestampType(), True), StructField("revenue", FloatType(), True)])

# Create dataframe - Customer_leads
customer_leads = sqlc.read.load("../data/datasets/customer_leads_funnel.csv", header=False, format='com.databricks.spark.csv', schema=schema)

In [8]:
customer_leads.select('*').show(2, True)
customer_leads.select('*').where(col('signed_at').isNull()).show()

+----------+---------+-------------------+---------------+-------------------+-------------------+-------+
| device_id|  lead_id|      registered_at|credit_decision| credit_decision_at|          signed_at|revenue|
+----------+---------+-------------------+---------------+-------------------+-------------------+-------+
|1mJbSWeKdu|297280519|2018-10-01 01:32:37|              A|2018-10-04 07:40:37|2018-10-08 12:01:37| 215.43|
|06Of3vYqPw|507403293|2018-10-01 03:45:55|              A|2018-10-06 15:55:55|2018-10-09 09:58:55| 106.48|
+----------+---------+-------------------+---------------+-------------------+-------------------+-------+
only showing top 2 rows

+----------+---------+--------------------+---------------+--------------------+---------+-------+
| device_id|  lead_id|       registered_at|credit_decision|  credit_decision_at|signed_at|revenue|
+----------+---------+--------------------+---------------+--------------------+---------+-------+
|qna4WIdKs6|570283707| 2018-10-01 04

In [9]:
customer_leads.show(2, True)

+----------+---------+-------------------+---------------+-------------------+-------------------+-------+
| device_id|  lead_id|      registered_at|credit_decision| credit_decision_at|          signed_at|revenue|
+----------+---------+-------------------+---------------+-------------------+-------------------+-------+
|1mJbSWeKdu|297280519|2018-10-01 01:32:37|              A|2018-10-04 07:40:37|2018-10-08 12:01:37| 215.43|
|06Of3vYqPw|507403293|2018-10-01 03:45:55|              A|2018-10-06 15:55:55|2018-10-09 09:58:55| 106.48|
+----------+---------+-------------------+---------------+-------------------+-------------------+-------+
only showing top 2 rows



In [10]:
media_ads.select('*').select("*").where(col('campaign_id')==1002).show(1, True)

+------+-----+----------+-----------+--------------------+-----------+--------------+--------------------+------+
|clicks| cost|      date|campaign_id|       campaign_name|impressions|ad_creative_id|    ad_creative_name|source|
+------+-----+----------+-----------+--------------------+-----------+--------------+--------------------+------+
|4545.0|16.19|2018-10-01|       1002|creditas|auto|nat...|     101916|         20001|homem_sorrindo_fu...|google|
+------+-----+----------+-----------+--------------------+-----------+--------------+--------------------+------+
only showing top 1 row



In [12]:
media_ads.select('*').show(2, True)

+------+-----+----------+-----------+--------------------+-----------+--------------+--------------------+------+
|clicks| cost|      date|campaign_id|       campaign_name|impressions|ad_creative_id|    ad_creative_name|source|
+------+-----+----------+-----------+--------------------+-----------+--------------+--------------------+------+
|3358.5|19.02|2018-10-01|       1001|emprestimo_garant...|     157767|         20001|homem_sorrindo_fu...|google|
|2071.0|30.62|2018-10-01|       1001|emprestimo_garant...|     176313|         20002|homem_sorrindo_fu...|google|
+------+-----+----------+-----------+--------------------+-----------+--------------+--------------------+------+
only showing top 2 rows



In [13]:
# Create Structure for pageviews
pageviews = sc.textFile('../data/datasets/pageview.txt')
test = pageviews.first()
print(test)

169.252.180.145 - [2018-10-01 00:00:00] "GET / HTTP/1.1" 200 http://www.creditas.com.br/emprestimo?campaign_id=3004 | device_id: g7DDoCqp9V | referer: http://www.facebook.com


In [15]:
RE_MASK = '(.*) - \[(.*)\] "(.*)" ([0-9]*) (.*) \| (.*): (.*) \| (.*): (.*)'

m = re.compile(RE_MASK).match(test)

campaign = re.search('campaign_id=?([0-9]*)', m.group(5))

#print(campaign.group(1))

for i in range (1,10):
    print(m.group(i))

169.252.180.145
2018-10-01 00:00:00
GET / HTTP/1.1
200
http://www.creditas.com.br/emprestimo?campaign_id=3004
device_id
g7DDoCqp9V
referer
http://www.facebook.com


In [None]:
 RE_MASK = '(.*) - - \[(.*):(.*):(.*):(.*)\] "(.*)" ([0-9]*) ([0-9]*|-)'