In [1]:
import pandas as pd
import pprint
from bs4 import BeautifulSoup
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import col
from pyspark.sql.functions import udf, struct
from pyspark.sql.types import FloatType, BooleanType, StructField, StructType, DoubleType, ArrayType
import pickle
import math
import time

import os
from dotenv import load_dotenv
load_dotenv()

AWS_ENDPOINT_URL = os.getenv('AWS_ENDPOINT_URL')
AWS_ACCESS_KEY = os.getenv('AWS_ACCESS_KEY')
AWS_SECRET_KEY = os.getenv('AWS_SECRET_KEY')

In [2]:
conf = SparkConf().setAppName("Spark com S3").setMaster("local[*]")

conf.set("spark.driver.memory", "70g")
conf.set("spark.executor.memory", "70g")
conf.set("spark.executor.pyspark.memory", "70g")

# conf.set("spark.driver.cores", "20")
# conf.set("spark.executor.cores", "20")

# conf.set("spark.memory.offHeap.enabled", "true")
# conf.set("spark.memory.offHeap.size", "20g")

# conf.set("spark.sql.shuffle.partitions", "2000")
# conf.set("spark.sql.parquet.columnarReaderBatchSize", "2048") 
conf.set("spark.sql.parquet.enableVectorizedReader", "false")
conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
conf.set("spark.sql.repl.eagerEval.enabled", "true")
conf.set("spark.sql.repl.eagerEval.truncate", 100)

conf.set("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY)
conf.set("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_KEY)
conf.set("spark.hadoop.fs.s3a.endpoint", AWS_ENDPOINT_URL)
conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
conf.set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.2")
conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")

spark = SparkSession.builder.config(conf=conf).getOrCreate()

:: loading settings :: url = jar:file:/home/darrazao/git/accounting_website_classifier/venv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/darrazao/.ivy2/cache
The jars for the packages stored in: /home/darrazao/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-14f9ab8c-d601-4a87-935e-b1b141e919dd;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.2.2 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.563 in central
:: resolution report :: resolve 171ms :: artifacts dl 7ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.563 from central in [default]
	org.apache.hadoop#hadoop-aws;3.2.2 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	----------------------------

In [None]:
df_ecomm = spark.read.parquet('../data/countries_filtered_with_predictions')

                                                                                

In [4]:
df_ecomm.printSchema()

root
 |-- domain: string (nullable = true)
 |-- html: string (nullable = true)
 |-- probability: double (nullable = true)
 |-- prediction: boolean (nullable = true)



In [5]:
df_ecomm.count() # 2.823.779

                                                                                

2823779

In [6]:
# df_ecomm = df_ecomm.drop_duplicates()
# df_ecomm.count() # 2.823.779 -> 2.823.774
# 2.823.774

In [7]:
value_counts_df = df_ecomm.groupBy('prediction').count()

In [8]:
value_counts_df.show()
# +----------+-------+
# |prediction|  count|
# +----------+-------+
# |      true| 283897|
# |     false|2539882|
# +----------+-------+

# +----------+-------+
# |prediction|  count|
# +----------+-------+
# |      true| 400261|
# |     false|2423513|
# +----------+-------+

                                                                                

+----------+-------+
|prediction|  count|
+----------+-------+
|      true| 400262|
|     false|2423517|
+----------+-------+



In [9]:
df_filtered = df_ecomm.filter(df_ecomm.prediction == True)

In [None]:
file_path = '../data/countries_filtered_true_predictions'
df_filtered.write.parquet(file_path, mode='overwrite')

24/06/26 17:01:25 WARN InternalParquetRecordWriter: Too much memory used: Store {
 [domain] optional binary domain (STRING) {
  r:0 bytes
  d:0 bytes
   data: FallbackValuesWriter{
   data: initial: DictionaryValuesWriter{
   data: initial: dict:2279
   data: initial: values:392
   data: initial:}

   data: fallback: PLAIN CapacityByteArrayOutputStream 0 slabs, 0 bytes
   data:}

   pages: ColumnChunkPageWriter ConcatenatingByteArrayCollector 0 slabs, 0 bytes
   total: 2,337/2,671
 }
 [html] optional binary html (STRING) {
  r:0 bytes
  d:0 bytes
   data: FallbackValuesWriter{
   data: initial: DictionaryValuesWriter{
   data: initial: dict:0
   data: initial: values:0
   data: initial:}

   data: fallback: PLAIN CapacityByteArrayOutputStream 87 slabs, 786,818,990 bytes
   data:}

   pages: ColumnChunkPageWriter ConcatenatingByteArrayCollector 0 slabs, 0 bytes
   total: 786,699,968/786,818,990
 }
 [prediction] optional boolean prediction {
  r:0 bytes
  d:0 bytes
   data: ByteBitPackin

In [11]:
df_filtered.show()

                                                                                

+--------------------+--------------------+------------------+----------+
|              domain|                html|       probability|prediction|
+--------------------+--------------------+------------------+----------+
|rosalianazareth.c...|\r\n\r\n\r\n<!DOC...|0.5822495765579052|      true|
| rosanessence.com.br|<!DOCTYPE html>\n...|0.5174181802464494|      true|
|rosiatacadodaling...|﻿\r\n<!DOCTYPE ht...|0.8780214533141218|      true|
|rotulooficial.com.br|\n<!DOCTYPE html>...|0.9058691906324399|      true|
|     rvmoveis.com.br|<!DOCTYPE html>\n...|0.6218398505033171|      true|
|        altai.com.br|<!doctype html>\n...|0.8942354234584148|      true|
|alternativafotopr...|<!DOCTYPE html>\n...|0.9292447633072041|      true|
|alumiareducacao.c...|<!DOCTYPE html><h...|0.8060824719290467|      true|
|aluminioglobo.com.br|  <!DOCTYPE html>...|0.5912571962045025|      true|
|     alvartes.com.br|\t<!DOCTYPE html>...|0.8233993621603585|      true|
| amarelostore.com.br|<!doctype html><

In [11]:
df_true_ecomm_smaller = df_filtered.select('domain', 'probability')

In [12]:
df_true_ecomm_smaller.count()

                                                                                

400262

In [18]:
from pyspark.sql.functions import col, max

df = df_true_ecomm_smaller.withColumn("probability", col("probability").cast("float"))

# Encontrar a probabilidade máxima para cada domínio
max_prob_df = df.groupBy("domain").agg(max(col("probability")).alias("max_probability"))

# Usar alias para os DataFrames
df_alias = df.alias("df")
max_prob_df_alias = max_prob_df.alias("max_prob_df")

# Juntar o DataFrame original com o DataFrame de probabilidades máximas
result_df = df_alias.join(
    max_prob_df_alias, 
    (df_alias["domain"] == max_prob_df_alias["domain"]) & (df_alias["probability"] == max_prob_df_alias["max_probability"])
).select(df_alias["domain"], df_alias["probability"])

In [22]:
result_df.count()

                                                                                

357297

In [21]:
result_df.show()

                                                                                

+--------------------+-----------+
|              domain|probability|
+--------------------+-----------+
|   rvicapital.com.br| 0.71908313|
|rotulooficial.com.br| 0.99449414|
|alvoradaimoveis.i...|  0.8781451|
|ruysantosimoveis....|   0.803302|
| rosanessence.com.br| 0.82560474|
|        altai.com.br| 0.99179375|
|   ruidorosa.blog.br|  0.5045915|
|rosalianazareth.c...|  0.9682496|
|rosiatacadodaling...|  0.8256773|
|    ammojoias.com.br| 0.96852535|
|     alvartes.com.br| 0.99222636|
| amarelostore.com.br|  0.9992821|
|   amodecorar.com.br| 0.75037354|
|amorelielingerie....| 0.56768394|
|aluminioglobo.com.br|  0.7200637|
|americansportshop...|  0.9993049|
|   aluguenote.com.br|  0.6484361|
|   aluguenote.com.br|  0.6484361|
|        rprai.com.br| 0.79970986|
|   altasdicas.com.br| 0.55912626|
+--------------------+-----------+
only showing top 20 rows



In [23]:
df_true_ecomm_smaller.write.mode("overwrite").save('s3a://drivalake/sites/silver/ecomm_true_predictions_simple_v2/')

                                                                                

In [24]:
spark.stop()

In [None]:
# df_true_ecomm = spark.read.parquet('./data/countries_filtered_true_predictions')

                                                                                

In [None]:
# df_true_ecomm.count()

                                                                                

283897

In [None]:
# df_true_ecomm.write.mode("overwrite").save('s3a://drivalake/sites/silver/ecomm_true_predictions/')

24/06/14 20:46:03 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


                                                                                

In [None]:
# spark.stop()