In [None]:
import os

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql import Window, SQLContext
from pyspark.sql.types import IntegerType, StringType

import pyspark.sql.functions as F
import pandas as pd

In [None]:
# Build the configuration for a local Spark session that uses every core.
conf = SparkConf().setAppName("App").setMaster("local[*]")

# AWS S3 CONNECTION
# Credentials are read from the environment so they never have to be pasted
# into (and committed with) the notebook. Unset variables fall back to "",
# which is the value the notebook used before.
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
AWS_ENDPOINT_URL = os.environ.get("AWS_ENDPOINT_URL", "")
AWS_REGION = "bhs"

# Enable optimizations and additional settings: Arrow-backed pandas
# conversion and eager DataFrame rendering in the notebook.
conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
conf.set("spark.sql.execution.arrow.pyspark.ignore_timezone", "true")
conf.set("spark.sql.repl.eagerEval.enabled", "true")
conf.set("spark.sql.repl.eagerEval.truncate", 100)
conf.set("spark.sql.parquet.enableVectorizedReader", "false")

# Memory sizing for the driver, the executor, its Python workers and off-heap storage.
conf.set("spark.driver.memory", "30g")
conf.set("spark.executor.memory", "30g")
conf.set("spark.executor.pyspark.memory", "30g")
conf.set("spark.memory.offHeap.enabled", "true")
conf.set("spark.memory.offHeap.size", "30g")

# S3A filesystem access through the hadoop-aws package, authenticated with
# the static key pair defined above.
# conf.set("spark.jars", "/home/shared/drivers/postgresql-42.7.2.jar")
conf.set("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY)
conf.set("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_KEY)
conf.set("spark.hadoop.fs.s3a.endpoint", AWS_ENDPOINT_URL)
conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
conf.set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.2")
conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
# conf.set("spark.executor.extraJavaOptions", "-XX:+UseG1GC -XX:InitiatingHeapOccupancyPercent=35")
# conf.set("spark.driver.extraJavaOptions", "-XX:+UseG1GC -XX:InitiatingHeapOccupancyPercent=35")
# conf.set("spark.executor.extraJavaOptions", "-Djavax.net.debug=all")
# conf.set("spark.driver.extraJavaOptions", "-Djavax.net.debug=all")
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Print the configured and the actually loaded jars as a quick sanity check.
print(spark.sparkContext.getConf().get("spark.jars"))
print(spark._jsc.sc().listJars())

print("Spark session configurada com sucesso!")

In [None]:
# Load the e-commerce export, using its first line as column names.
# No schema is inferred, so every column is read as a string.
ecommerce_csv_path = "/media/greca/HD/Driva/ecommerces_202504281706.csv"
ecommerce_table_df = spark.read.csv(ecommerce_csv_path, header=True)
ecommerce_table_df.show()

In [None]:
# Load the enriched segmentation sheet with pandas and hand it over to Spark,
# keeping only the columns used downstream.
pdf = pd.read_excel("../data/Base Segmentação_iugu_250417.xlsx", sheet_name="Base enriquecida")
original_df = spark.createDataFrame(pdf)
original_df = original_df.select("cnpj", "Nome da empresa", "Nicho tech", "Segmento iugu")

# The CNPJ root is the first 8 of its 14 digits. A CNPJ stored as a number
# has lost its leading zeros, so it is left-padded back to 14 characters
# before the root is taken; otherwise the root would be shifted for those
# companies and they would never match on `raiz_cnpj`.
# NOTE(review): assumes `cnpj` holds digits only (integer or 14-char text) — confirm against the sheet.
cnpj_14_digits = F.lpad(col("cnpj").cast("string"), 14, "0")
original_df = original_df.withColumn("raiz_cnpj", F.substring(cnpj_14_digits, 1, 8))
original_df.show()

In [None]:
# Match each company to its e-commerce rows through the shared CNPJ root and
# keep one row per (root, CNPJ, company name, domain) combination.
matched_columns = ["raiz_cnpj", "cnpj", "Nome da empresa", "Nicho tech", "Segmento iugu", "host", "dominio"]
dedup_keys = ["raiz_cnpj", "cnpj", "Nome da empresa", "dominio"]

df = (
    original_df
    .join(ecommerce_table_df, on="raiz_cnpj", how="inner")
    .select(*matched_columns)
    .dropDuplicates(subset=dedup_keys)
)
df.show(40)

In [None]:
# For one specific CNPJ only the row whose domain is "suno.com.br" is kept;
# every other row of that CNPJ is dropped.
SUNO_CNPJ = 26228525000172
SUNO_DOMAIN = "suno.com.br"

# Strip surrounding whitespace first, so that both the domain comparison and
# the de-duplication below work on the cleaned values (a padded domain would
# otherwise miss the filter and survive as a near-duplicate).
df = df.withColumn("dominio", F.trim("dominio"))
df = df.withColumn("host", F.trim("host"))

is_suno = col("cnpj") == SUNO_CNPJ
only_suno = df.filter(is_suno & (col("dominio") == SUNO_DOMAIN))
df = df.filter(~is_suno)  # excluding all 'suno' rows
df = df.union(only_suno)  # ...and adding back the single chosen one
df = df.select("dominio", "host", "raiz_cnpj", "cnpj", "Nome da empresa", "Nicho tech", "Segmento iugu")
df = df.dropDuplicates(subset=["dominio", "host", "Nome da empresa"])
df.show(50)

In [None]:
# NOTE(review): disabled cell, kept for reference only. It would keep one
# chosen domain for five CNPJs and drop every row of ten CNPJs. It selects a
# `nome` column, so it cannot run as-is against a frame that only has
# "Nome da empresa".
# only_suno = df.filter((
#     (col("cnpj") == 23903417000160) & (col("dominio") == "treeunfe.com.br") |
#     (col("cnpj") == 23055665000106) & (col("dominio") == "monetizze.com.br") |
#     (col("cnpj") == 45954282000102) & (col("dominio") == "principia.net") |
#     (col("cnpj") == 49612580000167) & (col("dominio") == "kalyst.com.br") |
#     (col("cnpj") == 56228263000107) & (col("dominio") == "hardpaygateway.com.br")
# ))
# df = df.filter(
#     ~(
#         (col("cnpj") == 23903417000160) |
#         (col("cnpj") == 23055665000106) |
#         (col("cnpj") == 29524877000163) |
#         (col("cnpj") == 38733648002789) |
#         (col("cnpj") == 45954282000102) |
#         (col("cnpj") == 47220889000186) |
#         (col("cnpj") == 49612580000167) |
#         (col("cnpj") == 56228263000107) |
#         (col("cnpj") == 35550699000148) |
#         (col("cnpj") == 19609281000102)
#     )) # excluding all 'suno' rows 
# df = df.union(only_suno)
# df = df.select("dominio", "host", "raiz_cnpj", "cnpj", "nome")
# df = df.dropDuplicates(subset=["dominio", "host", "nome"])
# df.show(50)

In [None]:
# Collect the CNPJ roots that already have a matched domain onto the driver...
matched_root_rows = df.select("raiz_cnpj").distinct().collect()
companies_found = [row["raiz_cnpj"] for row in matched_root_rows]

# ...and keep the companies whose root is not among them.
has_no_match = ~col("raiz_cnpj").isin(companies_found)
remainder_domains = original_df.filter(has_no_match)
remainder_domains.show(50, truncate=False)

In [None]:
# Manually researched domains for the companies left without a match, one
# entry per row of `remainder_domains` in row order; "" means "none found".
websites_found = []

# websites_found = [
#     "viatechinfo.com.br",
#     "",
#     "",
#     "",
#     "",
#     "grupotravessia.com",
#     "cursobeta.com.br",
#     "supergeeks.com.br",
#     "wpensar.com.br",
#     "",
#     "",
#     "cursoyes.com.br",
#     "propulse.global",
#     "formei.me",
#     "fybot.io",
#     "plataforma.edibrasil.org",
#     "escolamais.com",
#     "",
#     "myprofitweb.com",
#     "astonpaybr.com",
#     "hypercash.com.br",
#     "portaldecompraspublicas.com.br",
#     "virtutech.ai",
#     "",
#     "olhonocarro.com.br",
#     "minhasupervida.com.br",
#     "anadem.org.br",
#     "yuppietech.com.br",
#     "onebox.one"
# ]


def _website_for(num_id):
    """Return the manually found domain for the 1-based row number `num_id`.

    Rows that have no entry in `websites_found` (including the case where the
    list is empty or shorter than the frame) get "", which the filter below
    treats as "no domain found" instead of failing with an IndexError.
    """
    index = num_id - 1
    if 0 <= index < len(websites_found):
        return websites_found[index]
    return ""


labels_udf = F.udf(_website_for, StringType())

# Create a column with continuous increasing Id's (1, 2, 3, ...).
# The un-partitioned window pulls all rows into a single partition.
remainder_domains = remainder_domains.withColumn("num_id", row_number().over(Window.orderBy(monotonically_increasing_id())))

# Create the new columns by calling the user defined function; host and
# domain carry the same manually found value.
remainder_domains = remainder_domains.withColumn("dominio", labels_udf("num_id"))
remainder_domains = remainder_domains.withColumn("host", col("dominio"))
remainder_domains = remainder_domains.drop("num_id")
remainder_domains = remainder_domains.filter(col("dominio") != "")

# Same columns, in the same order, as the matched frame `df`, so the two can
# be combined afterwards. (There is no `nome` column in this frame; the
# company name lives in "Nome da empresa".)
remainder_domains = remainder_domains.select("dominio", "host", "raiz_cnpj", "cnpj", "Nome da empresa", "Nicho tech", "Segmento iugu")
remainder_domains.show()

In [None]:
# Append the manually resolved companies to the automatically matched ones.
# Columns are matched by name rather than by position, so a differing column
# order raises or is aligned correctly instead of silently mixing values.
df = df.unionByName(remainder_domains)
df.show(50)

In [None]:
# df.write.parquet("../data/iugu_enrichment.parquet", mode="overwrite")

In [None]:
# Shut down the Spark session created in the configuration cell.
spark.stop()