In [None]:
import os

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, rand
from pyspark.sql.functions import round as ps_round

In [None]:
conf = SparkConf().setAppName("App").setMaster("local[*]")

# Enable Arrow for pandas conversion and eager DataFrame rendering in the REPL.
conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
conf.set("spark.sql.repl.eagerEval.enabled", "true")
conf.set("spark.sql.repl.eagerEval.truncate", "100")
conf.set("spark.sql.execution.arrow.pyspark.ignore_timezone", "true")

# S3-compatible object storage connection (OVH endpoint).
# Credentials are read from the environment so they never have to be typed
# into (and accidentally committed with) the notebook. When the variables are
# unset the values fall back to the empty string.
AWS_ENDPOINT_URL = "https://s3.bhs.io.cloud.ovh.net"
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
AWS_REGION = "bhs"  # defined for reference; not passed to any Spark setting in this cell

# JDBC driver jar and the packages resolved at session start-up.
# NOTE(review): hadoop-aws generally has to match the Hadoop version shipped
# with the Spark distribution in use - confirm that 3.2.2 is the right version
# for the Spark build that delta-spark 3.1.0 runs on here.
conf.set("spark.jars", "/home/shared/drivers/postgresql-42.7.2.jar")
conf.set("spark.jars.packages", "io.delta:delta-spark_2.12:3.1.0,org.apache.hadoop:hadoop-aws:3.2.2")

# S3A filesystem: static key/secret authentication against the custom endpoint.
conf.set("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY)
conf.set("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_KEY)
conf.set("spark.hadoop.fs.s3a.endpoint", AWS_ENDPOINT_URL)
conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")

# Delta Lake catalog and SQL extensions.
conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
conf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")

# Legacy date/time parsing and Parquet datetime rebase behaviour.
conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
conf.set("spark.sql.parquet.datetimeRebaseModeInWrite", "LEGACY")

# Memory settings.
# NOTE(review): with master "local[*]" everything presumably runs inside the
# driver process, so the executor settings may have no effect - confirm that
# the host really has room for the driver heap plus the off-heap allocation.
conf.set("spark.driver.memory", "60g")
conf.set("spark.executor.memory", "60g")
conf.set("spark.executor.pyspark.memory", "60g")
conf.set("spark.memory.offHeap.enabled", "true")
conf.set("spark.memory.offHeap.size", "60g")

# Create the session with the configuration above, or reuse an already
# running one (getOrCreate).
spark = SparkSession.builder.config(conf=conf).getOrCreate()

print("Spark session configurada com sucesso!")

In [None]:
DB_HOST = "driva-db.driva.io"
DB_PORT = 5432
DB_NAME = "postgres"
# Despite the "_SCHEMA" suffix, both values below are passed to the JDBC
# "dbtable" option.
DB_ECOMM_SCHEMA = "sites.ecommerces"
DB_TECH_DRIVA = "sites.ecommerces_tech_driva"
# Credentials come from the environment (the variable names follow the
# PostgreSQL client convention) so they are never stored in the notebook.
# They fall back to the empty string when unset.
DB_USER = os.environ.get("PGUSER", "")
DB_PASSWORD = os.environ.get("PGPASSWORD", "")

JDBC_URL = f"jdbc:postgresql://{DB_HOST}:{DB_PORT}/{DB_NAME}"


def read_postgres_table(dbtable):
    """Return a lazy DataFrame over ``dbtable`` read through the PostgreSQL JDBC driver.

    Args:
        dbtable: Value for the JDBC ``dbtable`` option, e.g. ``"sites.ecommerces"``.

    Returns:
        The DataFrame produced by ``spark.read ... .load()`` using the
        connection settings defined in this cell.
    """
    return (
        spark.read.format("jdbc")
        .option("url", JDBC_URL)
        .option("dbtable", dbtable)
        .option("user", DB_USER)
        .option("password", DB_PASSWORD)
        .option("driver", "org.postgresql.Driver")
        .load()
    )


ecomm_df = read_postgres_table(DB_ECOMM_SCHEMA)
tech_df = read_postgres_table(DB_TECH_DRIVA)

In [None]:
# Print the column names and types of the e-commerce table loaded over JDBC.
ecomm_df.printSchema()

In [None]:
# Print the column names and types of the technology table loaded over JDBC.
tech_df.printSchema()

In [None]:
# Bring every distinct "tech_driva" value to the driver for inspection.
tech_df.select("tech_driva").distinct().collect()

In [None]:
# Keep only the join key, the domain and the probability from the e-commerce
# table, then inner-join it with the technology table on "host" so that only
# hosts present in both tables remain.
ecomm_columns = ["host", "dominio", "probabilidade"]
ecomm_df = ecomm_df.select(*ecomm_columns)
merged_df = ecomm_df.join(tech_df, "host", "inner")

# (row count, column count) of the joined frame.
merged_shape = (merged_df.count(), len(merged_df.columns))
print(merged_shape)

In [None]:
# Preview the joined frame (show() prints the first rows only).
merged_df.show()

In [None]:
# Collapse the frame to a single row per (dominio, tech_driva) pair and
# preview the result.
dedup_keys = ["dominio", "tech_driva"]
merged_df = merged_df.dropDuplicates(dedup_keys)
merged_df.show()

In [None]:
# Number of most frequent technologies to keep.
top_k = 6

# Counted once and reused as the denominator of every percentage.
total_rows = merged_df.count()

# Share of rows per technology, most frequent first.
# The ranking is done on the raw row count *before* rounding, with the
# technology value as tie-breaker: sorting on the 2-decimal percentage makes
# near-equal technologies tie, which leaves the top-k cut arbitrary.
top_tech = (
    merged_df.groupBy("tech_driva")
    .count()
    .sort(col("count").desc(), col("tech_driva").asc())
    .withColumn("count", ps_round((col("count") / total_rows) * 100, 2))
    .withColumnRenamed("count", "percentage (%)")
)

# Names of the top_k technologies, as a plain Python list.
top_k_tech = [
    row.tech_driva
    for row in top_tech.limit(top_k).select("tech_driva").collect()
]

# Keep only rows that belong to one of the top_k technologies.
# `top_tech` is lazy but was built from the unfiltered frame and the
# already-computed total, so its percentages stay relative to all rows.
merged_df = merged_df.filter(col("tech_driva").isin(top_k_tech))
print((merged_df.count(), len(merged_df.columns)))

In [None]:
# Display the per-technology share table ("tech_driva", "percentage (%)").
top_tech.show()

In [None]:
# Ten half-open probability buckets (min, max]: (0.0, 0.1], ..., (0.9, 1.0].
# NOTE(review): a probability of exactly 0.0 (or a null) matches no bucket -
# confirm this is intended.
intervals = [(i / 10, (i + 1) / 10) for i in range(10)]

# The denominator is computed once instead of once per bucket.
total_rows = merged_df.count()

data = []
for min_interval, max_interval in intervals:
    # One Spark job per bucket; the count is reused for the percentage.
    bucket_count = merged_df.filter(
        (col("probabilidade") > min_interval) & (col("probabilidade") <= max_interval)
    ).count()
    # Guard against an empty frame, which would otherwise raise ZeroDivisionError.
    percentage = round((bucket_count / total_rows) * 100, 2) if total_rows else 0.0
    data.append([(min_interval, max_interval), bucket_count, percentage])

# Summary table, largest bucket first.
columns = ["interval", "count", "percentage"]
dataframe = spark.createDataFrame(data, columns)
dataframe = dataframe.sort(col("percentage").desc())
dataframe.show()

In [None]:
# Rows whose probability lies in (0.3, 0.6] are labelled as e-commerce.
# The per-"dominio" dropDuplicates step is left disabled at this stage.
high_band = (col("probabilidade") > 0.3) & (col("probabilidade") <= 0.6)
high_prob_ecommerce = (
    merged_df
    .filter(high_band)
    .withColumn("predicted_as_ecommerce", lit(True))
)
high_prob_ecommerce.show()

# (row count, column count) of the labelled subset.
high_shape = (high_prob_ecommerce.count(), len(high_prob_ecommerce.columns))
print(high_shape)

In [None]:
# Rows whose probability lies in (0.1, 0.3] are labelled as not e-commerce.
# The per-"dominio" dropDuplicates step is left disabled at this stage.
low_band = (col("probabilidade") > 0.1) & (col("probabilidade") <= 0.3)
low_prob_ecommerce = (
    merged_df
    .filter(low_band)
    .withColumn("predicted_as_ecommerce", lit(False))
)
low_prob_ecommerce.show()

# (row count, column count) of the labelled subset.
low_shape = (low_prob_ecommerce.count(), len(low_prob_ecommerce.columns))
print(low_shape)

In [None]:
def filter_dataframe(df, n_samples, seed):
    """Sample roughly ``n_samples`` rows from ``df``, stratified by ``tech_driva``.

    Each technology receives a quota proportional to its share of rows in
    ``df``. That many rows are drawn without replacement from the rows of that
    technology, and the per-technology samples are unioned together.

    Args:
        df: DataFrame with a ``tech_driva`` column.
        n_samples: Target total number of rows. Quotas are floored, so the
            result can hold slightly fewer rows. A technology with fewer rows
            than its quota contributes all of its rows.
        seed: Seed passed to ``RDD.takeSample`` for every technology.

    Returns:
        A DataFrame with the same schema as ``df`` holding the sampled rows.

    Relies on the notebook-level ``spark`` session.
    """
    # NOTE(review): the previous version also collected the distinct
    # "tech_driva" values of each sample into a list named `unique_domains`
    # and removed them from a working copy of `df`. As every technology is
    # visited once, that step could not change the result and only added a
    # Spark job per iteration, so it was dropped. If the intent was to exclude
    # already-sampled *dominio* values from later technologies, that still
    # needs to be implemented - confirm with the author.
    total_rows = df.count()

    # Percentage of rows per technology, rounded to two decimals.
    tech_share = (
        df.groupBy("tech_driva")
        .count()
        .withColumn("count", ps_round((col("count") / total_rows) * 100, 2))
        .withColumnRenamed("count", "percentage (%)")
    )

    # Start from an empty frame with the input schema and append each sample.
    new_df = spark.createDataFrame(
        spark.sparkContext.emptyRDD(),
        schema=df.schema,
    )

    for tech_driva, pct in tech_share.collect():
        # pct is a percentage, so the quota is pct * n_samples / 100, floored.
        # The +1 before the floor division presumably compensates for float
        # truncation in int() (e.g. 166649.999... -> 166650).
        tech_samples = (int(pct * n_samples) + 1) // 100
        if tech_samples == 0:
            # A zero quota contributes no rows; skip the Spark job.
            continue

        # takeSample collects the sampled rows on the driver.
        sampled_rows = (
            df.filter(col("tech_driva") == tech_driva)
            .rdd.takeSample(
                withReplacement=False,
                num=tech_samples,
                seed=seed,
            )
        )
        new_df = new_df.union(
            spark.createDataFrame(sampled_rows, schema=df.schema)
        )

    return new_df

# Sampling parameters shared by both probability bands.
seed = 42
n_samples = 5000

# Stratified sample of the rows labelled as e-commerce.
high_prob_ecommerce_filtered = filter_dataframe(high_prob_ecommerce, n_samples, seed)
high_filtered_shape = (
    high_prob_ecommerce_filtered.count(),
    len(high_prob_ecommerce_filtered.columns),
)
print(high_filtered_shape)

# Stratified sample of the rows labelled as not e-commerce.
low_prob_ecommerce_filtered = filter_dataframe(low_prob_ecommerce, n_samples, seed)
low_filtered_shape = (
    low_prob_ecommerce_filtered.count(),
    len(low_prob_ecommerce_filtered.columns),
)
print(low_filtered_shape)

In [None]:
# Stack both labelled samples, keep one row per domain and shuffle the rows.
# NOTE(review): when a "dominio" appears in both samples, dropDuplicates keeps
# an arbitrary one of the rows, so the surviving label is not controlled here.
# The shuffle uses the notebook's `seed` so that the random sort key is
# reproducible instead of changing on every action over this lazy frame.
full_df = (
    high_prob_ecommerce_filtered.union(low_prob_ecommerce_filtered)
    .dropDuplicates(subset=["dominio"])
    .orderBy(rand(seed))
)

# show() prints the preview itself and returns None, so it must not be wrapped
# in print() (that printed a stray "None" after the table).
full_df.show()
print((full_df.count(), len(full_df.columns)))

In [None]:
# full_df.write.save("s3a://drivalake/trusted/sites/ecommerces/dados_treino/mid_prob_data_samples.parquet")

In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()