In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import col
from pyspark.sql.functions import udf, struct
from pyspark.sql.types import FloatType, BooleanType, StructField, StructType, DoubleType, ArrayType
import pickle

import os
from dotenv import load_dotenv
load_dotenv()

# S3 connection settings, read from the process environment.
# Each value is None when the corresponding variable is not set.
AWS_ENDPOINT_URL = os.environ.get('AWS_ENDPOINT_URL')
AWS_ACCESS_KEY = os.environ.get('AWS_ACCESS_KEY')
AWS_SECRET_KEY = os.environ.get('AWS_SECRET_KEY')

In [3]:
# Fail fast when an S3 setting is missing. SparkConf.set() stringifies its value,
# so an unset variable would otherwise be stored as the literal text "None" and
# only surface later as an opaque S3A connection/authentication error.
_missing_s3_settings = [
    name
    for name, value in (
        ("AWS_ENDPOINT_URL", AWS_ENDPOINT_URL),
        ("AWS_ACCESS_KEY", AWS_ACCESS_KEY),
        ("AWS_SECRET_KEY", AWS_SECRET_KEY),
    )
    if not value
]
if _missing_s3_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_s3_settings)
    )

# Local-mode Spark application using every available core.
conf = SparkConf().setAppName("Spark com S3").setMaster("local[*]")

# Memory sizing for the JVM and for the Python workers.
conf.set("spark.driver.memory", "100g")
conf.set("spark.executor.memory", "100g")
conf.set("spark.executor.pyspark.memory", "100g")

# Tuning knobs that are currently disabled; kept for reference.
# conf.set("spark.driver.cores", "20")
# conf.set("spark.executor.cores", "20")

# conf.set("spark.memory.offHeap.enabled", "true")
# conf.set("spark.memory.offHeap.size", "20g")

# conf.set("spark.sql.shuffle.partitions", "2000")
# conf.set("spark.sql.parquet.columnarReaderBatchSize", "2048")

# Parquet reading, Arrow transfer and notebook display options.
conf.set("spark.sql.parquet.enableVectorizedReader", "false")
conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
conf.set("spark.sql.repl.eagerEval.enabled", "true")
conf.set("spark.sql.repl.eagerEval.truncate", "100")

# S3A connector: static key/secret credentials against a custom endpoint.
# NOTE(review): the hadoop-aws version presumably has to match the Hadoop
# build bundled with the installed PySpark - confirm when upgrading.
conf.set("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY)
conf.set("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_KEY)
conf.set("spark.hadoop.fs.s3a.endpoint", AWS_ENDPOINT_URL)
conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
conf.set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.2")
conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")

# Reuses an already-running session if one exists in this kernel.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

:: loading settings :: url = jar:file:/home/darrazao/git/accounting_website_classifier/venv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/darrazao/.ivy2/cache
The jars for the packages stored in: /home/darrazao/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c127a7ad-a742-4fbc-be8f-1bcf10feffd1;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.2.2 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.563 in central
:: resolution report :: resolve 171ms :: artifacts dl 6ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.563 from central in [default]
	org.apache.hadoop#hadoop-aws;3.2.2 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	----------------------------

In [4]:
# Read every parquet dataset under the bronze prefix whose name starts with
# "brazil_", then print the inferred schema.
spider_br_glob = "s3a://drivalake/sites/bronze/spiderwebv4/brazil_*"
df_spider_br = spark.read.parquet(spider_br_glob)
df_spider_br.printSchema()

24/06/13 13:17:44 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

root
 |-- url: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- redirect: string (nullable = true)
 |-- headers: string (nullable = true)
 |-- cookies: string (nullable = true)
 |-- html: string (nullable = true)
 |-- date: string (nullable = true)
 |-- status: double (nullable = true)
 |-- content_type: string (nullable = true)
 |-- content_length: string (nullable = true)
 |-- error: string (nullable = true)



In [5]:
# Peek at the `status` column of the raw crawl records.
df_spider_br.select(col('status')).show()

[Stage 2:>    (0 + 1) / 1][Stage 3:>    (0 + 1) / 1][Stage 4:>    (0 + 1) / 1]  

+------+
|status|
+------+
| 200.0|
|  NULL|
|  NULL|
|  NULL|
| 404.0|
| 200.0|
| 200.0|
|  NULL|
|  NULL|
|  NULL|
|  NULL|
|  NULL|
|  NULL|
| 200.0|
|  NULL|
|  NULL|
|  NULL|
| 200.0|
| 200.0|
| 200.0|
+------+
only showing top 20 rows



In [8]:
# Keep rows with status 200, a non-empty html payload ('' and '[]' are treated
# as empty) and a domain ending in '.br'; then reduce to distinct
# (domain, html) pairs.
# NOTE: this overwrites df_spider_br, so re-running the cell on its own fails
# (the 'status' column no longer exists) - re-run the read cell first.
df_spider_br = (
    df_spider_br
    .select('domain', 'html', 'status')
    .withColumn('html', col('html').cast('string'))
    .filter(
        (col('status') == 200.0)
        & (col('html') != '[]')
        & (col('html') != '')
        & (col('domain').endswith('.br'))
    )
    .select('domain', 'html')
    .dropDuplicates()
)

In [9]:
# Count the rows that survive the filter and de-duplication above.
df_spider_br.count()
# Recorded result: 181,486 rows
# Recorded wall time (presumably from the same run): 2m 18.6s


                                                                                

181486

# Model helper functions

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import unicodedata
from bs4 import BeautifulSoup
import re

# Fetch the NLTK data packages this notebook relies on (a no-op when they are
# already present locally).
for _nltk_package in ("stopwords", "punkt", "wordnet"):
    nltk.download(_nltk_package)

# Lazily filled cache: the stop-word lists and the lemmatizer do not depend on
# the document, so they are built on first use and reused by later calls
# instead of being rebuilt for every document.
_TOKENIZER_RESOURCES = {}


def _tokenizer_resources():
    """Return the ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _TOKENIZER_RESOURCES:
        stop_words = set(stopwords.words('portuguese')) | set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        # Store both only after both were built, so a failure leaves the cache
        # empty and the next call simply retries.
        _TOKENIZER_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _TOKENIZER_RESOURCES["stop_words"], _TOKENIZER_RESOURCES["lemmatizer"]


def custom_tokenizer(html_text):
    """Turn an HTML document into a list of normalised, lemmatised tokens.

    Steps: extract the text of ``<body>``, strip accents (NFKD -> ASCII),
    lowercase, replace every non-letter with a space, collapse whitespace,
    tokenize, drop Portuguese/English stop words and tokens of length <= 2,
    then lemmatise with WordNet.

    NOTE(review): this is presumably the tokenizer the pickled vectorizer was
    fitted with, so the token output must stay exactly as it is - confirm
    before changing any processing step.

    Args:
        html_text: Raw HTML markup of one page.

    Returns:
        list[str]: The processed tokens. An empty list is returned when any
        step raises; the error is printed rather than propagated (best effort).
    """
    try:
        stop_words, lemmatizer = _tokenizer_resources()

        # Use only the <body> of the HTML; pages without one yield no text.
        soup = BeautifulSoup(html_text, "html.parser")
        text = soup.body.get_text() if soup.body else ""

        # Normalise: decompose accented characters and drop the non-ASCII parts.
        preprocessed_text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("utf-8")

        # Lowercase.
        preprocessed_text = preprocessed_text.lower()

        # Replace everything that is not a letter or whitespace with a space.
        preprocessed_text = re.sub(r"[^a-zA-Z\s]", " ", preprocessed_text)

        # Collapse repeated whitespace.
        preprocessed_text = re.sub(r"\s+", " ", preprocessed_text).strip()

        # Tokenize.
        tokens = nltk.word_tokenize(preprocessed_text)

        # Remove stop words and very short tokens.
        tokens = [
            token for token in tokens if token not in stop_words and len(token) > 2
        ]

        # Apply the lemmatizer.
        return [lemmatizer.lemmatize(token) for token in tokens]

    except Exception as e:
        # Best effort: one bad document must not abort the whole job.
        print(e)
        print("Failed on custom_tokenizer, passing...")
        return []

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/darrazao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/darrazao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/darrazao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
def predict_proba_with_domain(X_raw: list, estimator, vectorizer, threshold: float = 0.5):
    """Vectorize raw documents and return hard predictions plus class probabilities.

    Args:
        X_raw: Raw documents, passed as-is to ``vectorizer.transform``.
        estimator: Fitted model exposing ``predict_proba``; each returned row is
            indexed as ``row[0]`` (class 0) and ``row[1]`` (class 1).
        vectorizer: Fitted vectorizer exposing ``transform``.
        threshold: Minimum class-1 probability (inclusive) for a positive
            prediction. Defaults to 0.5, the previously hard-coded value.

    Returns:
        tuple: ``(y_preds, y_probs_0, y_probs_1)`` - three lists aligned with
        ``X_raw``: the 0/1 predictions, the class-0 probabilities and the
        class-1 probabilities.
    """
    features = vectorizer.transform(X_raw)
    # Materialise once so the rows can be traversed for both probability columns.
    probability_rows = list(estimator.predict_proba(features))

    y_probs_0 = [row[0] for row in probability_rows]
    y_probs_1 = [row[1] for row in probability_rows]
    y_preds = [1 if prob_1 >= threshold else 0 for prob_1 in y_probs_1]

    return y_preds, y_probs_0, y_probs_1

# Load model and process!

In [None]:
# Load the pickled model and the pickled vectorizer.
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load artifacts that come from a trusted training run.
# NOTE(review): the vectorizer presumably references custom_tokenizer by name,
# so that function has to be defined in this kernel before loading - confirm.
# `with` guarantees the files are closed even if unpickling raises.
with open('../models/BEST_MODEL_ecomm_tfidf_vectorizer_ecomm_logistic_regression_lbfgs_lemmatizer_3_True_42_1000_ecomm_spiderwebv4_dataset_html.pkl', "rb") as serialized_model:
    model = pickle.load(serialized_model)
with open('../models/BEST_VECTORIZER_ecomm_tfidf_vectorizer_ecomm_logistic_regression_lbfgs_lemmatizer_3_True_42_1000_ecomm_spiderwebv4_dataset_html.pkl', "rb") as serialized_vectorizer:
    vectorizer = pickle.load(serialized_vectorizer)

# Broadcast model to spark executors.
# NOTE(review): the Broadcast handles returned here are discarded, and
# `predictor` below reads the plain `model` / `vectorizer` globals, so the
# objects presumably reach the Python workers through the UDF's pickled closure
# rather than through these broadcasts. Switching to `<handle>.value` needs
# care: verify that the vectorizer (and any notebook-defined tokenizer it
# references) can be unpickled on the workers before relying on it.
spark.sparkContext.broadcast(model)
spark.sparkContext.broadcast(vectorizer)


def predictor(html):
    """Score a single HTML document.

    Args:
        html: Raw HTML markup of one page.

    Returns:
        tuple[float, bool]: The class-1 probability and the corresponding
        prediction, matching the fields of ``result_schema``.
    """
    y_preds, _, y_probs_1 = predict_proba_with_domain([html], model, vectorizer)
    # Cast to builtin types so the values fit DoubleType / BooleanType.
    return (float(y_probs_1[0]), bool(y_preds[0]))


# Struct returned by the UDF: one probability and one boolean prediction.
result_schema = StructType([
    StructField("probability", DoubleType()),
    StructField("prediction", BooleanType())
])

# Register the Python function as a Spark UDF.
udf_predictor = udf(predictor, result_schema)

In [17]:
# Add a `results` struct column by applying the prediction UDF to each page's html.
df_with_predictions = df_spider_br.withColumn('results', udf_predictor(df_spider_br['html']))

In [18]:
# Promote the `probability` and `prediction` fields of the `results` struct to
# top-level columns, then drop the struct itself.
df_with_predictions = (
    df_with_predictions
    .withColumn("probability", col("results").getField("probability"))
    .withColumn("prediction", col("results").getField("prediction"))
    .drop('results')
)

In [19]:
# Write the scored pages as parquet, replacing any previous output at this path.
# Recorded run: 9min, 181486 rows.
df_with_predictions.write.mode('overwrite').parquet('./data/ecomm/brazil_filtered_with_predictions')

Current mem limits: -1 of max -1                                 (0 + 40) / 200]
Current mem limits: -1 of max -1


Setting mem limits to 107374182400 of max 107374182400

Setting mem limits to 107374182400 of max 107374182400

Current mem limits: -1 of max -1

Setting mem limits to 107374182400 of max 107374182400

Current mem limits: -1 of max -1

Setting mem limits to 107374182400 of max 107374182400

Current mem limits: -1 of max -1

Setting mem limits to 107374182400 of max 107374182400

Current mem limits: -1 of max -1

Setting mem limits to 107374182400 of max 107374182400

Current mem limits: -1 of max -1

Setting mem limits to 107374182400 of max 107374182400

Current mem limits: -1 of max -1

Setting mem limits to 107374182400 of max 107374182400

Current mem limits: -1 of max -1

Setting mem limits to 107374182400 of max 107374182400

Current mem limits: -1 of max -1

Setting mem limits to 107374182400 of max 107374182400

Current mem limits: -1 of max -1

Setting mem limits

In [20]:
# Shut down the Spark session.
# NOTE(review): the cells further down still use df_with_predictions, which
# needs a live session - presumably they were run before this cell; confirm.
spark.stop()

In [None]:
# Deliberate stop for "Run All": the Spark session was shut down in the
# previous cell, so the exploratory cells below cannot run after it. This
# replaces a reference to an undefined name, which halted execution only by
# way of an unexplained NameError.
raise RuntimeError("Stopping here: the cells below are exploratory and need an active Spark session.")

In [None]:
# Show the resulting DataFrame (show() prints only the first rows).
df_with_predictions.show()

In [None]:
# Keep only the rows whose boolean "prediction" column is True
# (rows with a null prediction are dropped as well).
is_positive = col("prediction")
df_prediction_true = df_with_predictions.where(is_positive)

# Show the resulting DataFrame.
df_prediction_true.show()

In [None]:
# Number of pages predicted as positive.
df_prediction_true.count()
# Presumably the result recorded from an earlier run: 4560

In [None]:
# df_spider_br.write.parquet('./data/spider_br/brazil_filtered.parquet', mode='error')
# df_test = spark.read.parquet('./data/spider_br/brazil_filtered.parquet')
# df_spider_br = spark.read.parquet('./data/spider_br/brazil_filtered.parquet')