In [16]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import col
from pyspark.sql.functions import udf, struct
from pyspark.sql.types import FloatType, BooleanType, StructField, StructType, DoubleType, ArrayType
from pyspark.sql.functions import lit

import pickle
import math
import time
import re
import pandas as pd
import numpy as np

import os
from dotenv import load_dotenv
# Load settings from a .env file into the process environment (python-dotenv).
load_dotenv()

# S3 connection settings read from the environment.
# os.getenv returns None for any variable that is not set.
AWS_ENDPOINT_URL = os.getenv('AWS_ENDPOINT_URL')
AWS_ACCESS_KEY = os.getenv('AWS_ACCESS_KEY')
AWS_SECRET_KEY = os.getenv('AWS_SECRET_KEY')

In [17]:
# Fail fast when the S3 settings are missing: SparkConf.set() stringifies its
# value, so an unset variable would otherwise be stored as the literal "None"
# and only surface much later as an opaque S3 endpoint/authentication error.
_missing_s3_settings = [
    name
    for name, value in (
        ("AWS_ENDPOINT_URL", AWS_ENDPOINT_URL),
        ("AWS_ACCESS_KEY", AWS_ACCESS_KEY),
        ("AWS_SECRET_KEY", AWS_SECRET_KEY),
    )
    if not value
]
if _missing_s3_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_s3_settings)
    )

# Local-mode Spark session configured to read from an S3-compatible store.
conf = SparkConf().setAppName("Spark com S3").setMaster("local[*]")

# Memory settings.
# NOTE(review): with a local[*] master the executor settings are presumably
# redundant with the driver ones — confirm before tuning them separately.
conf.set("spark.driver.memory", "70g")
conf.set("spark.executor.memory", "70g")
conf.set("spark.executor.pyspark.memory", "70g")

# conf.set("spark.driver.cores", "20")
# conf.set("spark.executor.cores", "20")

# conf.set("spark.memory.offHeap.enabled", "true")
# conf.set("spark.memory.offHeap.size", "20g")

# conf.set("spark.sql.shuffle.partitions", "2000")
# conf.set("spark.sql.parquet.columnarReaderBatchSize", "2048")

# Parquet reading, Arrow interchange and notebook display options.
conf.set("spark.sql.parquet.enableVectorizedReader", "false")
conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
conf.set("spark.sql.repl.eagerEval.enabled", "true")
conf.set("spark.sql.repl.eagerEval.truncate", "100")

# S3A connector: static credentials and a custom endpoint taken from the environment.
# NOTE(review): the hadoop-aws version should match the Hadoop version bundled
# with the installed Spark distribution — confirm.
conf.set("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY)
conf.set("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_KEY)
conf.set("spark.hadoop.fs.s3a.endpoint", AWS_ENDPOINT_URL)
conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
conf.set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.2")
conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")

spark = SparkSession.builder.config(conf=conf).getOrCreate()

24/06/24 20:01:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [18]:
# import logging
# logger = spark._jvm.org.apache.log4j
# logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
# logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
# logging.getLogger("py4j").setLevel(logging.ERROR)

In [19]:
import boto3
# Initialize the boto3 client used to list the objects under the S3 prefix.
# NOTE(review): the endpoint is hard-coded here while the Spark configuration
# reads AWS_ENDPOINT_URL from the environment, and no credentials are passed
# explicitly to boto3 — confirm both clients target the same store and account.
s3 = boto3.client('s3', endpoint_url='https://s3.bhs.io.cloud.ovh.net')
bucket_name = 'drivalake'
prefix = 'sites/bronze/spiderwebv4/countries_'

def list_s3_files(bucket, prefix):
    """Return the keys of every object in ``bucket`` whose key starts with ``prefix``.

    Pages through ``list_objects_v2`` on the module-level ``s3`` client,
    following ``NextContinuationToken`` until a response no longer carries one.

    Args:
        bucket: Name of the bucket to list.
        prefix: Key prefix used to restrict the listing.

    Returns:
        A list with the ``Key`` of each listed object, in response order.
    """
    keys = []
    request = {'Bucket': bucket, 'Prefix': prefix}
    while True:
        page = s3.list_objects_v2(**request)
        # A page may come back without 'Contents'; treat that as an empty page.
        keys.extend(item['Key'] for item in page.get('Contents', []))
        if 'NextContinuationToken' not in page:
            return keys
        request['ContinuationToken'] = page['NextContinuationToken']

# List the key of every object under the configured bucket/prefix.
files = list_s3_files(bucket_name, prefix)

# Model helper functions

In [20]:
def check_integrity(domains, htmls):
    """Clean paired domain/HTML inputs before feature extraction.

    Entries are paired positionally and then filtered in three steps, each
    reporting how many entries *that step* removed:

    1. entries whose domain or HTML is missing (``None`` or a float NaN);
    2. entries whose HTML is empty (``''`` or ``'[]'`` once converted to
       ``str``) or whose domain is not a string ending in ``'.br'``;
    3. exact duplicate ``(domain, html)`` pairs (the first occurrence is kept).

    Args:
        domains: Sequence of domain names.
        htmls: Sequence of HTML payloads, aligned with ``domains``.

    Returns:
        A ``(domains, htmls)`` tuple of lists holding the surviving entries in
        their original relative order; HTML values are converted to ``str``.

    Raises:
        Exception: If anything fails while checking the inputs.
    """
    def _is_missing(value):
        # NaN is the only float that differs from itself.
        return value is None or (isinstance(value, float) and value != value)

    try:
        pairs = list(zip(domains, htmls))

        # Drop missing values first: converting None to str would otherwise
        # turn it into the non-empty text 'None', and a missing domain would
        # crash on .endswith().
        non_null = [(domain, str(html)) for domain, html in pairs
                    if not _is_missing(domain) and not _is_missing(html)]
        nulls = len(pairs) - len(non_null)
        if nulls > 0:
            print(f"WARNING: dataset has {nulls} entries with null values. Removing those entries.")

        # Drop empty HTML and domains outside '.br'.
        valid = [(domain, html) for domain, html in non_null
                 if html != '[]' and html != ''
                 and isinstance(domain, str) and domain.endswith('.br')]
        invalid = len(non_null) - len(valid)
        if invalid > 0:
            print(f"WARNING: dataset has {invalid} entries with empty HTML and/or does not end with '.br'. Removing those entries.")

        # Drop duplicates; dict.fromkeys keeps first-seen order, unlike set().
        unique = list(dict.fromkeys(valid))
        duplicates = len(valid) - len(unique)
        if duplicates > 0:
            print(f"WARNING: dataset has {duplicates} entries with duplicate values. Removing those entries.")

        filtered_domains = [domain for domain, _ in unique]
        filtered_htmls = [html for _, html in unique]
        return filtered_domains, filtered_htmls
    except Exception as e:
        raise Exception('Failed in integrity check.\nError:\n' + str(e))

In [21]:
def build_lemmatizer_pt_dict():
    """Build the Portuguese ``inflected form -> lemma`` lookup table.

    The lemmatization list is downloaded to ``lemmatization-pt.txt`` in the
    current working directory when that file does not exist yet. Every line
    with exactly two whitespace-separated fields is read as ``lemma form`` and
    stored as ``result[form] = lemma``; other lines are ignored.

    The parsed dictionary is memoized on the function object, so repeated calls
    in the same process return the same dict without touching the file again.
    Callers must treat the returned dict as read-only.

    Returns:
        dict mapping each inflected form to its lemma.

    Raises:
        Exception: If the download or the parsing fails.
    """
    try:
        import os

        cached = getattr(build_lemmatizer_pt_dict, '_cache', None)
        if cached is not None:
            return cached

        url = "https://github.com/michmech/lemmatization-lists/raw/master/lemmatization-pt.txt"
        file_name = "lemmatization-pt.txt"

        if not os.path.exists(file_name):
            # Imported lazily: only needed when the file is not on disk yet.
            import tempfile
            import requests

            response = requests.get(url, timeout=60)
            # Never cache an HTTP error page as if it were the word list.
            response.raise_for_status()

            # Write to a temporary file and rename it into place so that a
            # concurrent reader never sees a partially written file.
            target_dir = os.path.dirname(os.path.abspath(file_name))
            fd, tmp_name = tempfile.mkstemp(prefix=file_name + '.', suffix='.part', dir=target_dir)
            try:
                with os.fdopen(fd, 'wb') as f:
                    f.write(response.content)
                os.replace(tmp_name, file_name)
            except BaseException:
                if os.path.exists(tmp_name):
                    os.remove(tmp_name)
                raise

        lemmatizer_pt_dict = {}
        # 'utf-8-sig' decodes UTF-8 regardless of the platform default and
        # drops a leading byte-order mark if the file has one.
        with open(file_name, 'r', encoding='utf-8-sig') as dic:
            for line in dic:
                txt = line.split()
                if len(txt) == 2:
                    lemmatizer_pt_dict[txt[1]] = txt[0]

        build_lemmatizer_pt_dict._cache = lemmatizer_pt_dict
        return lemmatizer_pt_dict
    except Exception as e:
        raise Exception('An error occurred on build_lemmatizer_pt_dict.\nError:\n' + str(e)) from e

In [22]:
def custom_lemmatizer(tokens, lemmatizer_pt_dict):
    """Lemmatize ``tokens`` using the Portuguese lookup first, WordNet otherwise.

    Args:
        tokens: Iterable of token strings.
        lemmatizer_pt_dict: Mapping from inflected form to lemma.

    Returns:
        A list with one entry per token: the mapped value when the token is a
        key of ``lemmatizer_pt_dict``, else ``WordNetLemmatizer().lemmatize(token)``.

    Raises:
        Exception: If the import or the lemmatization fails.
    """
    try:
        from nltk.stem.wordnet import WordNetLemmatizer

        wordnet = WordNetLemmatizer()
        return [
            lemmatizer_pt_dict.get(word)
            if word in lemmatizer_pt_dict.keys()
            else wordnet.lemmatize(word)
            for word in tokens
        ]
    except Exception as e:
        raise Exception('An error occurred on custom_lemmatizer.\nError:\n' + str(e))

In [23]:
def process_html_for_vectorizer(html_text, lemmatizer_pt_dict):
    """Turn an HTML document into the list of lemmatized tokens fed to the vectorizer.

    Steps: extract the ``<body>`` text, strip accents (NFKD + ASCII), replace
    every character that is not a letter or whitespace with a space, split
    camelCase / ALLCAPS runs into separate words, lowercase, tokenize, drop
    Portuguese and English stop words and tokens of two characters or fewer,
    then lemmatize with ``custom_lemmatizer``.

    Args:
        html_text: Raw HTML markup.
        lemmatizer_pt_dict: Mapping from inflected form to lemma, passed through
            to ``custom_lemmatizer``.

    Returns:
        List of lemmatized tokens (empty when the document has no ``<body>``).

    Raises:
        Exception: If any processing step fails.
    """
    import re
    import unicodedata

    import nltk
    from bs4 import BeautifulSoup

    try:
        # Resources are fetched and the stop-word set is built once per process
        # instead of on every call (this function runs once per row in the UDF).
        stop_words = _nltk_stop_words()

        soup = BeautifulSoup(html_text, 'html.parser')
        text = soup.body.get_text() if soup.body else ''

        # Decompose accented characters and drop the combining marks.
        preprocessed_text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')

        preprocessed_text = re.sub(r'\s+', ' ', preprocessed_text).strip()
        preprocessed_text = re.sub(r"[^a-zA-Z\s]", " ", preprocessed_text)

        # Insert a space before each ALLCAPS run, Capitalized word or lowercase
        # run, so that e.g. "camelCase" becomes "camel Case".
        pattern = re.compile(r'([A-Z]+(?![a-z])|[A-Z][a-z]*|[a-z]+)')
        preprocessed_text = pattern.sub(r' \1', preprocessed_text)

        preprocessed_text = preprocessed_text.lower()
        preprocessed_text = re.sub(r"\s+", " ", preprocessed_text).strip()

        # NOTE(review): recent NLTK releases may require the 'punkt_tab'
        # resource for word_tokenize — confirm against the installed version.
        tokens = nltk.word_tokenize(preprocessed_text)
        tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
        tokens = custom_lemmatizer(tokens, lemmatizer_pt_dict)

        return tokens
    except Exception as e:
        raise Exception('An error occurred while processing HTMLs for vectorizer.\nError:\n' + str(e)) from e


def _nltk_stop_words():
    """Return the Portuguese+English stop-word set, preparing NLTK data once per process.

    The state is keyed by process id so that a copy of this function carried
    into another process still performs its own one-time setup there.
    """
    import os

    import nltk
    from nltk.corpus import stopwords

    state = _nltk_stop_words.__dict__
    pid = os.getpid()
    if state.get('pid') != pid:
        for resource in ('stopwords', 'punkt', 'wordnet'):
            nltk.download(resource, quiet=True)
        state['stop_words'] = set(stopwords.words('portuguese')).union(set(stopwords.words('english')))
        # Only mark the setup as done once everything above succeeded.
        state['pid'] = pid
    return state['stop_words']

In [24]:
def process_html_for_how_many_prices(text):
    """Count the currency markers (``$`` or ``R$``) found in ``text``.

    Args:
        text: String to scan.

    Returns:
        Number of non-overlapping matches of ``$`` / ``R$``.

    Raises:
        Exception: If the scan fails (for example when ``text`` is not a string).
    """
    import re
    try:
        return len(re.findall(r'\$|R\$', text))
    except Exception as e:
        raise Exception('An error occurred while processing HTMLs for prices.\nError:\n' + str(e))

In [25]:
def only_number(text):
    """Return ``text`` with every non-digit character removed."""
    return re.sub(r'[^\d]', '', text)

def remove_invalid_company(company_id):
    """Return the company id when it is 14 characters long, else ``None``.

    Any run of 13 identical consecutive digits is removed before the length
    check, so ids made of a single repeated digit are rejected.
    """
    stripped = re.sub(r'(\d)\1{12}', '', company_id)
    return stripped if len(stripped) == 14 else None

def order_by_common(data):
    """Return the distinct items of ``data`` ordered from most to least frequent."""
    from collections import Counter
    return [item for item, _count in Counter(data).most_common()]

def extract_and_process_cnpjs(text):
    """Extract CNPJ-formatted numbers from ``text`` as digit-only strings.

    Matches ``NN.NNN.NNN/NNNN-NN`` (a space is also accepted in place of ``/``
    or ``-``), strips the punctuation with ``only_number`` and keeps the values
    accepted by ``remove_invalid_company``.

    Returns:
        List of accepted 14-digit strings, in order of appearance (duplicates kept).
    """
    cnpj_pattern = re.compile(r'\d{2}\.\d{3}\.\d{3}[\/ ]\d{4}[- ]\d{2}')
    candidates = (
        remove_invalid_company(only_number(raw_match))
        for raw_match in cnpj_pattern.findall(text)
    )
    return [cnpj for cnpj in candidates if cnpj]

In [26]:
def get_html_body(html_str):
    """Return the text of the document's ``<body>``, or ``''`` when it has none.

    Raises:
        Exception: If parsing or text extraction fails.
    """
    from bs4 import BeautifulSoup
    try:
        body = BeautifulSoup(html_str, 'html.parser').body
        if not body:
            return ''
        return body.get_text()
    except Exception as e:
        raise Exception('An error occurred while trying to get HTML body.\nError:\n' + str(e))

In [27]:
def generate_features(domains, htmls):
    """Build the per-page feature lists for a batch of ``(domain, html)`` pairs.

    The inputs are first cleaned with ``check_integrity``; every list in the
    result is aligned with the cleaned ``domains``.

    Returns:
        dict with keys ``domains``, ``htmls``, ``tokens``, ``processed_cnpjs``,
        ``has_cnpj``, ``count_prices`` and ``has_prices``.

    Raises:
        Exception: If any step of the feature generation fails.
    """
    try:
        domains, htmls = check_integrity(domains, htmls)
        lemma_lookup = build_lemmatizer_pt_dict()

        body_texts = list(map(get_html_body, htmls))
        token_lists = [process_html_for_vectorizer(raw_html, lemma_lookup) for raw_html in htmls]

        # CNPJs are searched in the raw markup, prices in the body text only.
        cnpjs_per_page = list(map(extract_and_process_cnpjs, htmls))
        price_counts = list(map(process_html_for_how_many_prices, body_texts))

        # NOTE(review): has_prices requires MORE than one price marker in the
        # body text here, while generate_features_without_pandas uses "> 0" on
        # the raw HTML — confirm which rule the model was trained with.
        return {
            'domains': domains,
            'htmls': htmls,
            'tokens': token_lists,
            'processed_cnpjs': cnpjs_per_page,
            'has_cnpj': [bool(found) for found in cnpjs_per_page],
            'count_prices': price_counts,
            'has_prices': [marker_count > 1 for marker_count in price_counts],
        }
    except Exception as e:
        raise Exception('An error occurred while generating features.\nError:\n' + str(e))

In [28]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack

def generate_features_without_pandas(domains, HTML_raw, vectorizer):
    """Compute tokens and boolean flags for each ``(domain, html)`` pair.

    Args:
        domains: Sequence of domains; only used to pair up with ``HTML_raw``.
        HTML_raw: Sequence of raw HTML strings.
        vectorizer: Accepted for interface compatibility; not used here.

    Returns:
        One dict per pair with keys ``tokens``, ``has_cnpj`` and ``has_prices``.

    Raises:
        Exception: If any step of the feature generation fails.
    """
    try:
        lemma_lookup = build_lemmatizer_pt_dict()

        def describe(raw_html):
            # Features of a single page, all computed from the raw markup.
            page_tokens = process_html_for_vectorizer(raw_html, lemma_lookup)
            page_cnpjs = extract_and_process_cnpjs(raw_html)
            price_markers = process_html_for_how_many_prices(raw_html)
            return {
                'tokens': page_tokens,
                'has_cnpj': bool(page_cnpjs),
                'has_prices': price_markers > 0,
            }

        # zip() stops at the shorter of the two inputs.
        return [describe(raw_html) for _domain, raw_html in zip(domains, HTML_raw)]

    except Exception as e:
        raise Exception('An error occurred while generating features without Pandas.\nError:\n' + str(e))

def predict_proba_with_domain(domains, HTML_raw, estimator, vectorizer):
    """Score each ``(domain, html)`` pair with the estimator.

    The model input is a DataFrame whose first two columns are ``has_cnpj`` and
    ``has_prices``, followed by one dense column per vectorizer feature.

    Args:
        domains: Sequence of domains, aligned with ``HTML_raw``.
        HTML_raw: Sequence of raw HTML strings.
        estimator: Object exposing ``predict_proba``.
        vectorizer: Object exposing ``transform`` and ``get_feature_names_out``.

    Returns:
        ``(y_preds, y_probs_0, y_probs_1)``: the 0/1 predictions (1 when the
        column-1 probability is at least 0.5) and probability columns 0 and 1.

    Raises:
        Exception: If feature generation, vectorization or prediction fails.
    """
    try:
        rows = generate_features_without_pandas(domains, HTML_raw, vectorizer)

        # The vectorizer receives each page as a single space-joined string.
        documents = [' '.join(row['tokens']) for row in rows]
        tfidf = vectorizer.transform(documents)

        flags_df = pd.DataFrame({
            'has_cnpj': [row['has_cnpj'] for row in rows],
            'has_prices': [row['has_prices'] for row in rows],
        })
        tfidf_df = pd.DataFrame(tfidf.toarray(), columns=vectorizer.get_feature_names_out())

        # Flags first, then the TF-IDF columns.
        model_input = pd.concat([flags_df, tfidf_df], axis=1)
        probabilities = estimator.predict_proba(model_input)

        negative_probs = probabilities[:, 0]
        positive_probs = probabilities[:, 1]
        labels = (positive_probs >= 0.5).astype(int)

        return labels, negative_probs, positive_probs

    except Exception as e:
        raise Exception('An error occurred while predicting with domain.\nError:\n' + str(e))


# Load model and process!

In [None]:
from pyspark.sql.functions import col, lit, udf
from pyspark.sql.types import StructType, StructField, DoubleType, BooleanType
import numpy as np
import pickle

# Pre-load the model and the vectorizer on the driver.
# NOTE(review): pickle.load can execute arbitrary code contained in the file —
# only load artifacts that come from a trusted source.
print('loading model...')
with open('../models/MODEL_ecommerce_tfidf_vectorizer_mnb_custom_lemmatizer_3_True_42_1000_spiderwebv4_dataset_html.pkl', "rb") as serialized_model:
    model = pickle.load(serialized_model)

with open('../models/VECTORIZER_ecommerce_tfidf_vectorizer_mnb_custom_lemmatizer_3_True_42_1000_spiderwebv4_dataset_html.pkl', "rb") as serialized_vectorizer:
    vectorizer = pickle.load(serialized_vectorizer)

# Broadcast model and vectorizer to Spark executors
broadcast_model = spark.sparkContext.broadcast(model)
broadcast_vectorizer = spark.sparkContext.broadcast(vectorizer)

# Names of the feature columns produced by the vectorizer.
# NOTE(review): `feature_names` is not referenced by any other code visible in
# this notebook — confirm whether it is still needed.
feature_names = broadcast_vectorizer.value.get_feature_names_out()

def predictor(domain, html):
    """Score a single page; defined once, outside the batch loop.

    Wraps ``domain`` and ``html`` in one-element lists, runs
    ``predict_proba_with_domain`` with the broadcast model and vectorizer, and
    returns ``(probability, prediction)`` as plain ``float`` and ``bool``,
    where ``probability`` is the first entry of the column-1 probabilities.
    """
    labels, _negative_probs, positive_probs = predict_proba_with_domain(
        [domain], [html], broadcast_model.value, broadcast_vectorizer.value
    )
    return float(positive_probs[0]), bool(labels[0])

# Return type of the UDF: a struct with the two values returned by `predictor`
# (the probability as a double and the prediction as a boolean).
result_schema = StructType([
    StructField("probability", DoubleType()),
    StructField("prediction", BooleanType())
])

# Register the UDF once, outside the batch loop.
udf_predictor = udf(predictor, result_schema)

# Number of S3 files read together in each iteration of the loop below.
batch_size = 5
batches = [files[i:i + batch_size] for i in range(0, len(files), batch_size)]

# Load and process each batch of files separately.
for i, batch in enumerate(batches):
    # Manual resume switch (disabled): skips the first batches.
    # if i < 8: continue # 74+

    print(f"Processing batch {i+1}/{len(batches)}")
    file_paths = [f"s3a://{bucket_name}/{file}" for file in batch]
    df_spider_br = spark.read.parquet(*file_paths)
    
    # Keep only rows with status 200, non-empty HTML and a domain ending in
    # '.br', then drop duplicate (domain, html) rows.
    print('preprocessing...')
    df_spider_br = df_spider_br.select('domain', 'html', 'status')
    df_spider_br = df_spider_br.withColumn('html', col('html').cast('string'))
    df_spider_br = df_spider_br.filter((col('status') == 200.0) & 
                                       (col('html') != '[]') & 
                                       (col('html') != '') & 
                                       (col('domain').endswith('.br')))
    df_spider_br = df_spider_br.select('domain', 'html')
    df_spider_br = df_spider_br.dropDuplicates()

    print('predicting...')
    df_with_predictions = df_spider_br.withColumn('results', udf_predictor(df_spider_br.domain, df_spider_br.html))
    
    # Split the struct returned by the UDF into separate probability and
    # prediction columns.
    df_with_predictions = df_with_predictions.withColumn("probability", col("results.probability")) \
                                             .withColumn("prediction", col("results.prediction")) \
                                             .drop('results')

    # Select only the columns that are written out.
    df_with_predictions = df_with_predictions.select('domain', 'html', 'probability', 'prediction')

    # NOTE(review): mode='append' adds to whatever is already at file_path, so
    # re-running a batch would presumably write its rows a second time —
    # confirm before restarting an interrupted run.
    print('writing...')
    file_path = '../data/countries_filtered_with_predictions'
    df_with_predictions.write.parquet(file_path, mode='append')

spark.stop()

loading model...
Processing batch 1/127


                                                                                

preprocessing...
predicting...
writing...


24/06/24 20:01:44 ERROR Executor: Exception in task 9.0 in stage 1.0 (TID 10): /tmp/blockmgr-9212680d-b074-4f38-b25d-9ebc304d1372/22
24/06/24 20:01:55 WARN DiskBlockObjectWriter: Error deleting /tmp/blockmgr-9212680d-b074-4f38-b25d-9ebc304d1372/36/temp_shuffle_538ad78e-5bb3-414d-9566-e2aa675dfcb3
24/06/24 20:01:55 WARN DiskBlockObjectWriter: Error deleting /tmp/blockmgr-9212680d-b074-4f38-b25d-9ebc304d1372/33/temp_shuffle_19a9cc12-33a2-41a5-9797-76b45b753075
24/06/24 20:01:55 ERROR Executor: Exception in task 3.0 in stage 1.0 (TID 4): /tmp/blockmgr-9212680d-b074-4f38-b25d-9ebc304d1372/05
24/06/24 20:01:55 ERROR Executor: Exception in task 16.0 in stage 1.0 (TID 17): /tmp/blockmgr-9212680d-b074-4f38-b25d-9ebc304d1372/24
24/06/24 20:01:57 WARN DiskBlockObjectWriter: Error deleting /tmp/blockmgr-9212680d-b074-4f38-b25d-9ebc304d1372/29/temp_shuffle_56534bf2-dd36-4bd0-a269-ebeee45344b4
24/06/24 20:01:57 ERROR Executor: Exception in task 24.0 in stage 1.0 (TID 25): /tmp/blockmgr-9212680d-b07

KeyboardInterrupt: 



In [None]:
# Leftover debugging cell: displays the current value of the batch-loop index `i`.
i

8

In [30]:
# Stop the Spark session manually (the processing cell above also ends with
# spark.stop(), which is only reached when its loop runs to completion).
spark.stop()

24/06/24 20:08:09 ERROR FileFormatWriter: Aborting job 8a7b4b55-fd9b-4c72-866c-b919d094ed0f.
org.apache.spark.SparkException: Job 2 cancelled because SparkContext was shut down
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$cleanUpAfterSchedulerStop$1(DAGScheduler.scala:1253)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$cleanUpAfterSchedulerStop$1$adapted(DAGScheduler.scala:1251)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
	at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:1251)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:3087)
	at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$stop$3(DAGScheduler.scala:2973)
	at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1375)
	at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:2973)
	at org.apache.spark.SparkContext.$anonfun$stop$12(SparkContex

In [None]:
# df_spider_br.write.parquet('./data/spider_br/brazil_filtered.parquet', mode='error')
# df_test = spark.read.parquet('./data/spider_br/brazil_filtered.parquet')
# df_spider_br = spark.read.parquet('./data/spider_br/brazil_filtered.parquet')