# Imports

In [None]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pprint import pprint
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix

pd.options.mode.chained_assignment = None
pd.options.display.max_columns = None

# Feature Engineering

In [None]:
def check_integrity(dataframe):
    """Validate and clean a crawl DataFrame before feature extraction.

    The frame must contain the columns ``domain`` and ``html``. Rows are
    removed when ``html`` or ``domain`` is null, when ``html`` is empty
    (``''`` or ``'[]'`` after string conversion) and when the row is an exact
    duplicate of an earlier row. A warning is printed for every clean-up step
    that actually drops rows.

    The input frame is never modified, and the index labels of the surviving
    rows are preserved in the returned frame.

    Args:
        dataframe: pandas DataFrame with at least ``domain`` and ``html``.

    Returns:
        A cleaned copy of ``dataframe`` with ``html`` cast to ``str``.

    Raises:
        Exception: if a required column is missing or any step fails; the
            underlying error is chained as ``__cause__``.
    """
    try:
        columns_expected = ['domain', 'html']

        missing = [column for column in columns_expected if column not in dataframe.columns]
        if missing:
            raise Exception('Missing required columns. Columns expected:\n' + str(columns_expected))

        # Null HTML must be dropped BEFORE the string cast: astype(str) turns
        # None/NaN into the literal strings 'None'/'nan', which would slip
        # through every later check.
        nulls = dataframe['html'].isnull().sum()
        if nulls > 0:
            print(f"WARNING: column 'html' has {nulls} empty values. Removing those entries.")
            dataframe = dataframe.dropna(subset=['html'])

        # assign() returns a new frame, so the caller's data is left untouched.
        dataframe = dataframe.assign(html=dataframe['html'].astype(str))

        non_empty = dataframe[(dataframe['html'] != '[]') & (dataframe['html'] != '')]
        removed = len(dataframe) - len(non_empty)
        if removed:
            print(f"WARNING: dataframe has {removed} entries with empty HTML. Removing those entries.")
            dataframe = non_empty

        deduplicated = dataframe.drop_duplicates()
        removed = len(dataframe) - len(deduplicated)
        if removed:
            print(f"WARNING: dataframe has {removed} entries with duplicates values. Removing those entries.")
            dataframe = deduplicated

        nulls = dataframe['domain'].isnull().sum()
        if nulls > 0:
            print(f"WARNING: column 'domain' has {nulls} empty values. Removing those entries.")
            dataframe = dataframe.dropna(subset=['domain'])

        return dataframe
    except Exception as e:
        raise Exception('Failed in integrity check.\nError:\n' + str(e)) from e

In [None]:
def build_lemmatizer_pt_dict():
    """Build the Portuguese lemma lookup from the ``lemmatization-pt.txt`` list.

    The list is downloaded into the working directory when no local copy is
    present. Every line that splits into exactly two whitespace-separated
    fields adds an entry ``second field -> first field`` (presumably
    ``inflected form -> lemma``; confirm against the list's documentation).
    The local file is always removed afterwards, on success and on failure.

    Returns:
        dict mapping the second column of the list to the first column.

    Raises:
        Exception: if the download or the parsing fails; the underlying error
            is chained as ``__cause__``.
    """
    import os

    url = "https://github.com/michmech/lemmatization-lists/raw/master/lemmatization-pt.txt"
    file_name = "lemmatization-pt.txt"

    try:
        # Download only when no local copy is present.
        if not os.path.exists(file_name):
            # Imported lazily so a pre-seeded file works without network access.
            import requests

            response = requests.get(url, timeout=30)
            # Fail loudly on HTTP errors instead of parsing an error page.
            response.raise_for_status()
            with open(file_name, 'wb') as f:
                f.write(response.content)

        lemmatizer_pt_dict = {}
        # 'utf-8-sig' decodes UTF-8 regardless of the platform default and
        # strips a leading byte-order mark, which would otherwise end up
        # glued to the first entry.
        with open(file_name, 'r', encoding='utf-8-sig') as dic:
            for line in dic:
                txt = line.split()
                if len(txt) == 2:
                    lemmatizer_pt_dict[txt[1]] = txt[0]

        return lemmatizer_pt_dict
    except Exception as e:
        raise Exception('An error occurred on build_lemmatizer_pt_dict.\nError:\n' + str(e)) from e
    finally:
        # Single cleanup point: runs after a successful return and after an error.
        if os.path.exists(file_name):
            os.remove(file_name)


In [None]:
def custom_lemmatizer(tokens, lemmatizer_pt_dict):
    """Lemmatize ``tokens`` using the custom dictionary first, WordNet second.

    A token that is a key of ``lemmatizer_pt_dict`` is replaced by its mapped
    value; any other token goes through NLTK's ``WordNetLemmatizer``.

    Args:
        tokens: iterable of token strings.
        lemmatizer_pt_dict: dict mapping a token to its replacement.

    Returns:
        list with one lemmatized entry per input token, in input order.

    Raises:
        Exception: wrapping any error raised during lemmatization.
    """
    try:
        from nltk.stem.wordnet import WordNetLemmatizer

        wordnet = WordNetLemmatizer()
        return [
            lemmatizer_pt_dict.get(word)
            if word in lemmatizer_pt_dict.keys()
            else wordnet.lemmatize(word)
            for word in tokens
        ]
    except Exception as e:
        raise Exception('An error occurred on custom_lemmatizer.\nError:\n' + str(e))

In [None]:
# Process-wide cache: the NLTK downloads and the stop-word set are prepared on
# the first call only, instead of once per document.
_VECTORIZER_NLTK_CACHE = {}


def _prepare_nltk_stop_words():
    """Download the required NLTK resources once and return the cached stop-word set."""
    if 'stop_words' not in _VECTORIZER_NLTK_CACHE:
        import nltk
        from nltk.corpus import stopwords

        # NOTE(review): recent NLTK releases may also need 'punkt_tab' for
        # word_tokenize - confirm against the installed version.
        for resource in ('stopwords', 'punkt', 'wordnet'):
            nltk.download(resource, quiet=True)

        # Stored only after a successful load, so a failed first attempt is
        # retried on the next call.
        _VECTORIZER_NLTK_CACHE['stop_words'] = frozenset(
            stopwords.words('portuguese')
        ).union(stopwords.words('english'))
    return _VECTORIZER_NLTK_CACHE['stop_words']


def process_html_for_vectorizer(html_text, lemmatizer_pt_dict):
    """Turn an HTML document into the lemmatized token list fed to the vectorizer.

    Pipeline: take the ``<body>`` text, strip accents / non-ASCII characters,
    replace everything that is not a letter or whitespace with a space, split
    camelCase and ALLCAPS runs into separate words, lowercase, tokenize, drop
    Portuguese and English stop words and tokens of two characters or fewer,
    then lemmatize with ``custom_lemmatizer``.

    Args:
        html_text: HTML source of one page.
        lemmatizer_pt_dict: dict passed through to ``custom_lemmatizer``.

    Returns:
        list of lemmatized tokens (empty when the document has no body).

    Raises:
        Exception: wrapping any processing error; the underlying error is
            chained as ``__cause__``.
    """
    import nltk
    import unicodedata
    from bs4 import BeautifulSoup
    import re

    try:
        stop_words = _prepare_nltk_stop_words()

        # Keep only the text of the HTML body.
        soup = BeautifulSoup(html_text, 'html.parser')
        text = soup.body.get_text() if soup.body else ''

        # NOTE(review): this strips accents, so accented entries of the
        # stop-word list or of the lemma dictionary can no longer match a
        # token. Left untouched because the pickled vectorizers were
        # presumably fit on this exact pipeline - confirm before changing.
        preprocessed_text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')

        # Collapse whitespace runs and line breaks.
        preprocessed_text = re.sub(r'\s+', ' ', preprocessed_text).strip()

        # Replace everything that is not a letter or whitespace with a space.
        preprocessed_text = re.sub(r"[^a-zA-Z\s]", " ", preprocessed_text)

        # Word pattern: an ALLCAPS run, a Capitalized word or a lowercase run.
        pattern = re.compile(r'([A-Z]+(?![a-z])|[A-Z][a-z]*|[a-z]+)')

        # Prefix each match with a space so glued words (camelCase) are split.
        preprocessed_text = pattern.sub(r' \1', preprocessed_text)

        preprocessed_text = preprocessed_text.lower()

        # Remove the repeated spaces introduced above.
        preprocessed_text = re.sub(r"\s+", " ", preprocessed_text).strip()

        tokens = nltk.word_tokenize(preprocessed_text)

        # Drop stop words and very short tokens.
        tokens = [
            token for token in tokens if token not in stop_words and len(token) > 2
        ]

        return custom_lemmatizer(tokens, lemmatizer_pt_dict)
    except Exception as e:
        raise Exception('An error occurred while processing HTMLs for vectorizer.\nError:\n' + str(e)) from e

In [None]:
from bs4 import BeautifulSoup
import re
def process_html_for_how_many_prices(text):
    """Return how many currency markers (``$`` / ``R$``) appear in ``text``.

    Every ``$`` character counts once, so ``R$`` contributes a single hit.

    Raises:
        Exception: wrapping any error raised while scanning (for example a
            non-string input).
    """
    try:
        return sum(1 for _ in re.finditer(r'\$|R\$', text))
    except Exception as e:
        raise Exception('An error occurred while processing HTMLs for prices.\nError:\n' + str(e))

def process_html_for_how_many_values(text):
    """Return how many numeric values the value pattern finds in ``text``.

    The pattern accepts digit runs with optional ``.ddd`` thousands groups and
    an optional ``,dd`` decimal part, or the ``,ddd`` / ``.dd`` variant.

    Raises:
        Exception: wrapping any error raised while scanning (for example a
            non-string input).
    """
    try:
        found = re.findall(r'\d+(?:\.\d{3})*(?:,\d{2})?|\d+(?:,\d{3})*(?:\.\d{2})?', text)
        return len(found)
    except Exception as e:
        raise Exception('An error occurred while processing HTMLs for values.\nError:\n' + str(e))

def get_html_body(html_str):
    """Return the text content of the ``<body>`` of ``html_str``.

    Returns an empty string when the parsed document has no body element.

    Raises:
        Exception: wrapping any parsing error.
    """
    try:
        body = BeautifulSoup(html_str, 'html.parser').body
        if not body:
            return ''
        return body.get_text()
    except Exception as e:
        raise Exception('An error occurred while trying to get HTML body.\nError:\n' + str(e))

In [None]:
def get_html_links(row):
    """Return the URL of every ``<a>`` / ``<area>`` ``href`` found in a row's HTML.

    Args:
        row: mapping (for example a DataFrame row) with the keys ``html`` and
            ``domain``; ``domain`` is used as the URL of the parsed response.

    Returns:
        list of link URLs as reported by scrapy's ``LinkExtractor``;
        duplicates are kept (``unique=False``).

    Raises:
        Exception: wrapping any error raised while extracting the links.
    """
    try:
        from scrapy.http import HtmlResponse
        from scrapy.linkextractors import LinkExtractor

        # No domain restriction; only href attributes of anchor/area tags.
        extractor = LinkExtractor(
            allow_domains=[],
            tags=['a', 'area'],
            attrs=['href'],
            unique=False,
        )

        page_html = row['html']
        page_url = row['domain']
        page = HtmlResponse(url=page_url, body=page_html, encoding='utf-8')

        return [found.url for found in extractor.extract_links(page)]
    except Exception as e:
        raise Exception('An error occurred while searching for links in HTML.\nError:\n' + str(e))

In [None]:
def get_autoreference_links_from_html(row):
    """Return the links of a page that point back to the page's own host.

    A link is kept when its URL has no network location or when that location
    equals the one parsed from ``row['domain']``; kept links are resolved
    against ``row['domain']`` with ``urljoin``.

    Args:
        row: mapping (for example a DataFrame row) with the keys ``html`` and
            ``domain``.

    Returns:
        list of self-referencing URLs; duplicates are kept (``unique=False``).
    """
    from scrapy.http import HtmlResponse
    from scrapy.linkextractors import LinkExtractor
    from urllib.parse import urlparse, urljoin

    page_html = row['html']
    domain = row['domain']
    # NOTE(review): urlparse yields an empty netloc when `domain` carries no
    # scheme - confirm the format stored in the 'domain' column.
    own_host = urlparse(domain).netloc

    page = HtmlResponse(url=domain, body=page_html, encoding='utf-8')
    extractor = LinkExtractor(tags=['a', 'area'], attrs=['href'], unique=False)

    return [
        urljoin(domain, found.url)
        for found in extractor.extract_links(page)
        if urlparse(found.url).netloc in ('', own_host)
    ]

In [None]:
def only_number(text):
    """Return ``text`` with every non-digit character removed."""
    return re.sub(r'[^\d]', '', text)

def remove_invalid_company(company_id):
    """Return ``company_id`` when it looks like a usable 14-digit id, else ``None``.

    Any run of 13 identical digits is stripped first; the remainder is
    accepted only when exactly 14 characters are left.
    """
    stripped = re.sub(r'(\d)\1{12}', '', company_id)
    return stripped if len(stripped) == 14 else None

def order_by_common(data):
    """Return the distinct items of ``data`` ordered from most to least frequent."""
    from collections import Counter

    ranked = Counter(data).most_common()
    return [entry[0] for entry in ranked]

def extract_and_process_cnpjs(text):
    """Return the digit-only CNPJ candidates found in ``text``.

    Candidates are matched in the formatted layout ``dd.ddd.ddd/dddd-dd`` (a
    space is also accepted in place of ``/`` and ``-``), reduced to digits
    with ``only_number`` and kept only when ``remove_invalid_company``
    accepts them. Order of appearance and duplicates are preserved.
    """
    candidates = re.findall(r'\d{2}\.\d{3}\.\d{3}[\/ ]\d{4}[- ]\d{2}', text)
    validated = (remove_invalid_company(only_number(candidate)) for candidate in candidates)
    return [company for company in validated if company]

In [None]:
def get_features_dataframe(dataframe, aditional_columns):
    """Select the model feature columns from ``dataframe``.

    Args:
        dataframe: pandas DataFrame holding the candidate columns.
        aditional_columns: iterable of column-name groups; the groups are
            flattened, in order, into the final column selection.

    Returns:
        DataFrame restricted to the flattened column list.

    Raises:
        Exception: wrapping any selection error (for example a missing column).
    """
    try:
        # Flatten the groups into the single list of columns the model needs.
        feature_columns = [name for group in aditional_columns for name in group]
        return dataframe.loc[:, feature_columns]
    except Exception as e:
        raise Exception('An error occurred while trying to build features DataFrame.\nError:\n' + str(e))

In [None]:
def generate_features(dataframe):
    """Clean the input frame and add the engineered feature columns.

    After the integrity check, the following columns are added:

    * ``tokens`` - token list of each page (``process_html_for_vectorizer``)
    * ``processed_cnpjs`` / ``has_cnpj`` - CNPJ numbers found in the raw HTML
      and whether at least one was found
    * ``count_prices`` / ``has_prices`` - number of currency markers in the
      body text and whether that count is greater than 1

    Args:
        dataframe: pandas DataFrame with the columns ``domain`` and ``html``.

    Returns:
        A new DataFrame holding the cleaned rows plus the feature columns;
        the frame passed in does not receive the new columns.

    Raises:
        Exception: wrapping any error raised by a step; the underlying error
            is chained as ``__cause__``.
    """
    try:
        # check_integrity may hand back the caller's own frame or a slice of
        # it; copy so the new columns never leak into (or warn about) that data.
        dataframe = check_integrity(dataframe).copy()

        lem_dict = build_lemmatizer_pt_dict()
        html_body = dataframe.loc[:, 'html'].apply(get_html_body)
        dataframe.loc[:, 'tokens'] = dataframe.loc[:, 'html'].apply(
            lambda x: process_html_for_vectorizer(x, lem_dict)
        )

        # CNPJs are searched in the raw HTML, not only in the body text.
        dataframe.loc[:, 'processed_cnpjs'] = dataframe.loc[:, 'html'].apply(extract_and_process_cnpjs)
        dataframe.loc[:, 'has_cnpj'] = dataframe.loc[:, 'processed_cnpjs'].apply(bool)

        dataframe.loc[:, 'count_prices'] = html_body.apply(process_html_for_how_many_prices)
        # NOTE(review): strictly greater than 1, so a single marker does not
        # count as "has prices" - confirm this threshold is intended.
        dataframe['has_prices'] = dataframe['count_prices'] > 1

        # Size, token-count, link-count and value-count features are currently
        # disabled; the helpers for links and values remain available in this file.

        return dataframe
    except Exception as e:
        raise Exception('An error occured while trying to generate features.\nError:\n' + str(e)) from e

# Loading Data

In [None]:
# Alternative validated datasets (swap the active loader below to switch):
# df = pd.read_parquet('../data/validated_test_samples.parquet').rename(columns={'website': 'domain', 'is_true_ecommerce': 'true_ecommerce'}).reset_index(drop=True)
# df = pd.read_parquet('../data/validated_data_pagarme.parquet').rename(columns={'host': 'domain', 'is_true_ecommerce': 'true_ecommerce'}).reset_index(drop=True)
# df = pd.read_parquet('../data/validated_data_pagarme_non_ecomm_subset.parquet').rename(columns={'host': 'domain'}).reset_index(drop=True)
df = (
    pd.read_parquet('../data/validated_data_pagarme_lista8_plataformas.parquet')
    .rename(columns={'is_true_ecommerce': 'true_ecommerce'})
    .reset_index(drop=True)
)
df['html'] = df['html'].astype(str)

# Untouched copy, used to recover the labels of the rows that survive cleaning.
df_copy = df.copy()

# Feature generation only needs the domain and the HTML; it may drop rows.
df = generate_features(df[['domain', 'html']])
indices = df.index.tolist()

# Labels of the surviving rows, selected through their original index labels.
df_y = df_copy.loc[df_copy.index.isin(indices), 'true_ecommerce']
df = df.reset_index(drop=True)
df.head()

In [None]:
# Class balance of the ground-truth labels kept after cleaning.
df_y.value_counts()

In [None]:
# Labels and feature rows must have the same length for the metrics computed later.
df_y.shape, df.shape

In [None]:
# Peek at the last rows of the engineered frame.
df.tail()

In [None]:
# One space-separated string per document, the input passed to the fitted vectorizers.
token_strings = list(map(' '.join, df['tokens']))

# Hand-crafted feature columns used alongside the TF-IDF terms.
features = ['has_cnpj', 'has_prices']
other_features = df[features]

In [None]:
THRESHOLD = 0.6


def predict_proba_with_domain(X, estimator, vectorizer, threshold=0.5):
    """Predict binary labels from the estimator's class probabilities.

    Args:
        X: feature matrix accepted by ``estimator.predict_proba``.
        estimator: fitted binary classifier exposing ``predict_proba``.
        vectorizer: unused; kept so existing calls keep working.
        threshold: minimum probability of the second class for a row to be
            labelled 1.

    Returns:
        Tuple ``(y_preds, y_probs_0, y_probs_1)``: an integer NumPy array of
        0/1 predictions and two lists with the probability of the first and
        of the second class for every row. All three are empty when the
        estimator returns no rows.

    Raises:
        ValueError: if the estimator does not return exactly two probability
            columns.
    """
    probabilities = np.asarray(estimator.predict_proba(X))
    if probabilities.size == 0:
        # Nothing to score: return empty results instead of failing on unpack.
        return np.asarray([], dtype=int), [], []

    if probabilities.ndim != 2 or probabilities.shape[1] != 2:
        raise ValueError(
            f"Expected probabilities of shape (n_samples, 2), got {probabilities.shape}."
        )

    y_probs_0 = list(probabilities[:, 0])
    y_probs_1 = list(probabilities[:, 1])
    y_preds = (probabilities[:, 1] >= threshold).astype(int)

    return y_preds, y_probs_0, y_probs_1

# Test using last version model

In [None]:
# Pickled estimator and vectorizer of the last released model version.
last_model_path = "../models/MODEL_ecommerce_tfidf_vectorizer_mnb_custom_lemmatizer_3_True_42_1000_spiderwebv4_dataset_html.pkl"
last_vectorizer_path = "../models/VECTORIZER_ecommerce_tfidf_vectorizer_mnb_custom_lemmatizer_3_True_42_1000_spiderwebv4_dataset_html.pkl"

# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# artifacts that come from a trusted source.
with open(last_vectorizer_path, "rb") as f:
    last_version_vectorizer = pickle.load(f)

with open(last_model_path, "rb") as f:
    last_version_model = pickle.load(f)

In [None]:
# Converter a matriz TF-IDF em um dataframe pandas
# TF-IDF matrix of the evaluation documents, expanded into a dense DataFrame
tfidf_matrix = last_version_vectorizer.transform(token_strings)
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=last_version_vectorizer.get_feature_names_out(),
)

# Hand-crafted flags first, TF-IDF terms after; both frames share the 0..n-1 index
features_df = pd.concat(objs=[other_features, tfidf_df], axis=1)
features_df.head()

In [None]:
# Score the evaluation set with the last released model at the shared threshold.
y_preds, y_probs_0, y_probs_1 = predict_proba_with_domain(
    X=features_df,
    estimator=last_version_model,
    vectorizer=last_version_vectorizer,
    threshold=THRESHOLD,
)

In [None]:
pprint(f"Accuracy: {round(accuracy_score(y_true=df_y, y_pred=y_preds), 4)}")
pprint(f"F1-Score: {round(f1_score(y_true=df_y, y_pred=y_preds), 4)}")

In [None]:
pprint(classification_report(y_true=df_y, y_pred=y_preds))

In [None]:
cm_last = confusion_matrix(y_true=df_y, y_pred=y_preds)

# Test using current version model

In [None]:
# Candidate artifacts for the current model version; exactly one model/vectorizer
# pair is active, the others are kept as commented-out alternatives.
# current_model_path = "../models/MODEL_v1_ecommerce_tfidf_vectorizer_multinomial_nb_custom_lemmatizer_3_True_42_1000_noisy_training_data.pkl"
# current_vectorizer_path = "../models/VECTORIZER_v1_ecommerce_tfidf_vectorizer_multinomial_nb_custom_lemmatizer_3_True_42_1000_noisy_training_data.pkl"

current_model_path = "../models/MODEL_v1_ecommerce_tfidf_vectorizer_multinomial_nb_custom_lemmatizer_3_True_42_1000_training_data.pkl"
current_vectorizer_path = "../models/VECTORIZER_v1_ecommerce_tfidf_vectorizer_multinomial_nb_custom_lemmatizer_3_True_42_1000_training_data.pkl"

# current_model_path = "../models/MODEL_v1_ecommerce_tfidf_vectorizer_multinomial_nb_custom_lemmatizer_3_True_42_1000_training_data_extra_samples.pkl"
# current_vectorizer_path = "../models/VECTORIZER_v1_ecommerce_tfidf_vectorizer_multinomial_nb_custom_lemmatizer_3_True_42_1000_training_data_extra_samples.pkl"

# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# artifacts that come from a trusted source.
with open(current_vectorizer_path, "rb") as f:
    current_version_vectorizer = pickle.load(f)

with open(current_model_path, "rb") as f:
    current_version_model = pickle.load(f)

In [None]:
# Converter a matriz TF-IDF em um dataframe pandas
# TF-IDF matrix of the evaluation documents, expanded into a dense DataFrame
tfidf_matrix = current_version_vectorizer.transform(token_strings)
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=current_version_vectorizer.get_feature_names_out(),
)

# Hand-crafted flags first, TF-IDF terms after; both frames share the 0..n-1 index
features_df = pd.concat(objs=[other_features, tfidf_df], axis=1)
features_df.head()

In [None]:
# Score the evaluation set with the current model at the shared threshold.
y_preds, y_probs_0, y_probs_1 = predict_proba_with_domain(
    X=features_df,
    estimator=current_version_model,
    vectorizer=current_version_vectorizer,
    threshold=THRESHOLD,
)

In [None]:
pprint(f"Accuracy: {round(accuracy_score(y_true=df_y, y_pred=y_preds), 4)}")
pprint(f"F1-Score: {round(f1_score(y_true=df_y, y_pred=y_preds), 4)}")

In [None]:
pprint(classification_report(y_true=df_y, y_pred=y_preds))

In [None]:
cm_current = confusion_matrix(y_true=df_y, y_pred=y_preds)

In [None]:
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

# One heatmap per model: the last released one on the left, the current one on the right.
panels = [
    (cm_last, 'Modelo Atual'),
    (cm_current, 'Modelo Treinado com Dados Validados'),
]
class_labels = ['non_ecom', 'ecom']

for ax, (matrix, title) in zip(axs, panels):
    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        linewidths=.5,
        square=True,
        cmap='Blues',
        annot_kws={"size": 16},
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_title(title, size=20)
    ax.set_xlabel('Predicted Label', size=20)
    ax.set_ylabel('Actual Label', size=20)

plt.tight_layout()
# plt.savefig("../images/16-04-2025/confusion_matrix_dados_analise_probabilidade_lista8.png")