# Imports and df reading

In [None]:
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
import re
import os
import unicodedata
import pickle
from pprint import pprint

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, RocCurveDisplay
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, StratifiedKFold

pd.options.mode.chained_assignment = None
pd.options.display.max_columns = None

In [None]:
# Load the labelled training set and rename its columns to the names used by the rest of the notebook.
df = pd.read_parquet('../data/training_data.parquet').rename(columns={'website': 'domain', 'is_true_ecommerce': 'true_ecommerce'})
# df = pd.read_parquet('../data/noisy_training_data.parquet')
# NOTE(review): astype(str) also turns missing values into text (e.g. 'None'/'nan'),
# so a later isnull() on 'html' will not report them — confirm this is intended.
df['html'] = df['html'].astype(str)

In [None]:
# extra_df = pd.read_parquet('../data/extra_training_data.parquet').rename(columns={'website': 'domain', 'is_true_ecommerce': 'true_ecommerce'})
# # df = pd.read_parquet('../data/noisy_training_data.parquet')
# extra_df['html'] = extra_df['html'].astype(str)

# df = pd.concat([df, extra_df], axis=0, ignore_index=True)
# df.shape

In [None]:
# Preview the first rows to confirm the rename and the html cast.
df.head(5)

In [None]:
# Number of distinct domains; compare with len(df) to spot duplicated domains.
df['domain'].nunique()

In [None]:
# Class balance of the target column.
df['true_ecommerce'].value_counts()

In [None]:
# Missing values per column ('html' was already cast to str above, so it will show none).
df.isnull().sum()

# Feature Engineering

In [None]:
def check_integrity(dataframe):
    """Validate the raw crawl DataFrame and return a cleaned copy.

    The caller's DataFrame is never modified. Rows are removed when:
      * 'domain' or 'html' is missing (checked *before* the string cast, so
        missing HTML is not hidden behind the text 'None'/'nan');
      * the HTML is empty ('' or '[]');
      * the domain does not end with '.br';
      * the domain is a duplicate (the first occurrence is kept).

    Args:
        dataframe: pandas DataFrame with at least the columns 'domain' and 'html'.

    Returns:
        A new DataFrame holding only the rows that passed every check, with
        'html' cast to str. The original index labels are preserved.

    Raises:
        Exception: if a required column is missing or any step fails; the
            original error is chained as ``__cause__``.
    """
    try:
        columns_expected = [
            'domain',
            'html',
            ]

        if not all(item in dataframe.columns for item in columns_expected):
            raise Exception('Missing required columns. Columns expected:\n' + str(columns_expected))

        # Drop missing values first: once 'html' is cast to str a null can no
        # longer be detected with isnull().
        for column in columns_expected:
            nulls = dataframe[column].isnull().sum()
            if nulls > 0:
                print(f"WARNING: column '{column}' has {nulls} empty values. Removing those entries.")
                dataframe = dataframe.dropna(subset=[column])

        # assign() returns a new frame, so the caller's object is left untouched.
        dataframe = dataframe.assign(html=dataframe['html'].astype(str))

        dataframe_filtered = dataframe[(dataframe['html'] != '[]') &
                                       (dataframe['html'] != '') &
                                       (dataframe['domain'].str.endswith('.br', na=False))]

        if len(dataframe) != len(dataframe_filtered):
            count = len(dataframe) - len(dataframe_filtered)
            print(f"WARNING: dataframe has {count} entries with empty HTML and/or does not ends with '.br'. Removing those entries.")
            dataframe = dataframe_filtered

        dataframe_filtered = dataframe.drop_duplicates(subset=["domain"])
        if len(dataframe) != len(dataframe_filtered):
            count = len(dataframe) - len(dataframe_filtered)
            print(f"WARNING: dataframe has {count} entries with duplicates values. Removing those entries.")
            dataframe = dataframe_filtered

        return dataframe
    except Exception as e:
        raise Exception('Failed in integrity check.\nError:\n' + str(e)) from e

In [None]:
def build_lemmatizer_pt_dict():
    """Build the Portuguese lookup table used by ``custom_lemmatizer``.

    The lemmatization list is downloaded once into the working directory and
    reused on later calls. Each line with exactly two whitespace-separated
    fields maps the second field (dictionary key) to the first field (value);
    any other line is ignored.

    Returns:
        dict[str, str]: second column -> first column of the list file.

    Raises:
        Exception: if the download or the parsing fails. The local file is
            removed in that case so a partial or corrupt copy is not reused.
    """
    import os

    url = "https://github.com/michmech/lemmatization-lists/raw/master/lemmatization-pt.txt"
    file_name = "lemmatization-pt.txt"

    try:
        # Download only when no cached copy exists.
        if not os.path.exists(file_name):
            import requests

            response = requests.get(url, timeout=60)
            # Fail loudly instead of saving an HTTP error page as the word list.
            response.raise_for_status()
            with open(file_name, 'wb') as f:
                f.write(response.content)

        # Explicit encoding: the list contains accented words, and 'utf-8-sig'
        # also drops a leading byte-order mark if the file has one.
        lemmatizer_pt_dict = {}
        with open(file_name, 'r', encoding='utf-8-sig') as dic:
            for line in dic:
                txt = line.split()
                if len(txt) == 2:
                    lemmatizer_pt_dict[txt[1]] = txt[0]

        return lemmatizer_pt_dict
    except Exception as e:
        if os.path.exists(file_name):
            os.remove(file_name)
        raise Exception('An error occurred on build_lemmatizer_pt_dict.\nError:\n' + str(e)) from e


In [None]:
def custom_lemmatizer(tokens, lemmatizer_pt_dict):
    """Lemmatize *tokens*, preferring the supplied lookup table.

    A token found in ``lemmatizer_pt_dict`` is replaced by its mapped value;
    every other token is passed to NLTK's ``WordNetLemmatizer``.

    Args:
        tokens: iterable of token strings.
        lemmatizer_pt_dict: mapping of token -> lemma.

    Returns:
        list: one lemma per input token, in the same order.

    Raises:
        Exception: wraps any error raised while lemmatizing.
    """
    try:
        from nltk.stem.wordnet import WordNetLemmatizer

        wordnet = WordNetLemmatizer()
        return [
            lemmatizer_pt_dict[token] if token in lemmatizer_pt_dict else wordnet.lemmatize(token)
            for token in tokens
        ]
    except Exception as e:
        raise Exception('An error occurred on custom_lemmatizer.\nError:\n' + str(e))

In [None]:
import functools
import re
import unicodedata

# Compiled once at import time instead of once per document.
_NON_LETTER_RE = re.compile(r"[^a-zA-Z\s]")
_WORD_RE = re.compile(r'([A-Z]+(?![a-z])|[A-Z][a-z]*|[a-z]+)')
_WHITESPACE_RE = re.compile(r"\s+")


@functools.lru_cache(maxsize=1)
def _load_stop_words():
    """Fetch the NLTK resources once and return the PT+EN stop-word set."""
    import nltk
    from nltk.corpus import stopwords

    # NOTE(review): newer NLTK releases appear to load 'punkt_tab' for
    # word_tokenize; it is requested alongside 'punkt' — confirm against the
    # pinned NLTK version.
    for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
        nltk.download(resource, quiet=True)

    return frozenset(stopwords.words('portuguese')) | frozenset(stopwords.words('english'))


def process_html_for_vectorizer(html_text, lemmatizer_pt_dict):
    """Turn one HTML document into the list of lemmas fed to the vectorizer.

    Steps: take the text of ``<body>`` ('' when there is no body), strip
    accents by NFKD-normalising and dropping non-ASCII characters, replace
    everything that is not an ASCII letter or whitespace with a space, insert
    a space in front of every word piece (which splits camelCase runs),
    lowercase, tokenize with NLTK, drop stop words and tokens of two
    characters or fewer, then lemmatize with ``custom_lemmatizer``.

    Args:
        html_text: raw HTML string.
        lemmatizer_pt_dict: token -> lemma mapping passed to ``custom_lemmatizer``.

    Returns:
        list: lemmatized tokens, in document order.

    Raises:
        Exception: wraps any processing error; the cause is chained.
    """
    import nltk
    from bs4 import BeautifulSoup

    try:
        # Downloads and stop-word loading happen on the first call only.
        stop_words = _load_stop_words()

        # Keep only the <body> text.
        soup = BeautifulSoup(html_text, 'html.parser')
        text = soup.body.get_text() if soup.body else ''

        # Accents are removed here, i.e. before the lemma lookup.
        # NOTE(review): accented keys in lemmatizer_pt_dict can therefore never
        # match a token — confirm whether the table should be normalised too.
        preprocessed_text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')

        # Replace anything that is not a letter or whitespace with a space.
        preprocessed_text = _NON_LETTER_RE.sub(" ", preprocessed_text)

        # Prefix every word piece with a space so glued words are separated.
        preprocessed_text = _WORD_RE.sub(r' \1', preprocessed_text)

        preprocessed_text = preprocessed_text.lower()

        # Single whitespace collapse: the earlier passes only ever add spaces,
        # so collapsing once at the end gives the same text.
        preprocessed_text = _WHITESPACE_RE.sub(" ", preprocessed_text).strip()

        tokens = nltk.word_tokenize(preprocessed_text)

        # Remove stop words and very short tokens.
        tokens = [
            token for token in tokens if token not in stop_words and len(token) > 2
        ]

        return custom_lemmatizer(tokens, lemmatizer_pt_dict)
    except Exception as e:
        raise Exception('An error occurred while processing HTMLs for vectorizer.\nError:\n' + str(e)) from e

In [None]:
from bs4 import BeautifulSoup
import re
def process_html_for_how_many_prices(text):
    """Count the currency markers ('$' or 'R$') that occur in *text*.

    Args:
        text: string to scan.

    Returns:
        int: number of non-overlapping matches.

    Raises:
        Exception: wraps any error raised by the regex search (e.g. non-string input).
    """
    try:
        return sum(1 for _ in re.finditer(r'\$|R\$', text))
    except Exception as e:
        raise Exception('An error occurred while processing HTMLs for prices.\nError:\n' + str(e))

def process_html_for_how_many_values(text):
    """Count the numeric values found in *text* by the value regex.

    Args:
        text: string to scan.

    Returns:
        int: number of non-overlapping matches.

    Raises:
        Exception: wraps any error raised by the regex search (e.g. non-string input).
    """
    try:
        value_pattern = r'\d+(?:\.\d{3})*(?:,\d{2})?|\d+(?:,\d{3})*(?:\.\d{2})?'
        return sum(1 for _ in re.finditer(value_pattern, text))
    except Exception as e:
        raise Exception('An error occurred while processing HTMLs for values.\nError:\n' + str(e))

def get_html_body(html_str):
    """Return the text of the ``<body>`` of *html_str*, or '' when unavailable.

    The parsers 'html.parser', 'html5lib' and 'lxml' are tried in that order;
    the first one that parses the document decides the result. A parser that
    raises (for example because it is not installed) is skipped.

    Args:
        html_str: raw HTML string.

    Returns:
        str: body text, '' when the document has no body, and also '' when
        every parser failed (so callers can always treat the result as text).
    """
    for parser in ('html.parser', 'html5lib', 'lxml'):
        try:
            soup = BeautifulSoup(html_str, parser)
            return soup.body.get_text() if soup.body else ''
        except Exception:
            # Deliberate best effort: fall through to the next parser.
            continue

    return ''

In [None]:
def get_html_links(row):
    """Extract the URL of every ``<a>``/``<area>`` ``href`` found in a row's HTML.

    Args:
        row: mapping (e.g. a DataFrame row) with the keys 'html' and 'domain';
            'domain' is used as the URL of the response handed to scrapy.

    Returns:
        list: the ``url`` of each extracted link, duplicates included.

    Raises:
        Exception: wraps any error raised while building the response or
            extracting the links.
    """
    try:
        from scrapy.http import HtmlResponse
        from scrapy.linkextractors import LinkExtractor

        # An empty allow_domains list applies no domain restriction.
        extractor = LinkExtractor(
            allow_domains=[],
            tags=['a', 'area'],
            attrs=['href'],
            unique=False,
        )

        page_html = row['html']
        page_url = row['domain']
        page = HtmlResponse(url=page_url, body=page_html, encoding='utf-8')

        return [found.url for found in extractor.extract_links(page)]
    except Exception as e:
        raise Exception('An error occurred while searching for links in HTML.\nError:\n' + str(e))

In [None]:
def get_autoreference_links_from_html(row):
    """Return the links in a row's HTML that point back to the row's own site.

    A link is kept when its URL has no network location or when its network
    location equals the one parsed from ``row['domain']``. Kept links are
    joined against ``row['domain']`` before being returned.

    NOTE(review): ``urlparse`` only yields a netloc when the value carries a
    scheme (``https://...``) — confirm the format stored in 'domain'.

    Args:
        row: mapping with the keys 'html' and 'domain'.

    Returns:
        list[str]: self-referencing URLs, duplicates included.
    """
    from scrapy.http import HtmlResponse
    from scrapy.linkextractors import LinkExtractor
    from urllib.parse import urlparse, urljoin

    page_html = row['html']
    site = row['domain']
    own_host = urlparse(site).netloc

    page = HtmlResponse(url=site, body=page_html, encoding='utf-8')
    extractor = LinkExtractor(tags=['a', 'area'], attrs=['href'], unique=False)

    def _points_to_own_site(url):
        host = urlparse(url).netloc
        return not host or host == own_host

    return [
        urljoin(site, found.url)
        for found in extractor.extract_links(page)
        if _points_to_own_site(found.url)
    ]

In [None]:
def only_number(text):
    """Return *text* with every non-digit character removed."""
    return re.sub(r'[^\d]', '', text)

def remove_invalid_company(company_id):
    """Return *company_id* if it is still 14 characters long after filtering, else None.

    Any run of 13 identical consecutive digits is deleted first, so ids made
    of a single repeated digit no longer have 14 characters and are rejected.
    """
    remaining = re.sub(r'(\d)\1{12}', '', company_id)
    return remaining if len(remaining) == 14 else None

def order_by_common(data):
    """Return the distinct items of *data*, most frequent first."""
    from collections import Counter

    return [item for item, _ in Counter(data).most_common()]

def extract_and_process_cnpjs(text):
    """Find formatted CNPJ-like numbers in *text* and return them as digit strings.

    Only the punctuated form is searched for (``NN.NNN.NNN/NNNN-NN``, where the
    slash and the dash may each be a space). Every match is reduced to its
    digits with ``only_number`` and kept only if ``remove_invalid_company``
    accepts it.

    Returns:
        list[str]: accepted 14-digit strings, in order of appearance,
        duplicates included.
    """
    cnpj_pattern = re.compile(r'\d{2}\.\d{3}\.\d{3}[\/ ]\d{4}[- ]\d{2}')
    candidates = (
        remove_invalid_company(only_number(found))
        for found in cnpj_pattern.findall(text)
    )
    return [company for company in candidates if company]

In [None]:
def get_features_dataframe(dataframe, aditional_columns):
    """Select the model feature columns from *dataframe*.

    Args:
        dataframe: pandas DataFrame holding the generated features.
        aditional_columns: iterable of column-name groups (e.g. a list of
            lists); the groups are flattened in order.

    Returns:
        DataFrame restricted to the flattened column list.

    Raises:
        Exception: wraps any error, such as a column that does not exist.
    """
    try:
        # Columns required by the model, flattened from the groups.
        selected = [name for group in aditional_columns for name in group]
        return dataframe.loc[:, selected]
    except Exception as e:
        raise Exception('An error occurred while trying to build features DataFrame.\nError:\n' + str(e))

In [None]:
def generate_features(dataframe):
    """Clean *dataframe* and add the engineered feature columns.

    The frame is first passed through ``check_integrity``. The following
    columns are then added:
      * 'tokens'          - lemmas produced by ``process_html_for_vectorizer``;
      * 'processed_cnpjs' - list returned by ``extract_and_process_cnpjs`` on the raw HTML;
      * 'has_cnpj'        - True when that list is not empty;
      * 'count_prices'    - currency markers counted in the ``<body>`` text;
      * 'has_prices'      - True when 'count_prices' is greater than 1.

    Args:
        dataframe: pandas DataFrame with the columns 'domain' and 'html'.

    Returns:
        The cleaned DataFrame with the columns above. Its index is whatever
        ``check_integrity`` returned (it is not reset here).

    Raises:
        Exception: wraps any error raised by one of the steps.
    """
    try:
        dataframe = check_integrity(dataframe)

        lem_dict = build_lemmatizer_pt_dict()    
        # Body text is extracted once and reused by the price counter below.
        html_body = dataframe.loc[:,'html'].apply(get_html_body)    
        dataframe.loc[:, 'tokens'] = dataframe.loc[:, 'html'].apply(lambda x: process_html_for_vectorizer(x, lem_dict))
        # dataframe.loc[:, 'html_size'] = dataframe.loc[:, 'html'].apply(len)
        # dataframe.loc[:, 'qntd_tokens'] = dataframe.loc[:, 'tokens'].apply(len)
        # dataframe.loc[:, 'qntd_tokens_unicos'] = dataframe.loc[:, 'tokens'].apply(lambda x: len(set(x)))

        # dataframe.loc[:, 'autoreference_links'] = dataframe.apply(get_autoreference_links_from_html, axis=1)
        # dataframe.loc[:, 'qntd_autoreference_links'] = dataframe.loc[:, 'autoreference_links'].apply(len)

        # dataframe.loc[:, 'links'] = dataframe.apply(get_html_links, axis=1)
        # dataframe.loc[:, 'qntd_links'] = dataframe.loc[:, 'links'].apply(len)

        # CNPJs are searched in the raw HTML, not only in the body text.
        dataframe.loc[:, 'processed_cnpjs'] = dataframe.loc[:, 'html'].apply(extract_and_process_cnpjs)
        dataframe.loc[:, 'has_cnpj'] = dataframe.loc[:, 'processed_cnpjs'].apply(bool)

        dataframe.loc[:, 'count_prices'] = html_body.apply(process_html_for_how_many_prices)
        # #dataframe.loc[:, 'count_prices'] = dataframe.loc[:, 'prices'].apply(len)
        # NOTE(review): a single price marker yields has_prices == False ('> 1') — confirm the threshold.
        dataframe['has_prices'] = dataframe['count_prices'] > 1

        # dataframe.loc[:, 'count_values'] = html_body.apply(process_html_for_how_many_values)
        # #dataframe.loc[:, 'count_values'] = dataframe.loc[:, 'values'].apply(len)
        
        # aditional_columns = [
        #     ['true_ecommerce']
        # ]
        # df_features = get_features_dataframe(dataframe, aditional_columns)

        return dataframe#, df_features
    except Exception as e:
        raise Exception('An error occured while trying to generate features.\nError:\n' + str(e))

# Model Helper Functions

In [None]:
def model_report(score, confusion_matrix, classification_report, model_card):
    """Plot the confusion matrix, print the report and record the metrics.

    Args:
        score: accuracy shown in the plot title.
        confusion_matrix: 2x2 matrix drawn as a heatmap (rows = actual,
            columns = predicted, labelled non_ecom / ecom).
        classification_report: dict form of the classification report; the
            keys 'accuracy', 'macro avg', '0' and '1' are read.
        model_card: dict that receives the ``*_best`` entries (updated in place).

    Returns:
        The same ``model_card`` object.
    """
    class_names = ['non_ecom', 'ecom']

    # Confusion-matrix heatmap.
    plt.figure(figsize=(5, 5))
    sns.heatmap(
        confusion_matrix,
        annot=True,
        fmt="d",
        linewidths=.5,
        square=True,
        cmap='Blues',
        annot_kws={"size": 16},
        xticklabels=class_names,
        yticklabels=class_names,
    )

    plt.xticks(rotation='horizontal', fontsize=16)
    plt.yticks(rotation='horizontal', fontsize=16)
    plt.xlabel('Predicted Label', size=20)
    plt.ylabel('Actual Label', size=20)
    plt.title('Accuracy Score: {0:.4f}'.format(score), size=20)

    # Show the classification report, then the heatmap.
    pprint(classification_report)
    plt.show()

    model_card['accuracy_best'] = round(classification_report['accuracy'], 4)

    macro_avg = classification_report['macro avg']
    model_card['precision_macro_best'] = round(macro_avg['precision'], 4)
    model_card['recall_macro_best'] = round(macro_avg['recall'], 4)
    model_card['f1_macro_best'] = round(macro_avg['f1-score'], 4)

    model_card['support_0_best'] = classification_report['0']['support']
    model_card['support_1_best'] = classification_report['1']['support']

    return model_card

In [None]:
def cross_validate_report(cross_validate_results, model_card):
    """Print the per-fold metrics and pick the fold with the highest F1.

    Args:
        cross_validate_results: mapping with the array entries 'test_accuracy',
            'test_precision', 'test_recall', 'test_f1', 'fit_time' and
            'score_time', plus 'estimator' (one per fold) and 'indices'
            (with 'train' and 'test' sequences, one item per fold).
        model_card: dict that receives the ``*_mean`` entries (updated in place).

    Returns:
        tuple: (estimator of the best fold, {'train': ..., 'test': ...}
        indices of that fold, the same ``model_card`` object).
    """
    # (printed label, key in the results mapping, key written to the model card)
    summary = (
        ('accuracy:\t', 'test_accuracy', 'accuracy_mean'),
        ('precision:\t', 'test_precision', 'precision_mean'),
        ('recall:\t\t', 'test_recall', 'recall_mean'),
        ('f1:\t\t', 'test_f1', 'f1_mean'),
        ('fit_time:\t', 'fit_time', 'fit_time_mean'),
        ('score_time:\t', 'score_time', 'score_time_mean'),
    )

    # Metrics of every model produced by the cross-validation.
    for label, result_key, _ in summary:
        per_fold = cross_validate_results[result_key]
        print(label, per_fold, ' \tmean: ', per_fold.mean())

    # First fold that reaches the maximum F1.
    f1_per_fold = list(cross_validate_results['test_f1'])
    winner = f1_per_fold.index(max(f1_per_fold))

    best_estimator = cross_validate_results['estimator'][winner]
    fold_indices = cross_validate_results['indices']
    best_indices = {
        'train': fold_indices['train'][winner],
        'test': fold_indices['test'][winner],
    }

    for _, result_key, card_key in summary:
        model_card[card_key] = round(cross_validate_results[result_key].mean(), 4)

    return best_estimator, best_indices, model_card

In [None]:
def save_vectorizer_model(vectorizer, model, model_card):
    """Pickle the vectorizer and the model under names derived from the card.

    The file stem is built from the card entries 'scope', 'vectorizer',
    'model', 'word_reduction', 'kfold_splits', 'kfold_shuffle',
    'kfold_random_state', 'vectorizer_max_features' and 'dataset', joined
    with underscores. Both files are written to '../models/', which is
    created when it does not exist yet.

    Args:
        vectorizer: fitted vectorizer to persist.
        model: fitted estimator to persist.
        model_card: dict describing the run; 'vectorizer_file_name' and
            'model_file_name' are written to it (updated in place).

    Returns:
        The same ``model_card`` object. On failure the error and the card are
        printed and the card is still returned (best effort, as before).
    """
    models_dir = '../models'
    try:
        stringified_fields = (
            'word_reduction',
            'kfold_splits',
            'kfold_shuffle',
            'kfold_random_state',
            'vectorizer_max_features',
        )
        file_name = '_'.join(
            [model_card['scope'], model_card['vectorizer'], model_card['model']]
            + [str(model_card[field]) for field in stringified_fields]
            + [model_card['dataset']]
        )

        model_card['vectorizer_file_name'] = 'VECTORIZER_v1_' + file_name + '.pkl'
        model_card['model_file_name'] = 'MODEL_v1_' + file_name + '.pkl'

        # Without this, a missing directory made every save fail while the
        # card still listed the file names.
        os.makedirs(models_dir, exist_ok=True)

        with open(os.path.join(models_dir, model_card['vectorizer_file_name']), 'wb') as file:
            pickle.dump(vectorizer, file)
        with open(os.path.join(models_dir, model_card['model_file_name']), 'wb') as file:
            pickle.dump(model, file)

    except Exception as e:
        print('An error ocurred while trying to save the model. Error: ' + str(e))
        print(model_card)

    return model_card

In [None]:
# Template describing one experiment; copied per model and filled in by the report helpers.
model_card = {
    'scope': 'ecommerce',
    'vectorizer': '',
    'model': '',
    'word_reduction': 'custom_lemmatizer', 
    'kfold_splits': 3,
    'kfold_shuffle': True,
    'kfold_random_state': 42,
    'vectorizer_max_features': 1000, # None or an integer -> tested: None, 50, 100, 200, 500, 1000, 1500, 2000
    'dataset': 'training_data', 
    'accuracy_mean': '',
    'precision_mean': '',
    'recall_mean': '',
    'f1_mean': '',
    'fit_time_mean': '',
    'score_time_mean': '',
    'accuracy_best': '',
    'precision_macro_best': '',
    'recall_macro_best': '',
    'f1_macro_best': '',
    'support_0_best': '',
    'support_1_best': '',
    'vectorizer_file_name': '',
    'model_file_name': ''
}

# Metrics used by the cross-validation (precision/recall/f1 are macro-averaged).
scoring_metrics = {
    'accuracy': make_scorer(accuracy_score, normalize=True), 
    'precision': make_scorer(precision_score, average='macro', zero_division=0), 
    'recall': make_scorer(recall_score, average='macro'), 
    'f1': make_scorer(f1_score, average='macro')
    }

# CV splitter using StratifiedKFold, to keep the class proportions of the target in every fold.
stratified_kfold = StratifiedKFold(n_splits=model_card['kfold_splits'], shuffle=model_card['kfold_shuffle'], random_state=model_card['kfold_random_state'])

# Train and Evaluate Model

In [None]:
# Clean the training frame and add the tokens / CNPJ / price feature columns.
df = generate_features(df)

In [None]:
# Rows may have been dropped above; restore a 0..n-1 index so the positional
# fold indices and the column-wise concat with the TF-IDF frame line up.
df = df.reset_index(drop=True)

In [None]:
# Columns available after feature generation.
df.columns

In [None]:
# Preview the generated feature columns.
df.head(5)

In [None]:
# Rows left after the integrity checks.
len(df)

In [None]:
# Multinomial Naive Bayes with TF-IDF
mnb_tfidf_model_card = model_card.copy()
mnb_tfidf_model_card['vectorizer'] = 'tfidf_vectorizer'
mnb_tfidf_model_card['model'] = 'multinomial_nb'

# NOTE(review): the vectorizer is fitted on every row before cross-validation,
# so the test documents of each fold contribute to the vocabulary and IDF —
# consider fitting it inside a Pipeline if leak-free CV scores are needed.
cv_tfidf_vectorizer = TfidfVectorizer(max_features=model_card['vectorizer_max_features'])
# Tokens are already cleaned; join them back into one string per document.
token_strings = [' '.join(doc) for doc in df['tokens']]
tfidf_matrix = cv_tfidf_vectorizer.fit_transform(token_strings)

In [None]:
# Vocabulary terms with their IDF weight, rarest terms first.
# NOTE(review): despite the variable name this is the vectorizer's IDF, not a
# model-derived importance.
idf_values = cv_tfidf_vectorizer.idf_
feature_names = cv_tfidf_vectorizer.get_feature_names_out()
feature_importances = pd.DataFrame({'feature': feature_names, 'idf': idf_values})
feature_importances = feature_importances.sort_values(by='idf', ascending=False)
feature_importances

In [None]:
# features = ['html_size', 'qntd_tokens', 'qntd_tokens_unicos', 'qntd_autoreference_links','qntd_links', 'has_cnpj', 'count_prices', 'count_values', 'has_prices']
features = ['has_cnpj', 'has_prices']

# Convert the TF-IDF matrix into a pandas DataFrame (one column per term)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=cv_tfidf_vectorizer.get_feature_names_out())

# Concatenate the hand-made features with the TF-IDF columns.
# The concat aligns on the index, so df must have a default 0..n-1 index here.
other_features = df[features]
features_df = pd.concat([other_features, tfidf_df], axis=1)
df_y = df['true_ecommerce']

In [None]:
# Sanity check: the next three lengths must match for the concat to be row-aligned.
len(df)

In [None]:
# Row count of the hand-made features.
len(other_features)

In [None]:
# Row count of the TF-IDF frame.
len(tfidf_df)

In [None]:
# Final design matrix: hand-made features followed by the TF-IDF columns.
features_df

In [None]:
from sklearn.naive_bayes import MultinomialNB

# lgrg_tfidf = LogisticRegression(solver='liblinear')
clf = MultinomialNB()
# lgrg_tfidf = LogisticRegression(solver='lbfgs') # lbfgs is better, but it needed a very high max_iter and was not converging. Test the hyperparameters.
# Cross-validation; the fitted per-fold estimators and the fold indices are
# returned so the best fold can be re-scored afterwards.
cv_results = cross_validate(estimator=clf, X=features_df, y=df_y, 
                            cv=stratified_kfold, scoring=scoring_metrics,
                            return_estimator=True, return_indices=True
                            )

In [None]:
# Cross-validation report: prints the per-fold metrics, stores their means in
# the card and returns the estimator and indices of the fold with the best F1.
best_estimator, best_indices, mnb_tfidf_model_card = cross_validate_report(cv_results, mnb_tfidf_model_card)

In [None]:
import numpy as np

# Probability of class 1 at or above which a site is labelled e-commerce.
THRESHOLD = 0.5

# Re-score the best fold on its own held-out rows.
best_indices_test_X = features_df.iloc[best_indices['test']]
best_indices_test_Y = df_y.iloc[best_indices['test']]

# predict_proba returns one column per class; column 1 is read as the
# positive (e-commerce) class.
model_predictions_prob = best_estimator.predict_proba(best_indices_test_X)
y_probs_0 = model_predictions_prob[:, 0].tolist()
y_probs_1 = model_predictions_prob[:, 1].tolist()
best_estimator_predictions = (model_predictions_prob[:, 1] >= THRESHOLD).astype(int)

# best_estimator_score = best_estimator.score(X=best_indices_test_X, y=best_indices_test_Y)
best_estimator_score = accuracy_score(y_true=best_indices_test_Y, y_pred=best_estimator_predictions)
best_estimator_score_f1 = f1_score(y_true=best_indices_test_Y, y_pred=best_estimator_predictions)

best_estimator_cmatrix = confusion_matrix(y_pred=best_estimator_predictions, y_true=best_indices_test_Y)
best_estimator_creport = classification_report(y_pred=best_estimator_predictions, y_true=best_indices_test_Y, zero_division=0, output_dict=True)

mnb_tfidf_model_card = model_report(best_estimator_score, best_estimator_cmatrix, best_estimator_creport, mnb_tfidf_model_card)

# Save the fitted estimator of the best fold. `clf` is only the template that
# was handed to cross_validate and has not been fitted at this point, so
# pickling it would store an unusable model.
mnb_tfidf_model_card = save_vectorizer_model(cv_tfidf_vectorizer, best_estimator, mnb_tfidf_model_card)

print(f"F1-Score: {round(best_estimator_score_f1, 4)}")
print(mnb_tfidf_model_card)


In [None]:
# Raw confusion-matrix counts of the best fold (rows = actual, columns = predicted).
pprint(best_estimator_cmatrix)

## Comparing with the current model

In [None]:
import pickle

last_model_path = "../models/MODEL_ecommerce_tfidf_vectorizer_mnb_custom_lemmatizer_3_True_42_1000_spiderwebv4_dataset_html.pkl"
last_vectorizer_path = "../models/VECTORIZER_ecommerce_tfidf_vectorizer_mnb_custom_lemmatizer_3_True_42_1000_spiderwebv4_dataset_html.pkl"

# NOTE: unpickling runs code stored in the file; only load artefacts that were
# produced by this project.
with open(last_vectorizer_path, "rb") as f:
    last_version_vectorizer = pickle.load(f)

with open(last_model_path, "rb") as f:
    last_version_model = pickle.load(f)

# Everything derived from the previous vectorizer lives under its own
# `last_version_*` name. Reusing `tfidf_matrix` / `tfidf_df` / `features_df`
# here would replace the new model's design matrix, and later cells would then
# train on the old feature space while saving the new vectorizer.
last_version_token_strings = [' '.join(doc) for doc in df['tokens']]
last_version_tfidf_matrix = last_version_vectorizer.transform(last_version_token_strings)

last_version_feature_names = last_version_vectorizer.get_feature_names_out()
last_version_idf = pd.DataFrame({'feature': last_version_feature_names, 'idf': last_version_vectorizer.idf_})
last_version_idf = last_version_idf.sort_values(by='idf', ascending=False)

# features = ['html_size', 'qntd_tokens', 'qntd_tokens_unicos', 'qntd_autoreference_links','qntd_links', 'has_cnpj', 'count_prices', 'count_values', 'has_prices']
features = ['has_cnpj', 'has_prices']

# Convert the TF-IDF matrix into a pandas DataFrame and append it to the
# hand-made features (index-aligned, so df needs a default 0..n-1 index).
last_version_tfidf_df = pd.DataFrame(last_version_tfidf_matrix.toarray(), columns=last_version_feature_names)
last_version_features_df = pd.concat([df[features], last_version_tfidf_df], axis=1)

# Score the previous model on the same held-out rows as the best new fold.
last_version_test_X = last_version_features_df.iloc[best_indices['test']]
best_indices_test_Y = df['true_ecommerce'].iloc[best_indices['test']]

current_model_predictions_prob = last_version_model.predict_proba(last_version_test_X)
current_estimator_predictions = (current_model_predictions_prob[:, 1] >= THRESHOLD).astype(int)

current_estimator_score = accuracy_score(y_true=best_indices_test_Y, y_pred=current_estimator_predictions)
current_estimator_score_f1 = f1_score(y_true=best_indices_test_Y, y_pred=current_estimator_predictions)

current_estimator_cmatrix = confusion_matrix(y_pred=current_estimator_predictions, y_true=best_indices_test_Y)
current_estimator_creport = classification_report(y_pred=current_estimator_predictions, y_true=best_indices_test_Y, zero_division=0, output_dict=True)

print(f"F1-Score: {round(current_estimator_score_f1, 4)}")

# Same heatmap + report as for the new model; the metrics go into a throwaway
# card so the new model's card is not touched.
model_report(current_estimator_score, current_estimator_cmatrix, current_estimator_creport, {})


In [None]:
# Raw confusion-matrix counts of the previous model on the same rows.
pprint(current_estimator_cmatrix)

In [None]:
# False positives of the new model on the best fold: labelled non e-commerce (0) but predicted 1.
# NOTE(review): the index labels of best_indices_test_Y are used as positions
# for iloc — this only matches because df has a default 0..n-1 index.
analysis_df = df.iloc[best_indices_test_Y.index.tolist()].copy()
analysis_df["prediction"] = best_estimator_predictions.tolist()
analysis_df = analysis_df[["domain", "html", "tokens", "true_ecommerce", "prediction"]]
analysis_df = analysis_df[(analysis_df["true_ecommerce"] == 0) & (analysis_df["prediction"] == 1)]
analysis_df

# Train full model

In [38]:
# Rebuild the design matrix from the vectorizer that is saved with the model.
# Relying on whatever `features_df` currently holds is unsafe: the comparison
# section reassigns it using the previous vectorizer, which would pair this
# model with a vocabulary it was not trained on.
full_tfidf_matrix = cv_tfidf_vectorizer.transform([' '.join(doc) for doc in df['tokens']])
full_tfidf_df = pd.DataFrame(full_tfidf_matrix.toarray(), columns=cv_tfidf_vectorizer.get_feature_names_out())
full_features_df = pd.concat([df[['has_cnpj', 'has_prices']], full_tfidf_df], axis=1)

# Final model: fitted on every labelled row, then saved next to its vectorizer.
clf = MultinomialNB()
clf.fit(full_features_df, df['true_ecommerce'])
mnb_tfidf_model_card = save_vectorizer_model(cv_tfidf_vectorizer, clf, mnb_tfidf_model_card)

# Test on validated data

In [None]:
# Load the sample to score with the freshly trained model (this replaces the training frame in `df`).
# NOTE(review): generate_features -> check_integrity keeps only domains ending
# in '.br' — confirm this sample is expected to pass that filter.
df = pd.read_csv('../data/countries_predictions_sample.csv')

In [None]:
# Keep only the two columns required by generate_features.
df = df[['domain','html']]

In [None]:
# generate_features can drop rows and keeps the original index labels. Reset
# the index so the column-wise concat with the TF-IDF frame (which has a
# 0..n-1 index) and the prediction lists assigned later match row for row.
df = generate_features(df).reset_index(drop=True)

In [None]:
# Vectorize with the already-fitted vectorizer (transform only, no refit).
token_strings = [' '.join(doc) for doc in df['tokens']]
tfidf_matrix = cv_tfidf_vectorizer.transform(token_strings)

In [None]:
features = ['has_cnpj', 'has_prices']

# Convert the TF-IDF matrix into a pandas DataFrame (one column per term)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=cv_tfidf_vectorizer.get_feature_names_out())

# Concatenate the hand-made features with the TF-IDF columns.
# NOTE(review): the concat aligns on the index; df must have a default
# 0..n-1 index here or rows will be mismatched and padded with NaN.
other_features = df[features]
features_df = pd.concat([other_features, tfidf_df], axis=1)

In [None]:
# Preview the design matrix that will be scored.
features_df.head()

In [None]:
def predict_proba_with_domain(X, estimator, vectorizer, threshold=0.5):
    """Score *X* with *estimator* and threshold the second probability column.

    Args:
        X: feature matrix accepted by ``estimator.predict_proba``.
        estimator: object exposing ``predict_proba`` that yields two values per row.
        vectorizer: not used by this function; kept in the signature for callers.
        threshold: minimum second-column probability for a prediction of 1.

    Returns:
        tuple: (list of 0/1 predictions, list of first-column probabilities,
        list of second-column probabilities).
    """
    scores = estimator.predict_proba(X)

    # Transpose the (row, 2) scores into one list per column.
    first_column, second_column = (list(column) for column in zip(*scores))
    labels = [int(probability >= threshold) for probability in second_column]

    return labels, first_column, second_column

In [None]:
# Score the sample with the full model, using the same THRESHOLD as in the evaluation.
y_preds, y_probs_0, y_probs_1 = predict_proba_with_domain(features_df, clf, cv_tfidf_vectorizer, threshold=THRESHOLD)

In [None]:
# Attach the probabilities and the thresholded label to each row
# (the lists are assigned positionally, so they must have one entry per row of df).
df['pred_0_prob'] = y_probs_0
df['pred_1_prob'] = y_probs_1
df['prediction'] = y_preds

In [None]:
# df.to_csv('../data/countries_predictions_sample_2.csv')