In [1]:
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
import re
import os
import unicodedata
import pickle
from pprint import pprint

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, RocCurveDisplay
from sklearn.model_selection import train_test_split

In [None]:
# Load the validated sample (provides the true_prediction / notes columns used
# below) and the sample carrying the model's predictions.
df_validated = pd.read_csv('../data/countries_predictions_sample_simple_validated.csv')
df = pd.read_csv('../data/countries_predictions_sample_2.csv')

In [27]:
# Keep only the join key (domain) and the two validation columns.
df_validated = df_validated[['domain','true_prediction','notes']]

In [28]:
# Attach the validation columns to the predictions by domain.
# merge() defaults to an inner join, so domains missing from either frame are dropped.
df = df.merge(df_validated, on=['domain'])

In [29]:
# Number of rows left after the merge.
len(df)

96

In [30]:
# Preview the first rows of the merged frame.
df.head()

Unnamed: 0.1,Unnamed: 0,domain,html,tokens,processed_cnpjs,has_cnpj,count_prices,has_prices,pred_0_prob,pred_1_prob,prediction,true_prediction,notes
0,0,corpoeenergia.com.br,<!DOCTYPE html>\n<!--[if lt IE 7]> <html ...,"['java', 'script', 'parecer', 'desabilitar', '...",['17155356000105'],True,49,True,0.00017,0.99983,1,True,
1,1,donnalimao.com.br,"<!doctype html>\n<html lang=""pt-br"" data-tray-...","['menu', 'vendido', 'lancamentos', 'conjunto',...",[],False,51,True,0.001667,0.998333,1,True,
2,2,insumosdailha.com.br,"<!DOCTYPE html><html lang=""pt-BR"" xmlns:og=""ht...","['fretar', 'gratis', 'brindar', 'exclusivo', '...",['47542679000104'],True,31,True,0.000566,0.999434,1,True,
3,3,primelive.com.br,"<!doctype html>\n<html lang=""pt-br"" class=""pag...","['primar', 'live', 'ajudar', 'suportar', 'cont...",['03897520000137'],True,49,True,0.000266,0.999734,1,True,
4,4,santodesejosexshop.com.br,"<!doctype html>\n<html lang=""pt-BR"">\n<head>\n...","['home', 'contar', 'carro', 'politicar', 'troc...",[],False,27,True,0.003864,0.996136,1,,


In [46]:
# Discard rows that have no validated label; a missing true_prediction would
# otherwise compare as "different" from every prediction.
df = df.dropna(subset=['true_prediction'])

In [31]:
# Rows where the model's prediction disagrees with the validated label.
# A missing true_prediction compares as unequal, so unlabeled rows are included
# here unless they were dropped beforehand.
# .copy() makes df_wrong an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning or depend on view/copy semantics.
df_wrong = df.loc[df['prediction'] != df['true_prediction']].copy()
df_wrong

Unnamed: 0.1,Unnamed: 0,domain,html,tokens,processed_cnpjs,has_cnpj,count_prices,has_prices,pred_0_prob,pred_1_prob,prediction,true_prediction,notes
4,4,santodesejosexshop.com.br,"<!doctype html>\n<html lang=""pt-BR"">\n<head>\n...","['home', 'contar', 'carro', 'politicar', 'troc...",[],False,27,True,0.003864,0.996136,1,,
12,12,santapaciencia.com.br,<!-- This page is cached by the Hummingbird Pe...,"['skip', 'content', 'facebook', 'instagram', '...",['31381211000145'],True,14,True,0.002818,0.997182,1,,
16,16,programapecamais.com.br,"\r\n\r\n<!doctype html>\n<html lang=""pt-BR"" cl...","['paginar', 'inicial', 'catalogar', 'produto',...",[],False,0,False,0.28562,0.71438,1,False,Login page
25,25,ladyfetish.com.br,"<!DOCTYPE html>\n<html lang=""pt"">\n<head>\n \...","['top', 'page', 'iniciar', 'loja', 'contato', ...",[],False,38,True,0.007775,0.992225,1,,
34,34,postoeuropa.com.br,"<!doctype html >\n<html lang=""en"" id=""estilo_p...","['please', 'enable', 'java', 'script', 'contin...",[],False,0,False,0.458468,0.541532,1,False,Login page
40,40,santaclaramoveis.com.br,"<!DOCTYPE html>\r\n<html class=""no-touch"" lang...","['produto', 'armarios', 'bercos', 'cadeira', '...",[],False,0,False,0.479948,0.520052,1,False,"Não é ecomm, mas tem produtos, permite carrinh..."
42,42,adecomdistribuidora.com.br,<!DOCTYPE html>\n<!--[if IE 9 ]> <html class...,"['adecom', 'ruir', 'anne', 'frank', 'hauer', '...",[],False,0,False,0.404961,0.595039,1,False,"Não é ecomm, mas tem produtos, permite carrinh..."
48,48,cafeparacriativos.com.br,"<!DOCTYPE html>\n<html lang=""pt-BR"" class=""no-...","['iniciar', 'sobrar', 'produto', 'loja', 'blog...",[],False,0,False,0.891333,0.108667,0,True,
54,54,metropoleimoveisata.com.br,"<!DOCTYPE html>\r\n<html lang=""pt-br"">\r\n<hea...","['barao', 'triunfar', 'aracatuba', 'contato', ...",[],False,23,True,0.114849,0.885151,1,False,
58,58,acrerifas.com.br,"<!DOCTYPE html><html lang=""pt-BR""><head><scrip...","['acrerifas', 'oficial', 'acre', 'premiacoes',...",[],False,1,False,0.7893,0.2107,0,,


In [None]:
import html
import re

def clean_html_string(html_str):
    """Return ``html_str`` with HTML entities decoded and whitespace flattened.

    Args:
        html_str: Text that may contain HTML character references
            (e.g. ``&amp;``) and line breaks or tabs.

    Returns:
        The decoded text on a single line: carriage returns, newlines and tabs
        become spaces, runs of whitespace collapse to one space, and leading /
        trailing whitespace is removed.
    """
    # Decode entities first so whitespace they expand to is normalized as well.
    text = html.unescape(html_str)

    # Turn line breaks and tabs into plain spaces.
    text = re.sub(r'[\r\n\t]', ' ', text)

    # Collapse consecutive whitespace and trim both ends.
    return re.sub(r'\s+', ' ', text).strip()

# Clean the stored HTML of one misclassified domain for manual inspection
# (alternative sample: 'programapecamais.com.br').
clean_html_string(df.loc[df['domain'] == 'postoeuropa.com.br', 'html'].iloc[0])

# Build the model features for the wrongly predicted rows to inspect the model's reasoning

In [None]:
# Preprocessing setup.
# Download the NLTK resources first: stopwords.words() raises LookupError on a
# fresh environment when the corpus has not been fetched yet.
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

# Combined Portuguese + English stop-word set, and the lemmatizer, both used by
# custom_tokenizer.
STOP_WORDS = set(stopwords.words('portuguese')) | set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def custom_tokenizer(html_text):
    """Convert a raw HTML document into a list of lemmatized word tokens.

    Pipeline:
        1. Parse the HTML and take the text of the body element
           (empty string when the document has no body).
        2. Strip accents (NFKD normalization, then drop non-ASCII bytes).
        3. Lowercase, replace every non-letter / non-whitespace character with
           a space, collapse whitespace.
        4. Tokenize with NLTK, drop stop words and tokens of 2 characters or
           fewer, then lemmatize what is left.

    Args:
        html_text: HTML source of a page.

    Returns:
        List of tokens. If any step raises, the error is printed and an empty
        list is returned instead.

    NOTE(review): presumably this is the tokenizer the pickled vectorizer was
    fitted with, so its behavior should not drift — confirm.
    """
    try:
        # Only the body element's text is used.
        body = BeautifulSoup(html_text, "html.parser").body
        raw_text = body.get_text() if body else ""

        # Remove accents by decomposing characters and discarding non-ASCII parts.
        ascii_text = unicodedata.normalize("NFKD", raw_text).encode("ASCII", "ignore").decode("utf-8")

        # Lowercase, keep letters and whitespace only, then squeeze whitespace.
        letters_only = re.sub(r"[^a-zA-Z\s]", " ", ascii_text.lower())
        collapsed = re.sub(r"\s+", " ", letters_only).strip()

        # Stop words and short tokens are filtered before lemmatization.
        return [
            lemmatizer.lemmatize(word)
            for word in nltk.word_tokenize(collapsed)
            if word not in STOP_WORDS and len(word) > 2
        ]

    except Exception as err:
        print(err)
        print("Failed on custom_tokenizer, passing...")
        return []

In [None]:
# Load the pickled classifier and its pickled vectorizer.
# Context managers guarantee the files are closed even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code — only load artifacts
# that come from a trusted source.
# NOTE(review): the vectorizer presumably references custom_tokenizer, so that
# function must already be defined in this session before unpickling — confirm.
with open('../models/BEST_MODEL_ecomm_tfidf_vectorizer_ecomm_logistic_regression_lbfgs_lemmatizer_3_True_42_1000_ecomm_spiderwebv4_dataset_html.pkl', "rb") as serialized_model:
    model = pickle.load(serialized_model)

with open('../models/BEST_VECTORIZER_ecomm_tfidf_vectorizer_ecomm_logistic_regression_lbfgs_lemmatizer_3_True_42_1000_ecomm_spiderwebv4_dataset_html.pkl', "rb") as serialized_vectorizer:
    vectorizer = pickle.load(serialized_vectorizer)

In [None]:
# Vectorize the HTML of the misclassified rows with the loaded vectorizer
# (one matrix row per row of df_wrong).
vectorized_html = vectorizer.transform(df_wrong['html'])

In [32]:
# Number of misclassified rows.
len(df_wrong)

13

In [None]:
# Row count of the vectorized matrix; should equal len(df_wrong).
vectorized_html.shape[0]

In [None]:
# Tokenize the HTML of every misclassified row.
# assign() returns a new frame instead of writing onto a filtered slice of df,
# which avoids SettingWithCopyWarning.
df_wrong = df_wrong.assign(html_tokens=df_wrong['html'].apply(custom_tokenizer))

In [None]:
# For each misclassified row, collect the vocabulary terms that have a non-zero
# TF-IDF weight, together with each term's IDF, ordered from the highest to the
# lowest TF-IDF weight.
idf_values = vectorizer.idf_
feature_names = vectorizer.get_feature_names_out()

vectors = []
for row in vectorized_html.toarray():
    term_weights = {
        feature_names[col_idx]: {'tfidf': tfidf_value, 'idf': idf_values[col_idx]}
        for col_idx, tfidf_value in enumerate(row)
        if tfidf_value != 0.0
    }
    # sorted() is stable, so terms with equal weight keep vocabulary order.
    ranked_terms = dict(
        sorted(term_weights.items(), key=lambda item: item[1]['tfidf'], reverse=True)
    )
    vectors.append(ranked_terms)

# Attach the per-row dictionaries; assign() returns a new frame instead of
# writing onto a filtered slice of df (avoids SettingWithCopyWarning).
df_wrong = df_wrong.assign(vectors=vectors)

In [None]:
# Inspect the misclassified rows.
df_wrong

In [33]:
# Inspect the tokens and ranked TF-IDF terms of one misclassified domain.
# Requires the html_tokens and vectors columns to exist on df_wrong.
domains = df_wrong['domain'].tolist()
selected_domain = domains[2]
print(selected_domain)

# Build the mask from df_wrong itself; a mask taken from another frame only
# works through implicit index alignment.
selected_rows = df_wrong.loc[df_wrong['domain'] == selected_domain]
pprint(selected_rows['html_tokens'].tolist())
(selected_rows['vectors'].values)[0]

programapecamais.com.br


KeyError: 'html_tokens'

In [35]:
# Show the merged frame (pandas truncates the rendered rows).
df

Unnamed: 0.1,Unnamed: 0,domain,html,tokens,processed_cnpjs,has_cnpj,count_prices,has_prices,pred_0_prob,pred_1_prob,prediction,true_prediction,notes
0,0,corpoeenergia.com.br,<!DOCTYPE html>\n<!--[if lt IE 7]> <html ...,"['java', 'script', 'parecer', 'desabilitar', '...",['17155356000105'],True,49,True,0.000170,0.999830,1,True,
1,1,donnalimao.com.br,"<!doctype html>\n<html lang=""pt-br"" data-tray-...","['menu', 'vendido', 'lancamentos', 'conjunto',...",[],False,51,True,0.001667,0.998333,1,True,
2,2,insumosdailha.com.br,"<!DOCTYPE html><html lang=""pt-BR"" xmlns:og=""ht...","['fretar', 'gratis', 'brindar', 'exclusivo', '...",['47542679000104'],True,31,True,0.000566,0.999434,1,True,
3,3,primelive.com.br,"<!doctype html>\n<html lang=""pt-br"" class=""pag...","['primar', 'live', 'ajudar', 'suportar', 'cont...",['03897520000137'],True,49,True,0.000266,0.999734,1,True,
4,4,santodesejosexshop.com.br,"<!doctype html>\n<html lang=""pt-BR"">\n<head>\n...","['home', 'contar', 'carro', 'politicar', 'troc...",[],False,27,True,0.003864,0.996136,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,91,roccianera.com.br,<!DOCTYPE html>\n<!--[if lt IE 7]> <html ...,"['empresar', 'material', 'galeria', 'cliente',...",[],False,0,False,0.901739,0.098261,0,False,
92,92,eblog.com.br,<!DOCTYPE html>\n<html class='ltr' dir='ltr' x...,"['home', 'sobrar', 'apoiar', 'pequeno', 'negoc...",[],False,6,True,0.975012,0.024988,0,False,
93,93,blogtherainbow.blogspot.com.br,<!DOCTYPE html>\n<html class='v2' dir='ltr' la...,"['sabado', 'outubro', 'diy', 'cabeceira', 'cam...",[],False,0,False,0.999544,0.000456,0,False,
94,94,netfilmesmega.blogspot.com.br,<!DOCTYPE html>\n<html class='v2' dir='ltr' xm...,"['paginar', 'paginar', 'inicial', 'filme', 'se...",[],False,0,False,0.998956,0.001044,0,False,


In [43]:
# Rows whose class-1 probability is at or above the cut-off.
# The cut-off is defined in this cell so it runs on a fresh kernel and does not
# collide with the builtin `range`.
# NOTE(review): 0.5 matches the cut-off used for the split further down —
# adjust to the confidence band being inspected.
min_prob = 0.5
df.loc[df['pred_1_prob'] >= min_prob]

Unnamed: 0.1,Unnamed: 0,domain,html,tokens,processed_cnpjs,has_cnpj,count_prices,has_prices,pred_0_prob,pred_1_prob,prediction,true_prediction,notes
0,0,corpoeenergia.com.br,<!DOCTYPE html>\n<!--[if lt IE 7]> <html ...,"['java', 'script', 'parecer', 'desabilitar', '...",['17155356000105'],True,49,True,0.00017,0.99983,1,True,
1,1,donnalimao.com.br,"<!doctype html>\n<html lang=""pt-br"" data-tray-...","['menu', 'vendido', 'lancamentos', 'conjunto',...",[],False,51,True,0.001667,0.998333,1,True,
2,2,insumosdailha.com.br,"<!DOCTYPE html><html lang=""pt-BR"" xmlns:og=""ht...","['fretar', 'gratis', 'brindar', 'exclusivo', '...",['47542679000104'],True,31,True,0.000566,0.999434,1,True,
3,3,primelive.com.br,"<!doctype html>\n<html lang=""pt-br"" class=""pag...","['primar', 'live', 'ajudar', 'suportar', 'cont...",['03897520000137'],True,49,True,0.000266,0.999734,1,True,
4,4,santodesejosexshop.com.br,"<!doctype html>\n<html lang=""pt-BR"">\n<head>\n...","['home', 'contar', 'carro', 'politicar', 'troc...",[],False,27,True,0.003864,0.996136,1,,
5,5,pedrinhosports.com.br,"<!doctype html>\n<html lang=""pt-br"">\n<head>\n...","['social', 'instagram', 'lista', 'desejo', 'pe...",['03820484000103'],True,1,False,0.003672,0.996328,1,True,
6,6,adaptogen.com.br,"<!DOCTYPE html>\n<html class=""no-js"" lang=""pt-...","['parcelar', 'atar', 'juro', 'cartao', 'credit...",['02844238000129'],True,48,True,0.000586,0.999414,1,True,
7,7,shoptalkmoda.com.br,"<!DOCTYPE html><html lang=""pt-BR""><head><meta ...","['lancamentos', 'acessorios', 'vestuario', 'bl...",['24158860000116'],True,14,True,0.000273,0.999727,1,True,
8,8,pontodaeletronica.com.br,<!DOCTYPE html>\n<!--[if lt IE 7]> <html ...,"['java', 'script', 'parecer', 'desabilitar', '...",['05701783000154'],True,43,True,0.001064,0.998936,1,True,
9,9,revivcestasbasicas.com.br,<!DOCTYPE html>\r\n<html data-pnotify-firstpos...,"['contato', 'entrar', 'menu', 'entrar', 'compa...",['00975860000140'],True,8,True,0.006269,0.993731,1,True,


In [52]:
# Cut-off on the class-1 probability. Named `threshold` rather than `range` so
# the builtin is not shadowed for the rest of the session.
threshold = 0.5

# Filter once, then report how many rows fall below the cut-off and how their
# validated labels are distributed.
below_threshold = df.loc[df['pred_1_prob'] < threshold]
print(len(below_threshold))
below_threshold.value_counts('true_prediction')

50


true_prediction
False    49
True      1
Name: count, dtype: int64

# Confidence Report -> Range (correct/total)
## 0.9 - 1.0
27/27 -> 100%
## 0.8 - 1.0
28/29 -> 96.6%
## 0.7 - 1.0
29/31 -> 93.5%
## 0.6 - 1.0
33/35 -> 94.3%
## 0.5 - 1.0
34/40 -> 85.0%
## 0.0 - 0.5
49/50 -> 98.0%

# Confidence Report -> Range (correct/total)
## 0.9 - 1.0
9/9 -> 100%
## 0.8 - 1.0
17/18 -> 94.4%
## 0.7 - 1.0
25/27 -> 92.6%
## 0.6 - 1.0
31/37 -> 83.8%
## 0.5 - 1.0
35/47 -> 74.5%
## 0.0 - 0.5
43/43 -> 100%

In [None]:
# Table of vocabulary terms with their IDF weight, highest IDF first.
# Relies on feature_names having been computed from the vectorizer earlier.
idf_values = vectorizer.idf_
feature_importances = (
    pd.DataFrame({'feature': feature_names, 'idf': idf_values})
    .sort_values(by='idf', ascending=False)
)
feature_importances

In [None]:
# The 60 terms with the highest IDF (the table is sorted in descending IDF order).
feature_importances.head(60)