In [None]:
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
import re
import os
import unicodedata
import pickle
import numpy as np
from pprint import pprint

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, f1_score

pd.options.mode.chained_assignment = None
pd.options.display.max_columns = None

In [None]:
# Load the labelled dataset (one row per scraped page) from parquet.
df = pd.read_parquet("../data/new_iugu_saas2_with_html.parquet", engine="pyarrow")
# Force the HTML column to plain strings before any text processing.
df["html"] = df["html"].astype(str)
df.head(5)

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Number of distinct hosts in the raw frame.
df['host'].nunique()

In [None]:
# Class balance of the target column.
df['Segmento iugu'].value_counts()

In [None]:
# df['Nicho Tech'].value_counts()

In [None]:
# Missing values per column.
# NOTE(review): 'html' was already cast to str above, so missing HTML is presumably
# a string by now and would not be counted here - confirm.
df.isnull().sum()

In [None]:
# Keep only the columns used downstream and give the target a shorter name.
df = df[["host", "url", "html", "Segmento iugu"]]
df = df.rename(columns={"Segmento iugu": "segment"})
df.head()

# Feature Engineering

In [None]:
def check_integrity(dataframe):
    """Validate a scraped-pages frame and drop the rows that cannot be used.

    The caller's frame is never modified; all work happens on a copy.
    Steps, in order:

    1. Require the columns ``host``, ``html`` and ``url``.
    2. Drop rows with a null ``host``, ``url`` or ``html``. This runs *before*
       the string cast, because casting turns a missing value into ordinary
       text that a null check can no longer see.
    3. Cast ``html`` to ``str`` and drop rows whose HTML is ``''`` or ``'[]'``.
    4. Keep only the first row for each ``host``.

    A warning is printed for every step that removes rows. The original row
    index is preserved (no reset).

    Args:
        dataframe: pandas DataFrame with at least ``host``, ``html``, ``url``.

    Returns:
        A new, filtered DataFrame.

    Raises:
        Exception: if a required column is missing or any step fails; the
            underlying error is chained as ``__cause__``.
    """
    required_columns = ['host', 'html', 'url']
    try:
        if not all(column in dataframe.columns for column in required_columns):
            raise Exception('Missing required columns. Columns expected:\n' + str(required_columns))

        # Never write into the frame the caller handed us.
        dataframe = dataframe.copy()

        # Null checks must precede astype(str); see step 2 in the docstring.
        for column in ('host', 'url', 'html'):
            nulls = dataframe[column].isnull().sum()
            if nulls > 0:
                print(f"WARNING: column '{column}' has {nulls} empty values. Removing those entries.")
                dataframe = dataframe.dropna(subset=[column])

        dataframe = dataframe.assign(html=dataframe['html'].astype(str))

        non_empty = dataframe[(dataframe['html'] != '[]') & (dataframe['html'] != '')]
        removed = len(dataframe) - len(non_empty)
        if removed:
            print(f"WARNING: dataframe has {removed} entries with empty HTML. Removing those entries.")
            dataframe = non_empty

        # De-duplicate last so a host is not lost just because its first row was unusable.
        deduplicated = dataframe.drop_duplicates(subset=["host"])
        removed = len(dataframe) - len(deduplicated)
        if removed:
            print(f"WARNING: dataframe has {removed} entries with duplicates values. Removing those entries.")
            dataframe = deduplicated

        return dataframe
    except Exception as e:
        raise Exception('Failed in integrity check.\nError:\n' + str(e)) from e

In [None]:
def build_lemmatizer_pt_dict():
    """Build a ``{inflected_form: lemma}`` dictionary for Portuguese.

    The word list is read from ``lemmatization-pt.txt`` in the current working
    directory. If the file is not there it is downloaded once and kept, so
    later calls reuse it instead of hitting the network again. Each usable
    line holds exactly two whitespace-separated fields, ``lemma form``; any
    other line is ignored.

    Returns:
        dict mapping each inflected form to its lemma.

    Raises:
        Exception: if the download or the parsing fails. The local file is
            removed in that case so a broken download is never reused; the
            underlying error is chained as ``__cause__``.
    """
    import os

    url = "https://github.com/michmech/lemmatization-lists/raw/master/lemmatization-pt.txt"
    file_name = "lemmatization-pt.txt"

    try:
        # Download only when there is no local copy yet.
        if not os.path.exists(file_name):
            # Imported lazily: the network stack is only needed on a cache miss.
            import requests

            response = requests.get(url, timeout=30)
            # Fail loudly instead of caching an HTTP error page as the word list.
            response.raise_for_status()
            with open(file_name, 'wb') as f:
                f.write(response.content)

        # Parse the file. Explicit encoding: the platform default may not be
        # UTF-8, and 'utf-8-sig' also drops a leading BOM if one is present.
        lemmatizer_pt_dict = {}
        with open(file_name, 'r', encoding='utf-8-sig') as dic:
            for line in dic:
                txt = line.split()
                if len(txt) == 2:
                    lemmatizer_pt_dict[txt[1]] = txt[0]

        return lemmatizer_pt_dict
    except Exception as e:
        # Drop a possibly corrupt file so the next call downloads a fresh one.
        if os.path.exists(file_name):
            os.remove(file_name)
        raise Exception('An error occurred on build_lemmatizer_pt_dict.\nError:\n' + str(e)) from e


In [None]:
def custom_lemmatizer(tokens, lemmatizer_pt_dict):
    """Lemmatize ``tokens``, preferring the Portuguese dictionary.

    A token that is a key of ``lemmatizer_pt_dict`` is replaced by the mapped
    value; every other token goes through NLTK's ``WordNetLemmatizer``.

    Args:
        tokens: iterable of token strings.
        lemmatizer_pt_dict: mapping ``inflected form -> lemma``.

    Returns:
        list with one lemmatized entry per input token, in the same order.

    Raises:
        Exception: wrapping any error raised during lemmatization.
    """
    try:
        from nltk.stem.wordnet import WordNetLemmatizer

        wordnet = WordNetLemmatizer()
        return [
            lemmatizer_pt_dict.get(word)
            if word in lemmatizer_pt_dict.keys()
            else wordnet.lemmatize(word)
            for word in tokens
        ]
    except Exception as e:
        raise Exception('An error occurred on custom_lemmatizer.\nError:\n' + str(e))

In [None]:
# NLTK resources needed by the tokenizer, the stop-word filter and the WordNet lemmatizer.
_NLTK_RESOURCES = ('stopwords', 'punkt', 'wordnet')
# Stop-word set, built lazily on first use and then reused for every document.
_ASCII_STOP_WORDS = None


def _to_ascii(text):
    """Strip diacritics: NFKD-decompose ``text`` and drop every non-ASCII character."""
    import unicodedata

    return unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')


def _get_stop_words():
    """Return the Portuguese + English stop words, including their accent-stripped forms.

    The NLTK resources are fetched and the set is built only on the first
    call; later calls return the cached set.
    """
    global _ASCII_STOP_WORDS
    if _ASCII_STOP_WORDS is None:
        import nltk
        from nltk.corpus import stopwords

        for resource in _NLTK_RESOURCES:
            nltk.download(resource, quiet=True)

        words = set(stopwords.words('portuguese')).union(stopwords.words('english'))
        # Tokens are compared after accents were removed, so the stop words
        # need the same treatment or their accented entries never match.
        _ASCII_STOP_WORDS = words.union(_to_ascii(word) for word in words)
    return _ASCII_STOP_WORDS


def process_html_for_vectorizer(html_text, lemmatizer_pt_dict):
    """Turn one HTML document into a list of lemmatized tokens.

    Pipeline: take the text of ``<body>`` (empty string when there is no
    body), strip accents, replace everything that is not an ASCII letter or
    whitespace with a space, split glued words at case boundaries
    (e.g. ``fooBar``), lowercase, tokenize, drop stop words and tokens of
    length <= 2, then lemmatize with ``custom_lemmatizer``.

    Args:
        html_text: HTML document as a string.
        lemmatizer_pt_dict: mapping passed through to ``custom_lemmatizer``.

    Returns:
        list of token strings (possibly empty).

    Raises:
        Exception: wrapping any error raised while processing; the
            underlying error is chained as ``__cause__``.
    """
    import nltk
    from bs4 import BeautifulSoup
    import re

    try:
        stop_words = _get_stop_words()

        # Only the <body> text is considered.
        soup = BeautifulSoup(html_text, 'html.parser')
        text = soup.body.get_text() if soup.body else ''

        preprocessed_text = _to_ascii(text)

        # Anything that is not a letter or whitespace becomes a space.
        preprocessed_text = re.sub(r"[^a-zA-Z\s]", " ", preprocessed_text)

        # Prefix every word-like run (an acronym, a Capitalized word or a
        # lowercase run) with a space so words glued together are separated.
        pattern = re.compile(r'([A-Z]+(?![a-z])|[A-Z][a-z]*|[a-z]+)')
        preprocessed_text = pattern.sub(r' \1', preprocessed_text)

        # Lowercase and collapse the whitespace introduced above.
        preprocessed_text = re.sub(r"\s+", " ", preprocessed_text.lower()).strip()

        tokens = nltk.word_tokenize(preprocessed_text)

        tokens = [
            token for token in tokens if token not in stop_words and len(token) > 2
        ]

        # NOTE(review): tokens are accent-stripped at this point while the
        # dictionary keys are used exactly as provided; if those keys carry
        # diacritics, accented forms will miss the lookup - confirm.
        tokens = custom_lemmatizer(tokens, lemmatizer_pt_dict)

        return tokens
    except Exception as e:
        raise Exception('An error occurred while processing HTMLs for vectorizer.\nError:\n' + str(e)) from e

In [None]:
from bs4 import BeautifulSoup
import re
def process_html_for_how_many_prices(text):
    """Count currency markers (``$`` or ``R$``) in ``text``.

    Each ``$`` is counted once, whether or not an ``R`` comes right before it.

    Raises:
        Exception: wrapping any error raised by the regex search
            (e.g. ``text`` is not a string).
    """
    try:
        return len(re.findall(r'\$|R\$', text))
    except Exception as e:
        raise Exception('An error occurred while processing HTMLs for prices.\nError:\n' + str(e))

def process_html_for_how_many_values(text):
    """Count numeric values in ``text``, accepting BR and US number formats.

    Recognised shapes (each counts as one value):

    * US grouped, optional cents: ``1,000`` / ``1,000.00``
    * US plain with cents: ``10.50``
    * BR grouped / plain, optional cents: ``1.000,00`` / ``1.000`` / ``42`` / ``42,50``

    The US alternatives are tried first. The BR/plain alternative matches any
    digit run, so when it came first the US branch could never be reached and
    ``1,000.00`` was counted as three values.

    Raises:
        Exception: wrapping any error raised by the regex search
            (e.g. ``text`` is not a string); the underlying error is chained.
    """
    try:
        regex_valores = re.compile(
            r'\d+(?:,\d{3})+(?:\.\d{2})?(?!\d)'  # US grouped: 1,000 / 1,000.00
            r'|\d+\.\d{2}(?!\d)'                 # US cents only: 10.50
            r'|\d+(?:\.\d{3})*(?:,\d{2})?'       # BR or plain: 1.000,00 / 42
        )
        valores = regex_valores.findall(text)
        return len(valores)
    except Exception as e:
        raise Exception('An error occurred while processing HTMLs for values.\nError:\n' + str(e)) from e

def get_html_body(html_str):
    """Return the text of the document's ``<body>``, or ``''`` if unavailable.

    Parsers are tried in order (``html.parser``, ``html5lib``, ``lxml``) and
    the first one that parses the input decides the result: the body text, or
    ``''`` when the parsed document has no body. If every parser fails the
    function returns ``''`` (it previously fell off the end and returned
    ``None``).
    """
    for parser in ('html.parser', 'html5lib', 'lxml'):
        try:
            soup = BeautifulSoup(html_str, parser)
            return soup.body.get_text() if soup.body else ''
        except Exception:
            # Deliberate best effort: this parser failed on the input, so
            # fall through to the next one.
            continue

    # No parser could handle the input.
    return ''

In [None]:
def only_number(text):
    """Return ``text`` with every non-digit character removed."""
    non_digits = re.compile(r'[^\d]')
    return non_digits.sub('', text)

def remove_invalid_company(company_id):
    """Return ``company_id`` if it is a plausible 14-character id, else ``None``.

    Any run of 13 identical digits is removed first; the value is accepted
    only if exactly 14 characters remain after that removal.
    """
    stripped = re.compile(r'(\d)\1{12}').sub('', company_id)
    return stripped if len(stripped) == 14 else None

def order_by_common(data):
    """Return the distinct items of ``data``, most frequent first.

    Items with the same count keep the order in which they first appeared.
    """
    from collections import Counter

    frequency = Counter(data)
    # sorted() is stable, also with reverse=True, so ties keep first-seen order.
    return sorted(frequency, key=frequency.get, reverse=True)

def extract_and_process_cnpjs(text):
    """Extract formatted company ids from ``text`` as validated digit strings.

    Candidates look like ``NN.NNN.NNN/NNNN-NN`` (a space is also accepted in
    place of ``/`` or ``-``). Each candidate is reduced to digits with
    ``only_number`` and kept only if ``remove_invalid_company`` accepts it.
    Order of appearance and duplicates are preserved.
    """
    raw_matches = re.findall(r'\d{2}\.\d{3}\.\d{3}[\/ ]\d{4}[- ]\d{2}', text)
    validated = (remove_invalid_company(only_number(raw)) for raw in raw_matches)
    return [company for company in validated if company]

In [None]:
def generate_features(dataframe):
    """Validate ``dataframe`` and add a ``tokens`` column built from its HTML.

    Rows are first filtered by ``check_integrity``. The Portuguese lemma
    dictionary is built once and each ``html`` value is tokenized with
    ``process_html_for_vectorizer``.

    Args:
        dataframe: frame with the columns required by ``check_integrity``.

    Returns:
        A new DataFrame (the input is left untouched) with the surviving rows
        and an extra ``tokens`` column holding a list of tokens per row.

    Raises:
        Exception: wrapping any error raised along the way; the underlying
            error is chained as ``__cause__``.
    """
    try:
        # Explicit copy: the validated frame may be the caller's own object or
        # a filtered slice of it, and the new column must not leak into either.
        dataframe = check_integrity(dataframe).copy()

        lem_dict = build_lemmatizer_pt_dict()
        dataframe['tokens'] = dataframe['html'].apply(
            lambda html: process_html_for_vectorizer(html, lem_dict)
        )

        return dataframe
    except Exception as e:
        raise Exception('An error occured while trying to generate features.\nError:\n' + str(e)) from e

In [None]:
def report(score, confusion_matrix, classification_report, classes):
    """Show a confusion-matrix heatmap and pretty-print the classification report.

    Args:
        score: accuracy value shown (4 decimals) in the figure title.
        confusion_matrix: matrix of integer counts drawn as the heatmap.
        classification_report: report object passed to ``pprint``.
        classes: labels used for both the x and y tick labels.
    """
    heatmap_options = dict(
        annot=True,
        fmt="d",
        linewidths=.5,
        square=True,
        cmap='Blues',
        annot_kws={"size": 16},
        xticklabels=classes,
        yticklabels=classes,
    )

    # Confusion-matrix heatmap.
    plt.figure(figsize=(8, 5))
    sns.heatmap(confusion_matrix, **heatmap_options)

    for set_ticks in (plt.xticks, plt.yticks):
        set_ticks(rotation='horizontal', fontsize=16)
    plt.xlabel('Predicted Label', size=20)
    plt.ylabel('Actual Label', size=20)
    plt.title('Accuracy Score: {0:.4f}'.format(score), size=20)

    # Report first, then the figure.
    pprint(classification_report)
    plt.show()

In [None]:
def save_features(vectorizer, vectors, file_name="iugu_tfidf_similarity_1000_base_iugu", models_dir="../models"):
    """Pickle the fitted vectorizer and the per-segment vectors to disk.

    Two files are written inside ``models_dir``:
    ``VECTORIZER_v1_<file_name>.pkl`` and ``EMBEDDINGS_v1_<file_name>.pkl``.
    The directory is created when it does not exist.

    Args:
        vectorizer: object stored in the ``VECTORIZER_v1_`` file.
        vectors: object stored in the ``EMBEDDINGS_v1_`` file.
        file_name: base name shared by both artifacts (default keeps the
            previously hard-coded name).
        models_dir: target directory (default keeps the previous location).

    Returns:
        None. Failures are reported with ``print`` and not re-raised
        (best-effort save, as before).
    """
    try:
        os.makedirs(models_dir, exist_ok=True)

        artifacts = (
            ('VECTORIZER_v1_', vectorizer),
            ('EMBEDDINGS_v1_', vectors),
        )
        for prefix, payload in artifacts:
            target = os.path.join(models_dir, prefix + file_name + '.pkl')
            with open(target, 'wb') as file:
                pickle.dump(payload, file)

    except Exception as e:
        print('An error ocurred while trying to save the model. Error: ' + str(e))

# Train and Evaluate Model

In [None]:
# Validate the rows and add the 'tokens' column (one token list per page).
df = generate_features(df)
df = df.reset_index(drop=True)
# Keep only pages that produced more than one token; 'len_tokens' is a temporary helper column.
df["len_tokens"] = df["tokens"].apply(lambda x: len(x))
df = df[df["len_tokens"] > 1].reset_index(drop=True)
df = df.drop(columns=["len_tokens"])
df.head(30)

In [None]:
# # these hosts are not yielding tokens:
# df = df[~df["host"].isin(["uoon.com.br", "psicomanager.com.br"])].reset_index(drop=True)
# df.head(30)

In [None]:
# Columns available after feature generation.
df.columns

In [None]:
# (rows, columns) after filtering.
df.shape

In [None]:
df.head(30)

In [None]:
# Stratified 90/10 split. The seed is fixed so the split, and every metric
# computed from it below, is the same on each "Restart & Run All".
SPLIT_SEED = 42
train_df, test_df = train_test_split(
    df,
    test_size=0.1,
    stratify=df["segment"],
    random_state=SPLIT_SEED,
)
test_df

In [None]:
# Remember which rows were held out, keep a full copy, and reduce df to the training rows.
test_indexes = test_df.index.tolist()

df_original = df.copy()
df = df[~df.index.isin(test_indexes)].reset_index(drop=True)
df.head(30)

In [None]:
# Fit TF-IDF on the training rows only, limited to 1000 features.
cv_tfidf_vectorizer = TfidfVectorizer(max_features=1000)
# The vectorizer expects strings, so each token list is joined back with spaces.
token_strings = [' '.join(doc) for doc in df['tokens']]
tfidf_matrix = cv_tfidf_vectorizer.fit_transform(token_strings)

In [None]:
# Table of the learned IDF weight per feature, highest first.
idf_values = cv_tfidf_vectorizer.idf_
feature_names = cv_tfidf_vectorizer.get_feature_names_out()
feature_importances = pd.DataFrame({'feature': feature_names, 'idf': idf_values})
feature_importances = feature_importances.sort_values(by='idf', ascending=False)
feature_importances

In [None]:
# Convert the TF-IDF matrix into a pandas DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=cv_tfidf_vectorizer.get_feature_names_out())

# From here on df holds the TF-IDF features plus the 'segment' label.
# The column-wise concat lines rows up by index; df was reset above and tfidf_df is freshly built.
# NOTE(review): if 'segment' ever appears as a TF-IDF feature name the frame would
# have two 'segment' columns - confirm this cannot happen.
df = pd.concat([tfidf_df, df["segment"]], axis=1)
df_y = df["segment"].values.tolist()
df

In [None]:
# One mean TF-IDF vector per segment, with rows ordered by segment label.
mean_vector_by_segment = df.groupby("segment").agg("mean")
mean_vector_by_segment = mean_vector_by_segment.iloc[mean_vector_by_segment.index.argsort()]
mean_vector_by_segment

In [None]:
mean_vector_by_segment.index.to_list()

In [None]:
# Rebuild the held-out rows from the full copy and vectorize them with the already-fitted vectorizer.
test_df = df_original[df_original.index.isin(test_indexes)].reset_index(drop=True)

token_strings_test = [' '.join(doc) for doc in test_df['tokens']]
tfidf_matrix_test = cv_tfidf_vectorizer.transform(token_strings_test)

tfidf_df_test = pd.DataFrame(tfidf_matrix_test.toarray(), columns=cv_tfidf_vectorizer.get_feature_names_out())
tfidf_df_test

In [None]:
def calculate_distances(mean_vector, candidate_vectors):
    """Label each candidate with its nearest segment and derive similarity scores.

    For every row of ``candidate_vectors`` the Euclidean distance to each row
    of ``mean_vector`` is computed. The label is the segment with the smallest
    distance. The scores are the inverse distances normalized to sum to 1.
    When a candidate coincides with one or more segment vectors (distance 0),
    those segments share the whole mass equally and the others get 0, which
    avoids a division by zero.

    Args:
        mean_vector: DataFrame with one row per segment (index = segment
            label) and one column per feature.
        candidate_vectors: DataFrame with one row per candidate and the same
            feature columns, in the same order.

    Returns:
        ``(probabilities, labels)``: a list with one tuple of scores per
        candidate (ordered like ``mean_vector.index``), and a list with one
        predicted segment label per candidate.
    """
    # Use the argument that was passed in, not a notebook-level global, so
    # vectors loaded from disk are actually the ones being compared.
    centroids = np.asarray(mean_vector.values, dtype=float)
    candidates = np.asarray(candidate_vectors.values, dtype=float)
    segments = mean_vector.index.to_list()

    labels = []
    probabilities = []

    for vector in candidates:
        distances = np.linalg.norm(centroids - vector, axis=1)
        labels.append(segments[int(np.argmin(distances))])

        exact_match = distances == 0
        if exact_match.any():
            weights = exact_match.astype(float)
        else:
            weights = 1.0 / distances
        probabilities.append(tuple(weights / weights.sum()))

    return probabilities, labels

In [None]:
# Compute per-row scores and predicted labels for the held-out rows.
probabilities, labels_predictions = calculate_distances(
    mean_vector_by_segment,
    tfidf_df_test,
)

In [None]:
probabilities

In [None]:
# Segment labels in the row order of the mean-vector table; used as heatmap tick labels below.
classes = mean_vector_by_segment.index.to_list()
classes

In [None]:
# True labels of the held-out rows (this reassigns df_y).
df_y = test_df["segment"].values.tolist()

# best_estimator_score = best_estimator.score(X=best_indices_test_X, y=best_indices_test_Y)
best_estimator_score = accuracy_score(y_true=df_y, y_pred=labels_predictions)
best_estimator_score_f1 = f1_score(y_true=df_y, y_pred=labels_predictions, average="weighted")

# NOTE(review): the heatmap labels its axes with `classes`; this presumably matches the
# label order confusion_matrix uses by default - confirm.
best_estimator_cmatrix = confusion_matrix(y_pred=labels_predictions, y_true=df_y)
best_estimator_creport = classification_report(y_pred=labels_predictions, y_true=df_y, zero_division=0, output_dict=True)

print(f"F1-Score: {round(best_estimator_score_f1, 4)}")
report(best_estimator_score, best_estimator_cmatrix, best_estimator_creport, classes)

In [None]:
# Persist the fitted vectorizer and the per-segment mean vectors.
save_features(cv_tfidf_vectorizer, mean_vector_by_segment)

# Test

In [None]:
# Load a second labelled dataset used as an external test set (this reuses the name test_df).
test_df = pd.read_parquet("../data/iugu.parquet", engine="pyarrow")
test_df["html"] = test_df["html"].astype(str)
test_df.head(5)

In [None]:
# Keep the true labels aside, indexed like the raw frame, before the label column is dropped.
df_y_original = test_df["Segmento iugu"]
df_y_original

In [None]:
test_df.shape

In [None]:
test_df["host"].nunique()

In [None]:
test_df["url"].nunique()

In [None]:
test_df.isnull().sum()

In [None]:
# Keep only the columns needed to build features; the label is intentionally left out.
test_df = test_df[["host", "url", "html"]]
test_df.head()

In [None]:
# Validate the rows and add the 'tokens' column.
# The row index is intentionally NOT reset here. The true labels are selected
# further down by matching df_y_original's index against test_df's index, so
# test_df must keep the raw frame's index; resetting it would pair labels with
# the wrong rows whenever validation drops a row.
test_df = generate_features(test_df)
test_df.head(40)

In [None]:
# não estão pegando os tokens:
test_df = test_df[~test_df["host"].isin(["uoon.com.br", "psicomanager.com.br"])]
test_df.head(30)

In [None]:
df_y = df_y_original[df_y_original.index.isin(test_df.index.tolist())]
df_y

In [None]:
# # não estão pegando os tokens (e às vezes o HTML também)
# exclude_hosts = [
#     "vibx.com.br",
#     "contabilizei.com.br",
#     "vittude.com.br",
#     "viatechinfo.com.br",
#     "grupotravessia.com",
#     "cursobeta.com.br",
#     "supergeeks.com.br",
#     "cursoyes.com.br",
#     "braip.com",
#     "kalyst.com.br",
#     "plataforma.edibrasil.org",
# ]
# test_df = test_df[~test_df["host"].isin(exclude_hosts)].reset_index(drop=True)
# test_df

In [None]:
vectorizer_path = "../models/VECTORIZER_v1_iugu_tfidf_similarity_1000_eb_vinculados.pkl"
embeddings_path = "../models/EMBEDDINGS_v1_iugu_tfidf_similarity_1000_eb_vinculados.pkl"

with open(vectorizer_path, "rb") as f:
    vectorizer = pickle.load(f)

with open(embeddings_path, "rb") as f:
    embeddings = pickle.load(f)

In [None]:
token_strings = [' '.join(doc) for doc in test_df['tokens']]
tfidf_matrix = vectorizer.transform(token_strings)

# Converter a matriz TF-IDF em um dataframe pandas
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
tfidf_df

In [None]:
embeddings

In [None]:
probabilities, labels_predictions = calculate_distances(
    embeddings,
    tfidf_df,
)

In [None]:
y_probs_0, y_probs_1, y_probs_2 = zip(*probabilities)
y_probs_0 = list(y_probs_0)
y_probs_1 = list(y_probs_1)
y_probs_2 = list(y_probs_2)

In [None]:
test_df["prob_Educação"] = y_probs_0
test_df["prob_SaaS"] = y_probs_1
test_df["prob_Saúde"] = y_probs_2
test_df["prediction"] = labels_predictions
test_df

In [None]:
# Weighted F1 on the external test set.
print(f1_score(
    y_pred=test_df["prediction"].values.tolist(),
    y_true=df_y,
    average="weighted",
)); print()

# Without output_dict the report is one preformatted multi-line string, so it
# is shown with print; pprint would display the string's repr instead of the table.
# zero_division=0 matches the report computed for the train/test split above.
print(classification_report(
    y_pred=test_df["prediction"].values.tolist(),
    y_true=df_y,
    zero_division=0,
))