In [None]:
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
import re
import os
import unicodedata
import pickle
from pprint import pprint

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, RocCurveDisplay
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

pd.options.mode.chained_assignment = None
pd.options.display.max_columns = None

In [None]:
# Load the iugu parquet dataset and cast, in one pass, the two columns used
# downstream: the HTML as text and the CNPJ root as an integer.
df = pd.read_parquet("../data/iugu.parquet", engine="pyarrow").astype(
    {"html": str, "raiz_cnpj": int}
)
df.head(5)

In [None]:
# Row and column count of the loaded dataset.
df.shape

In [None]:
# The LinkedIn export is read from an absolute, machine-specific location.
# Allow it to be overridden through the LINKEDIN_CSV_PATH environment variable
# so the notebook can run elsewhere; the default keeps the original path.
LINKEDIN_CSV_PATH = os.environ.get(
    "LINKEDIN_CSV_PATH",
    "/media/greca/HD/Driva/linkedin_normalizado_202505021514.csv",
)
linkedin_normalizado = pd.read_csv(LINKEDIN_CSV_PATH)
linkedin_normalizado.head(5)

In [None]:
# Left-join the LinkedIn attributes on the CNPJ root, keep the columns of
# interest and rename the two URL columns produced by the merge suffixes.
df = (
    df.merge(linkedin_normalizado, on="raiz_cnpj", how="left")
    .loc[:, ["url_x", "host", "html", "raiz_cnpj", "cnpj", "Nome da empresa", "Nicho Tech", "Segmento iugu", "url_y", "sobre", "slogan", "area_atuacao"]]
    .rename(columns={"url_x": "url", "url_y": "linkedin_url"})
)
df

In [None]:
# Number of distinct hosts in the merged frame.
df['host'].nunique()

In [None]:
# Class balance of the "Segmento iugu" column (renamed to `segment` below).
df['Segmento iugu'].value_counts()

In [None]:
# Frequency of each "Nicho Tech" value.
df['Nicho Tech'].value_counts()

In [None]:
# Missing values per column.
df.isnull().sum()

In [None]:
# Keep only the modelling columns and give the Portuguese ones English names.
df = df[["host", "url", "html", "sobre", "slogan", "area_atuacao", "Segmento iugu"]].rename(
    columns={"Segmento iugu": "segment", "sobre": "about", "area_atuacao": "field"}
)
df.head()

In [None]:
# Binary indicators (1 = missing) for each optional LinkedIn text field.
df["about_is_null"] = df["about"].isna().astype(int)
df["slogan_is_null"] = df["slogan"].isna().astype(int)
df["field_is_null"] = df["field"].isna().astype(int)
df

# Feature Engineering

In [None]:
def check_integrity(dataframe):
    """Validate and clean a frame before feature generation.

    The caller's frame is never modified; a cleaned copy is returned.

    Steps, in order:
      1. require the ``host``, ``html`` and ``url`` columns;
      2. drop rows where any of those columns is null;
      3. drop rows whose HTML is empty (``''``, ``'[]'``) or is the string form
         of a null (``'None'``, ``'nan'``) left behind by an earlier
         ``astype(str)``;
      4. keep the first row per ``host``.

    Args:
        dataframe: pandas DataFrame with at least ``host``, ``html``, ``url``.

    Returns:
        A new DataFrame containing only the rows that passed every check.

    Raises:
        Exception: if a required column is missing or any step fails.
    """
    columns_expected = [
        'host',
        'html',
        'url',
    ]
    # HTML values treated as "no content". 'None'/'nan' are included because
    # this notebook casts the column to str before calling this function, which
    # turns real nulls into those literal strings.
    empty_html_values = ['', '[]', 'None', 'nan']

    try:
        if not all(item in dataframe.columns.tolist() for item in columns_expected):
            raise Exception('Missing required columns. Columns expected:\n' + str(columns_expected))

        # Work on a copy so the caller's frame is left untouched.
        dataframe = dataframe.copy()

        # Null checks come first: once 'html' is cast to str a null can no
        # longer be detected, and dropping null rows before de-duplicating
        # avoids losing a host whose first row happens to be incomplete.
        for column in ('host', 'url', 'html'):
            nulls = dataframe[column].isnull().sum()
            if nulls > 0:
                print(f"WARNING: column '{column}' has {nulls} empty values. Removing those entries.")
                dataframe = dataframe.dropna(subset=[column])

        dataframe['html'] = dataframe['html'].astype(str)

        dataframe_filtered = dataframe[~dataframe['html'].isin(empty_html_values)]
        if len(dataframe) != len(dataframe_filtered):
            count = len(dataframe) - len(dataframe_filtered)
            print(f"WARNING: dataframe has {count} entries with empty HTML. Removing those entries.")
            dataframe = dataframe_filtered

        dataframe_filtered = dataframe.drop_duplicates(subset=["host"])
        if len(dataframe) != len(dataframe_filtered):
            count = len(dataframe) - len(dataframe_filtered)
            print(f"WARNING: dataframe has {count} entries with duplicates values. Removing those entries.")
            dataframe = dataframe_filtered

        return dataframe
    except Exception as e:
        raise Exception('Failed in integrity check.\nError:\n' + str(e)) from e

In [None]:
def build_lemmatizer_pt_dict():
    """Build a ``{inflected form: lemma}`` dictionary for Portuguese.

    The word list is read from ``lemmatization-pt.txt`` in the current working
    directory. If the file is not there it is downloaded once and kept, so later
    calls reuse the local copy instead of hitting the network again.

    Each line with exactly two whitespace-separated tokens is read as
    ``<lemma> <form>``; any other line is skipped.

    Returns:
        dict mapping each form to its lemma.

    Raises:
        Exception: if the download or the parsing fails. The local file is
            removed in that case so a broken copy is never reused.
    """
    import os

    url = "https://github.com/michmech/lemmatization-lists/raw/master/lemmatization-pt.txt"
    file_name = "lemmatization-pt.txt"

    try:
        # Download only when there is no cached copy.
        if not os.path.exists(file_name):
            # Imported here so a cached run needs neither requests nor network.
            import requests

            response = requests.get(url, timeout=30)
            # Fail loudly on 4xx/5xx instead of caching an error page.
            response.raise_for_status()
            with open(file_name, 'wb') as f:
                f.write(response.content)

        # Parse the file. 'utf-8-sig' reads plain UTF-8 and also strips a
        # leading byte-order mark if the file has one.
        lemmatizer_pt_dict = {}
        with open(file_name, 'r', encoding='utf-8-sig') as dic:
            for line in dic:
                txt = line.split()
                if len(txt) == 2:
                    lemmatizer_pt_dict[txt[1]] = txt[0]

        return lemmatizer_pt_dict
    except Exception as e:
        # Drop a possibly partial or corrupt file so the next call re-downloads.
        if os.path.exists(file_name):
            os.remove(file_name)
        raise Exception('An error occurred on build_lemmatizer_pt_dict.\nError:\n' + str(e)) from e


In [None]:
def custom_lemmatizer(tokens, lemmatizer_pt_dict):
    """Lemmatize ``tokens``, preferring the Portuguese dictionary.

    A token found in ``lemmatizer_pt_dict`` is replaced by its mapped value;
    any other token goes through NLTK's ``WordNetLemmatizer``.

    Args:
        tokens: iterable of token strings.
        lemmatizer_pt_dict: mapping of form -> lemma.

    Returns:
        list with one lemma per input token, in the same order.

    Raises:
        Exception: wrapping any error raised while lemmatizing.
    """
    try:
        from nltk.stem.wordnet import WordNetLemmatizer

        wordnet = WordNetLemmatizer()
        return [
            lemmatizer_pt_dict.get(token)
            if token in lemmatizer_pt_dict.keys()
            else wordnet.lemmatize(token)
            for token in tokens
        ]
    except Exception as e:
        raise Exception('An error occurred on custom_lemmatizer.\nError:\n' + str(e))

In [None]:
# Portuguese + English stop words, loaded on first use by _get_stop_words().
_STOP_WORDS = None


def _get_stop_words():
    """Fetch the NLTK resources once and return the cached stop-word set."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Imported here so null input never needs NLTK or its downloads.
        import nltk
        from nltk.corpus import stopwords

        # NOTE(review): newer NLTK releases may also need 'punkt_tab' for
        # word_tokenize — confirm against the installed version.
        for resource in ('stopwords', 'punkt', 'wordnet'):
            nltk.download(resource, quiet=True)

        _STOP_WORDS = frozenset(stopwords.words('portuguese')) | frozenset(stopwords.words('english'))
    return _STOP_WORDS


def _tokenize_text(text, lemmatizer_pt_dict):
    """Normalize ``text`` to lowercase ASCII words, drop noise, and lemmatize."""
    import nltk

    stop_words = _get_stop_words()

    # Strip accents: decompose, then drop every non-ASCII code point.
    cleaned = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')

    # Replace everything that is not a letter or whitespace with a space.
    cleaned = re.sub(r"[^a-zA-Z\s]", " ", cleaned)

    # Prefix every word run with a space so glued words are split apart
    # ("PagamentoOnline" -> " Pagamento Online"); acronyms stay whole.
    cleaned = re.sub(r'([A-Z]+(?![a-z])|[A-Z][a-z]*|[a-z]+)', r' \1', cleaned)

    # Lowercase and collapse all whitespace runs.
    cleaned = re.sub(r"\s+", " ", cleaned.lower()).strip()

    # Tokenize, then drop stop words and tokens of one or two characters.
    tokens = [
        token for token in nltk.word_tokenize(cleaned)
        if token not in stop_words and len(token) > 2
    ]

    # NOTE(review): tokens are accent-stripped here, so dictionary keys that
    # contain accents will presumably never match — confirm against the
    # lemmatization file.
    return custom_lemmatizer(tokens, lemmatizer_pt_dict)


def process_html_for_vectorizer(text, lemmatizer_pt_dict):
    """Turn the text extracted from an HTML page into lemmatized tokens.

    Args:
        text: string to tokenize.
        lemmatizer_pt_dict: form -> lemma mapping passed to ``custom_lemmatizer``.

    Returns:
        list of lowercase, accent-free, lemmatized tokens longer than two
        characters, with Portuguese and English stop words removed.

    Raises:
        Exception: wrapping any error raised during processing (including a
            non-string ``text``).
    """
    try:
        return _tokenize_text(text, lemmatizer_pt_dict)
    except Exception as e:
        raise Exception('An error occurred while processing HTMLs for vectorizer.\nError:\n' + str(e)) from e


def process_extra_text(text, lemmatizer_pt_dict):
    """Tokenize an optional free-text field using the same pipeline as the HTML.

    Args:
        text: string to tokenize, or a null value (``None``/``NaN``).
        lemmatizer_pt_dict: form -> lemma mapping passed to ``custom_lemmatizer``.

    Returns:
        ``[]`` for null input, otherwise the token list produced by the shared
        pipeline.

    Raises:
        Exception: wrapping any error raised during processing.
    """
    # Null fields contribute no tokens; return before touching NLTK at all.
    if pd.isna(text):
        return []

    try:
        return _tokenize_text(text, lemmatizer_pt_dict)
    except Exception as e:
        raise Exception('An error occurred while processing extra text for vectorizer.\nError:\n' + str(e)) from e

In [None]:
from bs4 import BeautifulSoup
import re

def process_html_for_how_many_prices(text):
    """Count the currency markers (``$`` or ``R$``) that appear in ``text``.

    Raises:
        Exception: wrapping any error raised by the search (e.g. a non-string
            ``text``).
    """
    try:
        return sum(1 for _match in re.finditer(r'\$|R\$', text))
    except Exception as e:
        raise Exception('An error occurred while processing HTMLs for prices.\nError:\n' + str(e))

# Numeric values in Brazilian or English notation. Grouped forms come first:
# regex alternation takes the first branch that matches, so a generic
# "digits" branch placed first would split "1,234.56" into several hits.
# The trailing (?!\d) stops a branch from ending in the middle of a digit run.
_VALUES_REGEX = re.compile(
    r'\d{1,3}(?:\.\d{3})+(?:,\d{2})?(?!\d)'    # 1.234 / 1.234,56
    r'|\d{1,3}(?:,\d{3})+(?:\.\d{2})?(?!\d)'   # 1,234 / 1,234.56
    r'|\d+(?:[.,]\d{2})?(?!\d)'                # 1234 / 12,50 / 12.50
)


def process_html_for_how_many_values(text):
    """Count the numeric values found in ``text``.

    A value is a digit run with optional thousands grouping and an optional
    two-digit decimal part, in Brazilian (``1.234,56``) or English
    (``1,234.56``) notation. Each value is counted once.

    Args:
        text: string to scan.

    Returns:
        int, the number of values found.

    Raises:
        Exception: wrapping any error raised by the search (e.g. a non-string
            ``text``).
    """
    try:
        return len(_VALUES_REGEX.findall(text))
    except Exception as e:
        raise Exception('An error occurred while processing HTMLs for values.\nError:\n' + str(e)) from e

def get_html_body(html_str):
    """Return the text inside the ``<body>`` of an HTML document.

    The parsers ``html.parser``, ``html5lib`` and ``lxml`` are tried in that
    order; the first one that parses without raising decides the result.

    Args:
        html_str: HTML markup.

    Returns:
        str: the body text, or ``''`` when the document has no body or when
        every parser fails. Never ``None``, so callers can always treat the
        result as text.
    """
    # Try each parser in turn; one may be missing or choke on the markup.
    for parser in ('html.parser', 'html5lib', 'lxml'):
        try:
            soup = BeautifulSoup(html_str, parser)
            return soup.body.get_text() if soup.body else ''
        except Exception:
            continue

    # No parser succeeded: return an empty text, not an implicit None.
    return ''

In [None]:
def only_number(text):
    """Return ``text`` with every non-digit character removed."""
    return re.sub(r'[^\d]', '', text)

def remove_invalid_company(company_id):
    """Return the id if it is still 14 characters after cleaning, else ``None``.

    Runs of 13 identical digits are deleted first, so ids made of a single
    repeated digit come out too short and are rejected.
    """
    cleaned = re.sub(r'(\d)\1{12}', '', company_id)
    return cleaned if len(cleaned) == 14 else None

def order_by_common(data):
    """Return the distinct items of ``data``, most frequent first."""
    from collections import Counter

    ranked = Counter(data).most_common()
    return [item for item, _count in ranked]

def extract_and_process_cnpjs(text):
    """Find formatted CNPJ numbers in ``text`` and return them as digit strings.

    Each match of the ``NN.NNN.NNN/NNNN-NN`` pattern (a space is also accepted
    in place of ``/`` or ``-``) is reduced to its digits with ``only_number``
    and kept only when ``remove_invalid_company`` accepts it.

    Returns:
        list of accepted ids, in order of appearance (duplicates kept).
    """
    candidates = re.findall(r'\d{2}\.\d{3}\.\d{3}[\/ ]\d{4}[- ]\d{2}', text)
    checked = (remove_invalid_company(only_number(raw)) for raw in candidates)
    return [company for company in checked if company]

In [None]:
def generate_features(dataframe):
    """Build the token features used by the classifier.

    The caller's frame is never modified. The returned frame has the ``about``,
    ``slogan`` and ``html`` columns replaced by:

      * ``html_about``  - tokens from ``about``;
      * ``html_slogan`` - tokens from ``slogan``;
      * ``html_tokens`` - tokens from the body text of ``html``;
      * ``tokens``      - the three lists above concatenated in that order.

    Args:
        dataframe: pandas DataFrame with ``host``, ``url``, ``html``, ``about``
            and ``slogan`` columns.

    Returns:
        A new DataFrame with the rows kept by ``check_integrity`` and the
        columns described above.

    Raises:
        Exception: if a required column is missing or any step fails.
    """
    try:
        # check_integrity only validates host/html/url; the text columns read
        # below are required as well.
        text_columns = ['about', 'slogan']
        if not all(column in dataframe.columns for column in text_columns):
            raise Exception('Missing required columns. Columns expected:\n' + str(text_columns))

        # Copy so the new columns never leak into the caller's frame, even when
        # check_integrity hands back the very same object.
        dataframe = check_integrity(dataframe).copy()

        lem_dict = build_lemmatizer_pt_dict()

        dataframe['html_about'] = dataframe['about'].apply(lambda x: process_extra_text(x, lem_dict))
        dataframe['html_slogan'] = dataframe['slogan'].apply(lambda x: process_extra_text(x, lem_dict))

        html_body = dataframe['html'].apply(get_html_body)
        dataframe['html_tokens'] = html_body.apply(lambda x: process_html_for_vectorizer(x, lem_dict))

        dataframe = dataframe.drop(columns=['about', 'slogan', 'html'])

        # Element-wise list concatenation, row by row.
        dataframe['tokens'] = dataframe['html_about'] + dataframe['html_slogan'] + dataframe['html_tokens']

        return dataframe
    except Exception as e:
        raise Exception('An error occured while trying to generate features.\nError:\n' + str(e)) from e

In [None]:
def model_report(score, confusion_matrix, classification_report, model_card, classes):
    """Plot the confusion matrix, print the report and record the best-fold metrics.

    Args:
        score: accuracy shown in the plot title.
        confusion_matrix: square matrix passed to ``sns.heatmap``.
        classification_report: dict with the keys ``'accuracy'``,
            ``'macro avg'`` and one entry per class. Per-class entries are
            looked up by the class position as a string (``'0'``, ``'1'``, ...).
        model_card: dict updated in place with the ``*_best`` metrics.
        classes: class names used as tick labels; one ``support_<i>_best``
            entry is written per class.

    Returns:
        The same ``model_card`` object, updated.
    """
    # Confusion-matrix heatmap.
    plt.figure(figsize=(8, 5))
    sns.heatmap(confusion_matrix,
                annot=True,
                fmt="d",
                linewidths=.5,
                square=True,
                cmap='Blues',
                annot_kws={"size": 16},
                xticklabels=classes,
                yticklabels=classes)

    plt.xticks(rotation='horizontal', fontsize=16)
    plt.yticks(rotation='horizontal', fontsize=16)
    plt.xlabel('Predicted Label', size=20)
    plt.ylabel('Actual Label', size=20)

    title = 'Accuracy Score: {0:.4f}'.format(score)
    plt.title(title, size=20)

    # Show the classification report and the heatmap.
    pprint(classification_report)
    plt.show()

    macro_avg = classification_report['macro avg']
    model_card['accuracy_best'] = round(classification_report['accuracy'], 4)
    model_card['precision_macro_best'] = round(macro_avg['precision'], 4)
    model_card['recall_macro_best'] = round(macro_avg['recall'], 4)
    model_card['f1_macro_best'] = round(macro_avg['f1-score'], 4)

    # One support entry per class rather than a fixed 0/1/2, so any number of
    # classes works. A class missing from the report is recorded with support 0.
    for position in range(len(classes)):
        class_report = classification_report.get(str(position), {})
        model_card['support_' + str(position) + '_best'] = class_report.get('support', 0)

    return model_card

In [None]:
def cross_validate_report(cross_validate_results, model_card):
    """Print the per-fold metrics and pick the fold with the highest F1.

    Args:
        cross_validate_results: dict with the arrays ``test_accuracy``,
            ``test_precision``, ``test_recall``, ``test_f1``, ``fit_time`` and
            ``score_time``, plus ``estimator`` and ``indices`` (``train`` and
            ``test``), each indexed by fold.
        model_card: dict updated in place with the ``*_mean`` metrics.

    Returns:
        Tuple ``(best_estimator, best_indices, model_card)`` where
        ``best_indices`` holds the train/test indices of the best fold.
    """
    # Metrics of the models produced by cross-validation: (label, result key).
    printed_rows = (
        ('accuracy:\t', 'test_accuracy'),
        ('precision:\t', 'test_precision'),
        ('recall:\t\t', 'test_recall'),
        ('f1:\t\t', 'test_f1'),
        ('fit_time:\t', 'fit_time'),
        ('score_time:\t', 'score_time'),
    )
    for label, result_key in printed_rows:
        fold_values = cross_validate_results[result_key]
        print(label, fold_values, ' \tmean: ', fold_values.mean())

    # First fold reaching the highest F1.
    f1_scores = cross_validate_results['test_f1']
    best_fold = max(range(len(f1_scores)), key=lambda fold: f1_scores[fold])

    best_estimator = cross_validate_results['estimator'][best_fold]
    fold_indices = cross_validate_results['indices']
    best_indices = {
        'train': fold_indices['train'][best_fold],
        'test': fold_indices['test'][best_fold],
    }

    card_rows = (
        ('accuracy_mean', 'test_accuracy'),
        ('precision_mean', 'test_precision'),
        ('recall_mean', 'test_recall'),
        ('f1_mean', 'test_f1'),
        ('fit_time_mean', 'fit_time'),
        ('score_time_mean', 'score_time'),
    )
    for card_key, result_key in card_rows:
        model_card[card_key] = round(cross_validate_results[result_key].mean(), 4)

    return best_estimator, best_indices, model_card

In [None]:
def save_vectorizer_model(vectorizer, model, encoder, model_card):
    """Pickle the vectorizer, model and encoder under ``../models/``.

    The file names are derived from the model-card settings and written back
    into the card (``vectorizer_file_name``, ``encoder_file_name``,
    ``model_file_name``). The target directory is created when missing.

    Saving is best-effort: on failure the error and the card are printed and
    the card is still returned, so the file-name entries may point at files
    that were not written.

    Args:
        vectorizer: object to pickle as the vectorizer artifact.
        model: object to pickle as the model artifact.
        encoder: object to pickle as the encoder artifact.
        model_card: dict with the settings used to build the file name.

    Returns:
        The same ``model_card`` object, updated with the file names.
    """
    models_dir = '../models/'
    name_fields = (
        'scope', 'vectorizer', 'model', 'word_reduction', 'kfold_splits',
        'kfold_shuffle', 'kfold_random_state', 'vectorizer_max_features', 'dataset',
    )

    try:
        # str() on every field so non-string settings cannot break the name.
        file_name = '_'.join(str(model_card[field]) for field in name_fields)

        model_card['vectorizer_file_name'] = 'VECTORIZER_v1_' + file_name + '.pkl'
        model_card['encoder_file_name'] = 'ENCODER_v1_' + file_name + '.pkl'
        model_card['model_file_name'] = 'MODEL_v1_' + file_name + '.pkl'

        # A fresh checkout may not have the directory yet.
        os.makedirs(models_dir, exist_ok=True)

        artifacts = (
            (model_card['vectorizer_file_name'], vectorizer),
            (model_card['model_file_name'], model),
            (model_card['encoder_file_name'], encoder),
        )
        for artifact_file_name, artifact in artifacts:
            with open(models_dir + artifact_file_name, 'wb') as file:
                pickle.dump(artifact, file)

    except Exception as e:
        print('An error ocurred while trying to save the model. Error: ' + str(e))
        print(model_card)

    return model_card

In [None]:
# Template describing one experiment. The first block holds the settings that
# also make up the artifact file names; the remaining entries start empty and
# are filled in by cross_validate_report, model_report and
# save_vectorizer_model.
model_card = {
    'scope': 'iugu',
    'vectorizer': '',
    'model': '',
    'word_reduction': 'custom_lemmatizer', 
    'kfold_splits': 3,
    'kfold_shuffle': True,
    'kfold_random_state': 42,
    'vectorizer_max_features': 1000, # None or an integer -> tried so far: None, 50, 100, 200, 500, 1000, 1500, 2000
    'dataset': 'base_iugu', 
    'accuracy_mean': '',
    'precision_mean': '',
    'recall_mean': '',
    'f1_mean': '',
    'fit_time_mean': '',
    'score_time_mean': '',
    'accuracy_best': '',
    'precision_macro_best': '',
    'recall_macro_best': '',
    'f1_macro_best': '',
    'support_0_best': '',
    'support_1_best': '',
    'support_2_best': '',
    'vectorizer_file_name': '',
    'model_file_name': '',
    'encoder_file_name': '',
}

# Metrics computed on every cross-validation fold (macro-averaged across classes).
scoring_metrics = {
    'accuracy': make_scorer(accuracy_score, normalize=True), 
    'precision': make_scorer(precision_score, average='macro', zero_division=0), 
    'recall': make_scorer(recall_score, average='macro'), 
    'f1': make_scorer(f1_score, average='macro')
}

# CV splitter: StratifiedKFold keeps the proportion of each target class in every fold.
stratified_kfold = StratifiedKFold(n_splits=model_card['kfold_splits'], shuffle=model_card['kfold_shuffle'], random_state=model_card['kfold_random_state'])

# Train and Evaluate Model

In [None]:
# Build the token features and renumber the rows that survived the checks.
df = generate_features(df).reset_index(drop=True)
df.head(30)

In [None]:
# Drop the rows for which no tokens could be extracted.
has_tokens = df['tokens'].apply(len) > 0
df = df[has_tokens].reset_index(drop=True)
del has_tokens
df.head(30)

In [None]:
# One-hot encode `field` as 0/1 integer columns, then drop the original column.
field_dummies = pd.get_dummies(df['field']).astype(int)
df = pd.concat([df, field_dummies], axis=1).drop(columns=['field'])
del field_dummies
df

In [None]:
# Column names after one-hot encoding.
df.columns

In [None]:
# Size of the training frame after feature generation.
df.shape

In [None]:
# Preview of the final training frame.
df.head(30)

In [None]:
# Multinomial Naive Bayes with TF-IDF
mnb_tfidf_model_card = model_card.copy()
mnb_tfidf_model_card['vectorizer'] = 'tfidf_vectorizer'
mnb_tfidf_model_card['model'] = 'multinomial_nb'

# NOTE(review): the vectorizer is fitted on every row here, before the
# cross-validation split, so its vocabulary and IDF weights also see the
# validation folds — confirm whether that is acceptable or fit it per fold.
cv_tfidf_vectorizer = TfidfVectorizer(max_features=model_card['vectorizer_max_features'])
# Tokens are already cleaned; join them back into one string per document.
token_strings = [' '.join(doc) for doc in df['tokens']]
tfidf_matrix = cv_tfidf_vectorizer.fit_transform(token_strings)

In [None]:
# List the vocabulary terms with their IDF weight, highest IDF first.
idf_values = cv_tfidf_vectorizer.idf_
feature_names = cv_tfidf_vectorizer.get_feature_names_out()
feature_importances = pd.DataFrame({'feature': feature_names, 'idf': idf_values})
feature_importances = feature_importances.sort_values(by='idf', ascending=False)
feature_importances

In [None]:
le = LabelEncoder()

# Convert the TF-IDF matrix into a DataFrame with one column per term.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=cv_tfidf_vectorizer.get_feature_names_out())

features = ['about_is_null', 'slogan_is_null', 'field_is_null']

# Put the null-indicator features next to the TF-IDF columns.
other_features = df[features]
features_df = pd.concat([other_features, tfidf_df], axis=1)

df_y_original = df["segment"].copy()

# Encode the target into a separate series and leave df["segment"] untouched.
# Overwriting the column made this cell non-repeatable: on a second run the
# encoder was fitted on already-encoded integers and le.classes_ lost the
# segment names.
df_y = pd.Series(le.fit_transform(df_y_original), index=df.index, name="segment")

features_df

In [None]:
# Encoded target.
df_y

In [None]:
# Target as it was before encoding.
df_y_original

In [None]:
# Classes learned by the encoder; the position in this list is the encoded label.
list(le.classes_)

In [None]:
# Estimator template handed to cross_validate.
clf = MultinomialNB()
# clf = KNeighborsClassifier(n_neighbors=3)

# Cross-validation: return the per-fold estimators and the train/test indices
# as well, so the best fold can be inspected afterwards.
cv_results = cross_validate(estimator=clf, X=features_df, y=df_y, 
                            cv=stratified_kfold, scoring=scoring_metrics,
                            return_estimator=True, return_indices=True
                            )

In [None]:
# Cross-validation report: prints the per-fold metrics, stores their means in
# the model card, and returns the estimator and indices of the best-F1 fold.
best_estimator, best_indices, mnb_tfidf_model_card = cross_validate_report(cv_results, mnb_tfidf_model_card)

In [None]:
# Best-model report: re-score the best fold's estimator on its own test split.
best_indices_test_X = features_df.iloc[best_indices['test']]
best_indices_test_Y = df_y.iloc[best_indices['test']].values.tolist()

best_estimator_predictions = list(best_estimator.predict(best_indices_test_X))

best_estimator_score = accuracy_score(y_true=best_indices_test_Y, y_pred=best_estimator_predictions)
# pos_label only applies to average="binary", so it is not passed here.
best_estimator_score_f1 = f1_score(y_true=best_indices_test_Y, y_pred=best_estimator_predictions, average="weighted")

best_estimator_cmatrix = confusion_matrix(y_pred=best_estimator_predictions, y_true=best_indices_test_Y)
best_estimator_creport = classification_report(y_pred=best_estimator_predictions, y_true=best_indices_test_Y, zero_division=0, output_dict=True)

mnb_tfidf_model_card = model_report(best_estimator_score, best_estimator_cmatrix, best_estimator_creport, mnb_tfidf_model_card, list(le.classes_))

# Save the fitted best-fold estimator. `clf` is only the template given to
# cross_validate, which fits clones of it, so `clf` itself is still unfitted
# at this point and pickling it would store a model that cannot predict.
mnb_tfidf_model_card = save_vectorizer_model(cv_tfidf_vectorizer, best_estimator, le, mnb_tfidf_model_card)

print(f"F1-Score: {round(best_estimator_score_f1, 4)}")
print(mnb_tfidf_model_card)

# Train full model

In [None]:
# Fit a fresh classifier on every row and save it. The file names come from the
# same model-card settings as before, so this overwrites the artifacts written
# by the best-model cell.
clf = MultinomialNB()
# clf = KNeighborsClassifier(n_neighbors=3)

clf.fit(features_df, df_y)
mnb_tfidf_model_card = save_vectorizer_model(cv_tfidf_vectorizer, clf, le, mnb_tfidf_model_card)

# Test

In [None]:
# Load the enrichment dataset and apply the same dtype casts as the training data.
test_df = pd.read_parquet("../data/iugu_enrichment.parquet", engine="pyarrow").astype(
    {"html": str, "raiz_cnpj": int}
)
test_df.head(5)

In [None]:
# Left-join the LinkedIn attributes on the CNPJ root, keep the columns of
# interest and rename the two URL columns produced by the merge suffixes.
test_df = (
    test_df.merge(linkedin_normalizado, on="raiz_cnpj", how="left")
    .loc[:, ["url_x", "host", "html", "raiz_cnpj", "cnpj", "Nome da empresa", "url_y", "sobre", "slogan", "area_atuacao"]]
    .rename(columns={"url_x": "url", "url_y": "linkedin_url"})
)
test_df

In [None]:
# Row and column count of the merged enrichment frame.
test_df.shape

In [None]:
# Number of distinct hosts.
test_df["host"].nunique()

In [None]:
# Number of distinct URLs.
test_df["url"].nunique()

In [None]:
# Missing values per column.
test_df.isnull().sum()

In [None]:
# Keep only the modelling columns and give the Portuguese ones English names.
# "Segmento iugu" is not among the selected columns, so that mapping entry has
# no effect here.
test_df = test_df[["host", "url", "html", "sobre", "slogan", "area_atuacao"]].rename(
    columns={"Segmento iugu": "segment", "sobre": "about", "area_atuacao": "field"}
)
test_df.head()

In [None]:
# Binary indicators (1 = missing) for each optional LinkedIn text field.
test_df["about_is_null"] = test_df["about"].isna().astype(int)
test_df["slogan_is_null"] = test_df["slogan"].isna().astype(int)
test_df["field_is_null"] = test_df["field"].isna().astype(int)
test_df

In [None]:
# Build the token features and renumber the rows that survived the checks.
test_df = generate_features(test_df).reset_index(drop=True)
test_df.head(40)

In [None]:
# Drop the rows for which no tokens could be extracted.
has_tokens = test_df['tokens'].apply(len) > 0
test_df = test_df[has_tokens].reset_index(drop=True)
del has_tokens
test_df.head(30)

In [None]:
# One-hot encode `field` as 0/1 integer columns, then drop the original column.
field_dummies = pd.get_dummies(test_df['field']).astype(int)
test_df = pd.concat([test_df, field_dummies], axis=1).drop(columns=['field'])
del field_dummies
test_df

In [None]:
# Artifacts written by save_vectorizer_model for the settings in model_card.
# NOTE: pickle.load executes code stored in the file; only load artifacts
# produced by this notebook or another trusted source.
model_path = "../models/MODEL_v1_iugu_tfidf_vectorizer_multinomial_nb_custom_lemmatizer_3_True_42_1000_base_iugu.pkl"
vectorizer_path = "../models/VECTORIZER_v1_iugu_tfidf_vectorizer_multinomial_nb_custom_lemmatizer_3_True_42_1000_base_iugu.pkl"
encoder_path = "../models/ENCODER_v1_iugu_tfidf_vectorizer_multinomial_nb_custom_lemmatizer_3_True_42_1000_base_iugu.pkl"

with open(vectorizer_path, "rb") as f:
    vectorizer = pickle.load(f)

with open(model_path, "rb") as f:
    model = pickle.load(f)

with open(encoder_path, "rb") as f:
    encoder = pickle.load(f)

In [None]:
# Vectorize the new documents with the already-fitted vectorizer
# (transform only, no re-fitting).
token_strings = [' '.join(doc) for doc in test_df['tokens']]
tfidf_matrix = vectorizer.transform(token_strings)

# Convert the TF-IDF matrix into a pandas DataFrame.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

features = ['about_is_null', 'slogan_is_null', 'field_is_null']

# Put the null-indicator features next to the TF-IDF columns, in the same
# order used for training.
other_features = test_df[features]
features_df = pd.concat([other_features, tfidf_df], axis=1)
features_df

In [None]:
# Predict with the model loaded from disk. `best_estimator` is only the
# estimator of one cross-validation fold, and using it here would leave the
# loaded `model` unused.
predicted_labels = model.predict(features_df)

# Map the encoded labels back to segment names in one vectorised call.
test_df["prediction"] = encoder.inverse_transform(predicted_labels)
test_df