In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

import torch

In [None]:
# Load the raw entity records and discard the columns that must not reach the models:
#  - 'ID' is removed because it is not known whether that label was assigned correctly.
#  - 'CODIGO' is removed because codes are only generated after a record has been
#    registered on one of the platforms, while the goal of the project is to check
#    whether an entity already exists *before* it is registered.
entidades = (
    pd.read_csv("../data/bronze/entidades_proyecto.csv", sep=';')
    .drop(columns=['ID'])
    .drop(columns=['CODIGO'])
)

# Both text fields are cast to str; they are later passed to the tokenizer as lists of strings.
for text_column in ('NOMBRE', 'NIT'):
    entidades[text_column] = entidades[text_column].astype(str)

entidades.head(2)

In [None]:
# Build 10 training sets: each one samples 70% of the rows with replacement,
# using its own index as the random seed so every draw is repeatable.
train_sets = {
    seed: entidades.sample(frac=0.7, replace=True, random_state=seed)
    for seed in range(10)
}

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# A pretrained multilingual checkpoint is used to build the vectors of the records,
# both for names and for NITs.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
# Tokenize the NOMBRE and NIT columns of every training set as padded/truncated
# PyTorch tensor batches.
input_names_sets = {
    set_id: tokenizer(sample['NOMBRE'].to_list(), padding=True, truncation=True, return_tensors="pt")
    for set_id, sample in train_sets.items()
}
input_nits_sets = {
    set_id: tokenizer(sample['NIT'].to_list(), padding=True, truncation=True, return_tensors="pt")
    for set_id, sample in train_sets.items()
}

names_embebings_sets = {}
nits_embedings_sets = {}

# The logits returned by the model are stored as the vector of each record.
# Gradient tracking is disabled because the model is only evaluated here.
with torch.no_grad():
    # Names
    for set_id, encoded in input_names_sets.items():
        names_embebings_sets[set_id] = model(**encoded).logits.numpy()

    # NITs
    for set_id, encoded in input_nits_sets.items():
        nits_embedings_sets[set_id] = model(**encoded).logits.numpy()

In [None]:
# Standardization of the vectors.
#
# Each scaler is fitted ONCE, on the rows of every training set stacked together, and
# then applied to each set. Re-fitting the same scaler inside the loop would leave it
# holding only the statistics of the last training set, while the scaler object is
# reused afterwards to transform new records: the transformation applied at prediction
# time would then differ from the one most training sets were standardized with.
scaler_names = StandardScaler().fit(np.vstack(list(names_embebings_sets.values())))
names_embebings_sets_standardized = {
    set_id: scaler_names.transform(vectors)
    for set_id, vectors in names_embebings_sets.items()
}

scaler_nits = StandardScaler().fit(np.vstack(list(nits_embedings_sets.values())))
nits_embebings_sets_standardized = {
    set_id: scaler_nits.transform(vectors)
    for set_id, vectors in nits_embedings_sets.items()
}

In [None]:
# Shapes of the first training set, before joining both representations.
print('names_embebings_sets_standardized:', names_embebings_sets_standardized[0].shape)
print('nits_embebings_sets_standardized:', nits_embebings_sets_standardized[0].shape)

# For every training set: store both matrices back as float and place them side by side
# (names columns first, NIT columns after) to obtain the combined representation.
entidades_embebidas = {}
for train_set in train_sets:
    names_block = names_embebings_sets_standardized[train_set].astype(float)
    nits_block = nits_embebings_sets_standardized[train_set].astype(float)
    names_embebings_sets_standardized[train_set] = names_block
    nits_embebings_sets_standardized[train_set] = nits_block
    entidades_embebidas[train_set] = np.hstack((names_block, nits_block))

print('entidades_embebidas:', entidades_embebidas[0].shape)

In [None]:
# For every training set, gather its three matrices: names, NITs and names + NITs.
dataframes_list = [
    [names_embebings_sets_standardized[x], nits_embebings_sets_standardized[x], entidades_embebidas[x]]
    for x in train_sets
]

# One panel per kind of matrix; the curves of all training sets are drawn on the same panel.
fig, axes = plt.subplots(1, 3, figsize=(20, 5))

for matrices in dataframes_list:
    for i, (ax, matrix) in enumerate(zip(axes, matrices), 1):
        # Cumulative explained variance of a full PCA fitted on this matrix.
        cumulative_variance = np.cumsum(PCA().fit(matrix).explained_variance_ratio_)
        ax.plot(cumulative_variance)
        ax.set_xlabel('Número de Componentes')
        ax.set_ylabel('Varianza Explicada')
        ax.set_title(f'Análisis de Varianza Explicada para DataFrame {i}')

Se puede observar que, para todos los datasets de entrenamiento, se logra capturar un 95% de la varianza con un número reducido de componentes principales en el caso de los nombres y de los NITs por separado (en la celda siguiente se usan 2), mientras que en el caso de los datasets que resultan de unir las vectorizaciones de nombres y NITs es necesario contar con 3 componentes principales para alcanzar el punto de codo.

In [None]:
# Dimensionality reduction: 2 components for names, 2 for NITs, 3 for names + NITs.
#
# Each PCA is fitted ONCE, on the rows of every training set stacked together, and then
# applied to each set. Calling fit_transform inside the loop would re-fit the same
# object per training set and leave it holding only the projection of the last set,
# while these PCA objects are reused afterwards to project new records.
names_pca = PCA(n_components=2).fit(np.vstack(list(names_embebings_sets_standardized.values())))
nits_pca = PCA(n_components=2).fit(np.vstack(list(nits_embebings_sets_standardized.values())))
entidades_pca = PCA(n_components=3).fit(np.vstack(list(entidades_embebidas.values())))

names_embebings_sets_pca = {}
nits_embebings_sets_pca = {}
entidades_embebidas_pca = {}

for train_set in train_sets:
    # One float DataFrame per training set and representation (columns are 0..n_components-1).
    names_embebings_sets_pca[train_set] = pd.DataFrame(
        names_pca.transform(names_embebings_sets_standardized[train_set])
    ).astype(float)
    nits_embebings_sets_pca[train_set] = pd.DataFrame(
        nits_pca.transform(nits_embebings_sets_standardized[train_set])
    ).astype(float)
    entidades_embebidas_pca[train_set] = pd.DataFrame(
        entidades_pca.transform(entidades_embebidas[train_set])
    ).astype(float)


In [None]:
Nc = range(2, 25)  # The silhouette score cannot be computed for a single cluster

# Figure large enough to show one row per training set and one column per representation.
plt.figure(figsize=(30, 25))

# Iterate over the training sets.
for i, key in enumerate(names_embebings_sets_pca.keys(), 1):
    dataframes = [names_embebings_sets_pca[key], nits_embebings_sets_pca[key], entidades_embebidas_pca[key]]

    # Fit KMeans for every candidate number of clusters on each representation.
    for j, df in enumerate(dataframes):
        scores = []
        sil_scores = []

        for num_clusters in Nc:
            # The estimator is deliberately NOT bound to the name `model`: that name holds
            # the pretrained language model, which is called again further down the
            # notebook, and overwriting it here would break those calls.
            # A fixed seed keeps the curves reproducible between runs.
            kmeans_model = KMeans(n_clusters=num_clusters, random_state=0).fit(df)
            scores.append(kmeans_model.inertia_)  # within-cluster sum of squares (elbow curve)
            sil_scores.append(silhouette_score(df, kmeans_model.labels_))

        # One subplot per (training set, representation) pair.
        ax = plt.subplot(len(names_embebings_sets_pca), 3, (i-1)*3 + j + 1)
        ax.plot(Nc, scores, marker='o', linestyle='--', color='blue', label='Elbow Score')
        ax.set_xlabel('Número de Clusters')
        ax.set_ylabel('Inertia', color='blue')
        ax.tick_params(axis='y', labelcolor='blue')
        ax.set_title(f'Elbow Curve y Silhouette Score para {key} DataFrame {j+1}')

        # Secondary y axis for the silhouette scores.
        ax2 = ax.twinx()
        ax2.plot(Nc, sil_scores, marker='o', linestyle='--', color='red', label='Silhouette Score')
        ax2.set_ylabel('Silhouette Score', color='red')
        ax2.tick_params(axis='y', labelcolor='red')

        # Legends.
        ax.legend(loc='upper left')
        ax2.legend(loc='upper right')

# Show all the plots.
plt.tight_layout()
plt.show()

In [None]:
# Figure large enough to show one row per training set and one column per representation.
plt.figure(figsize=(30, 25))

# Iterate over the training sets.
for i, key in enumerate(names_embebings_sets_pca.keys(), 1):
    dataframes = [names_embebings_sets_pca[key], nits_embebings_sets_pca[key], entidades_embebidas_pca[key]]

    # Fit a Gaussian mixture for every candidate number of components on each representation.
    for j, df in enumerate(dataframes):
        scores = []
        sil_scores = []

        for num_clusters in Nc:
            gmm = GaussianMixture(n_components=num_clusters, random_state=0)
            gmm.fit(df)
            labels = gmm.predict(df)
            # gmm.score is the mean log-likelihood per sample; negating it and multiplying
            # by the number of rows gives the total negative log-likelihood (lower is better).
            scores.append(-gmm.score(df) * len(df))
            sil_scores.append(silhouette_score(df, labels))

        # One subplot per (training set, representation) pair.
        ax = plt.subplot(len(names_embebings_sets_pca), 3, (i-1)*3 + j + 1)
        ax.plot(Nc, scores, marker='o', linestyle='--', color='blue', label='Elbow Score')
        ax.set_xlabel('Número de Clusters')
        # The plotted quantity is a negative log-likelihood, not the KMeans inertia.
        ax.set_ylabel('Negative log-likelihood', color='blue')
        ax.tick_params(axis='y', labelcolor='blue')
        ax.set_title(f'Elbow Curve y Silhouette Score para {key} DataFrame {j+1}')

        # Secondary y axis for the silhouette scores.
        ax2 = ax.twinx()
        ax2.plot(Nc, sil_scores, marker='o', linestyle='--', color='red', label='Silhouette Score')
        ax2.set_ylabel('Silhouette Score', color='red')
        ax2.tick_params(axis='y', labelcolor='red')

        # Legends.
        ax.legend(loc='upper left')
        ax2.legend(loc='upper right')

# Show all the plots.
plt.tight_layout()
plt.show()

In [None]:
from sklearn.cluster import HDBSCAN

In [None]:
dataframe_names = ['Nombres', 'NITs', 'Entidades']

# Parameter grid explored for HDBSCAN.
min_cluster_sizes = range(4, 11)
min_samples_list = range(1, 6)

# Per-representation accumulators (sum of the score grids and number of grids added),
# used at the end to draw the average heatmaps.
average_results_per_column = [pd.DataFrame(index=min_cluster_sizes, columns=min_samples_list, dtype=float).fillna(0) for _ in range(len(dataframe_names))]
count_per_column = [0] * len(dataframe_names)

# One row of heatmaps per training set, plus one extra row for the averages.
fig, axes = plt.subplots(len(names_embebings_sets_pca) + 1, len(dataframe_names), figsize=(25, 35))

# Iterate over the training sets.
for i, key in enumerate(names_embebings_sets_pca.keys()):
    dataframes = [names_embebings_sets_pca[key], nits_embebings_sets_pca[key], entidades_embebidas_pca[key]]

    # Iterate over the three representations.
    for j, (df, name) in enumerate(zip(dataframes, dataframe_names)):
        results = pd.DataFrame(index=min_cluster_sizes, columns=min_samples_list, dtype=float)

        for min_cluster_size in min_cluster_sizes:
            for min_samples in min_samples_list:
                clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                                    min_samples=min_samples,
                                    metric='manhattan',
                                    cluster_selection_method='eom')
                labels = clusterer.fit_predict(df)

                # Score the configuration only when at least 2 real clusters were found.
                # HDBSCAN labels noise points as -1, so that label must be left out of the
                # count: a result made of "noise + one cluster" is not a usable clustering.
                # NOTE(review): noise points still take part in the silhouette computation
                # below (as their own group) — confirm whether they should be masked out.
                n_clusters = len(set(labels) - {-1})
                if n_clusters >= 2:
                    sil_score = silhouette_score(df, labels)
                else:
                    sil_score = -1  # Marks a configuration that did not produce a usable clustering

                results.at[min_cluster_size, min_samples] = sil_score

        # Make sure every value is a float before accumulating/plotting.
        results = results.astype(float)

        # Accumulate for the per-representation average.
        average_results_per_column[j] += results
        count_per_column[j] += 1

        # Heatmap of this training set / representation.
        sns.heatmap(results, annot=True, ax=axes[i, j], cmap='viridis', fmt=".2f")
        axes[i, j].set_title(f'Silhouette Scores Heatmap for {key} - {name}')
        axes[i, j].set_xlabel('Min Samples')
        axes[i, j].set_ylabel('Min Cluster Size')

# Average heatmap of every representation, drawn on the last row.
for j, name in enumerate(dataframe_names):
    average_results_per_column[j] /= count_per_column[j]
    sns.heatmap(average_results_per_column[j], annot=True, ax=axes[-1, j], cmap='viridis', fmt=".2f")
    axes[-1, j].set_title(f'Average Silhouette Scores Heatmap for {name}')
    axes[-1, j].set_xlabel('Min Samples')
    axes[-1, j].set_ylabel('Min Cluster Size')

# Adjust the layout.
plt.tight_layout()
plt.show()

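Para establecer la cantidad de clusters usaremos las configuraciones de HDBSCAN con las dos mejores puntuaciones de silueta en cada uno de los tres tipos de conjuntos de datos (nombres, NITs y nombres + NITs) sobre los cuales estamos especializando nuestros modelos, y promediaremos el número de clusters que producen.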

In [None]:
def _mean_hdbscan_cluster_count(pca_sets, configurations):
    """Average number of clusters HDBSCAN finds over training sets and configurations.

    Parameters
    ----------
    pca_sets : dict
        Training-set key -> DataFrame to cluster.
    configurations : iterable of (min_cluster_size, min_samples)
        HDBSCAN settings to run on every DataFrame.

    Returns
    -------
    float
        Mean, over every (training set, configuration) pair, of the number of clusters
        found. The noise label (-1) is never counted as a cluster.
    """
    cluster_counts = []
    for df in pca_sets.values():
        for min_cluster_size, min_samples in configurations:
            labels = HDBSCAN(min_cluster_size=min_cluster_size,
                             min_samples=min_samples,
                             metric='manhattan',
                             cluster_selection_method='eom').fit_predict(df)
            # Drop the noise label explicitly. Subtracting 1 from the number of distinct
            # labels is only right when noise is present; when every point is assigned
            # to a cluster it undercounts by one.
            cluster_counts.append(len(set(labels) - {-1}))
    return float(np.mean(cluster_counts))


# Configurations selected for the names datasets.
lens = _mean_hdbscan_cluster_count(
    names_embebings_sets_pca,
    [(4, 1), (4, 2), (4, 3), (6, 1), (6, 2)],
)
print('Cantidad de clusters para nombres:', lens)

# Configurations selected for the NIT datasets.
lens = _mean_hdbscan_cluster_count(
    nits_embebings_sets_pca,
    [(4, 1), (4, 2), (4, 3), (4, 4), (5, 1), (5, 2), (5, 3), (5, 4)],
)
print('Cantidad de clusters para nits:', lens)

# Configurations selected for the combined (names + NITs) datasets.
lens = _mean_hdbscan_cluster_count(
    entidades_embebidas_pca,
    [(4, 1), (4, 2), (7, 1), (7, 2), (8, 1), (8, 2)],
)
print('Cantidad de clusters para nombres y nits:', lens)

Con la información anterior, crearemos modelos que se especialicen en la información que va diligenciando el usuario. Esto significa que, si el usuario solo diligencia el nombre, se usará solo el dataset de nombres; de igual forma, si diligencia solo el NIT se usará el dataset de NITs, y si diligencia el nombre y el NIT se usará el dataset combinado.

In [None]:
# One labelled dataset per (training set, clustering configuration) for the names
# representation. Every appended frame holds the feature columns followed by a final
# 'cluster' column with the labels of that configuration.
names_datasets_clusterized = []

for key in names_embebings_sets_pca:
    # Work on a separate frame so the DataFrames stored in names_embebings_sets_pca are
    # never modified: assigning the labels onto them would leave a 'cluster' column behind
    # that is picked up as an extra feature whenever a clustering cell is run again.
    # Dropping a leftover 'cluster' column also protects against such stale state.
    features = names_embebings_sets_pca[key].drop(columns=['cluster'], errors='ignore')
    # Make sure every column name is a string.
    features.columns = features.columns.astype(str)

    # KMeans (seeded so the labels are reproducible between runs).
    kmeans_labels = KMeans(n_clusters=23, random_state=0).fit(features).labels_
    names_datasets_clusterized.append(features.assign(cluster=kmeans_labels))

    # GaussianMixture
    gmm = GaussianMixture(n_components=23, random_state=0).fit(features)
    names_datasets_clusterized.append(features.assign(cluster=gmm.predict(features)))

    # HDBSCAN, over the whole parameter grid.
    for min_cluster_size in min_cluster_sizes:
        for min_samples in min_samples_list:
            clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                                min_samples=min_samples,
                                metric='manhattan',
                                cluster_selection_method='eom')
            names_datasets_clusterized.append(features.assign(cluster=clusterer.fit_predict(features)))

In [None]:
# One labelled dataset per (training set, clustering configuration) for the NIT
# representation. Every appended frame holds the feature columns followed by a final
# 'cluster' column with the labels of that configuration.
nits_datasets_clusterized = []

for key in nits_embebings_sets_pca:
    # Work on a separate frame so the DataFrames stored in nits_embebings_sets_pca are
    # never modified: assigning the labels onto them would leave a 'cluster' column behind
    # that is picked up as an extra feature whenever a clustering cell is run again.
    # Dropping a leftover 'cluster' column also protects against such stale state.
    features = nits_embebings_sets_pca[key].drop(columns=['cluster'], errors='ignore')
    # Make sure every column name is a string.
    features.columns = features.columns.astype(str)

    # KMeans (seeded so the labels are reproducible between runs).
    kmeans_labels = KMeans(n_clusters=18, random_state=0).fit(features).labels_
    nits_datasets_clusterized.append(features.assign(cluster=kmeans_labels))

    # GaussianMixture
    gmm = GaussianMixture(n_components=18, random_state=0).fit(features)
    nits_datasets_clusterized.append(features.assign(cluster=gmm.predict(features)))

    # HDBSCAN, over the whole parameter grid.
    for min_cluster_size in min_cluster_sizes:
        for min_samples in min_samples_list:
            clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                                min_samples=min_samples,
                                metric='manhattan',
                                cluster_selection_method='eom')
            nits_datasets_clusterized.append(features.assign(cluster=clusterer.fit_predict(features)))

In [None]:
# One labelled dataset per (training set, clustering configuration) for the combined
# names + NITs representation. Every appended frame holds the feature columns followed
# by a final 'cluster' column with the labels of that configuration.
entities_datasets_clusterized = []

for key in entidades_embebidas_pca:
    # Work on a separate frame so the DataFrames stored in entidades_embebidas_pca are
    # never modified: assigning the labels onto them would leave a 'cluster' column behind
    # that is picked up as an extra feature whenever a clustering cell is run again.
    # Dropping a leftover 'cluster' column also protects against such stale state.
    features = entidades_embebidas_pca[key].drop(columns=['cluster'], errors='ignore')
    # Make sure every column name is a string.
    features.columns = features.columns.astype(str)

    # KMeans (seeded so the labels are reproducible between runs).
    kmeans_labels = KMeans(n_clusters=15, random_state=0).fit(features).labels_
    entities_datasets_clusterized.append(features.assign(cluster=kmeans_labels))

    # GaussianMixture
    gmm = GaussianMixture(n_components=15, random_state=0).fit(features)
    entities_datasets_clusterized.append(features.assign(cluster=gmm.predict(features)))

    # HDBSCAN, over the whole parameter grid.
    for min_cluster_size in min_cluster_sizes:
        for min_samples in min_samples_list:
            clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                                min_samples=min_samples,
                                metric='manhattan',
                                cluster_selection_method='eom')
            entities_datasets_clusterized.append(features.assign(cluster=clusterer.fit_predict(features)))

# Implementación de árboles para clasificación

Dado que cada uno de los datasets que hemos creado podría, o no, capturar el fenómeno que queremos controlar, entrenaremos un árbol de decisión de clasificación (sobreajustado a propósito) sobre cada uno de los datasets agrupados de los tres tipos de conjuntos de datos, y combinaremos sus predicciones por votación, de manera similar a como se hace en bagging:

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# In every clusterized dataset the last column ('cluster') is the target and all the
# columns before it are the features. One unconstrained tree is fitted per dataset.

# Trees specialised in names
names_decision_trees = [
    DecisionTreeClassifier().fit(df[df.columns[:-1]], df['cluster'])
    for df in names_datasets_clusterized
]

# Trees specialised in NITs
nits_decision_trees = [
    DecisionTreeClassifier().fit(df[df.columns[:-1]], df['cluster'])
    for df in nits_datasets_clusterized
]

# Trees specialised in entities (names and NITs)
entities_decision_trees = [
    DecisionTreeClassifier().fit(df[df.columns[:-1]], df['cluster'])
    for df in entities_datasets_clusterized
]

In [None]:
from scipy.stats import mode

def predict_with_names_trees(new_data, names_decision_trees=None):
    """Return the group most voted by the name-specialised trees for one record.

    Parameters
    ----------
    new_data : array-like or DataFrame
        Features handed to each tree's ``predict``. Only the prediction for the FIRST
        row is used.
    names_decision_trees : sequence of fitted estimators, optional
        Trees that vote. When omitted, the module-level ``names_decision_trees`` list is
        looked up at call time, so re-training the trees is picked up without having to
        redefine this function.

    Returns
    -------
    The label with the most votes; ties are resolved in favour of the smallest label.
    Votes equal to -1 are ignored. If no tree casts a usable vote, ``nan`` is returned.
    """
    from collections import Counter

    if names_decision_trees is None:
        names_decision_trees = globals()['names_decision_trees']

    votes = Counter()
    for tree in names_decision_trees:
        label = tree.predict(new_data)[0]
        if label != -1:
            votes[label] += 1

    # Handle "nobody voted" explicitly instead of passing an empty list to a mode routine.
    if not votes:
        return float('nan')

    highest = max(votes.values())
    return min(label for label, count in votes.items() if count == highest)

def predict_with_nits_trees(new_data, nits_decision_trees=None):
    """Return the group most voted by the NIT-specialised trees for one record.

    Parameters
    ----------
    new_data : array-like or DataFrame
        Features handed to each tree's ``predict``. Only the prediction for the FIRST
        row is used.
    nits_decision_trees : sequence of fitted estimators, optional
        Trees that vote. When omitted, the module-level ``nits_decision_trees`` list is
        looked up at call time, so re-training the trees is picked up without having to
        redefine this function.

    Returns
    -------
    The label with the most votes; ties are resolved in favour of the smallest label.
    Votes equal to -1 are ignored. If no tree casts a usable vote, ``nan`` is returned.
    """
    from collections import Counter

    if nits_decision_trees is None:
        nits_decision_trees = globals()['nits_decision_trees']

    votes = Counter()
    for tree in nits_decision_trees:
        label = tree.predict(new_data)[0]
        if label != -1:
            votes[label] += 1

    # Handle "nobody voted" explicitly instead of passing an empty list to a mode routine.
    if not votes:
        return float('nan')

    highest = max(votes.values())
    return min(label for label, count in votes.items() if count == highest)

def predict_with_entities_trees(new_data, entities_decision_trees=None):
    """Return the group most voted by the entity-specialised (name + NIT) trees.

    Parameters
    ----------
    new_data : array-like or DataFrame
        Features handed to each tree's ``predict``. Only the prediction for the FIRST
        row is used.
    entities_decision_trees : sequence of fitted estimators, optional
        Trees that vote. When omitted, the module-level ``entities_decision_trees`` list
        is looked up at call time, so re-training the trees is picked up without having
        to redefine this function.

    Returns
    -------
    The label with the most votes; ties are resolved in favour of the smallest label.
    Votes equal to -1 are ignored. If no tree casts a usable vote, ``nan`` is returned.
    """
    from collections import Counter

    if entities_decision_trees is None:
        entities_decision_trees = globals()['entities_decision_trees']

    votes = Counter()
    for tree in entities_decision_trees:
        label = tree.predict(new_data)[0]
        if label != -1:
            votes[label] += 1

    # Handle "nobody voted" explicitly instead of passing an empty list to a mode routine.
    if not votes:
        return float('nan')

    highest = max(votes.values())
    return min(label for label, count in votes.items() if count == highest)

In [None]:
# Embed every NOMBRE of the full dataset. Gradient tracking is switched off because
# this is inference only; otherwise an autograd graph is built for the whole batch.
with torch.no_grad():
    nombres_logits = model(**tokenizer(entidades['NOMBRE'].to_list(), padding=True, truncation=True, return_tensors="pt")).logits.numpy()

# Same standardization and projection objects used for the names representation.
nombres_a_revisar = nombres_entidades_prueba = pd.DataFrame(names_pca.transform(scaler_names.transform(nombres_logits).astype(float)))

for i, row in nombres_a_revisar.iterrows():
    # Turn the row into a single-row DataFrame, the shape the voting function expects.
    single_row_df = row.to_frame().T
    nombres_a_revisar.at[i, 'grupo'] = predict_with_names_trees(single_row_df)

# Attach the predicted group to a COPY, so `entidades` itself keeps only its original
# columns and this cell can be re-run without carrying state over.
entidades_copy = entidades.copy()
entidades_copy['grupo'] = nombres_a_revisar['grupo']
entidades_copy.to_csv('../data/gold/entidades_agrupadas_por_nombres.csv', index=False)

In [None]:
# Embed every NIT of the full dataset. Gradient tracking is switched off because
# this is inference only; otherwise an autograd graph is built for the whole batch.
with torch.no_grad():
    nits_logits = model(**tokenizer(entidades['NIT'].to_list(), padding=True, truncation=True, return_tensors="pt")).logits.numpy()

# Same standardization and projection objects used for the NIT representation.
nits_a_revisar = pd.DataFrame(nits_pca.transform(scaler_nits.transform(nits_logits).astype(float)))

for i, row in nits_a_revisar.iterrows():
    # Turn the row into a single-row DataFrame, the shape the voting function expects.
    single_row_df = row.to_frame().T
    nits_a_revisar.at[i, 'grupo'] = predict_with_nits_trees(single_row_df)

# Attach the predicted group to a COPY, so `entidades` itself keeps only its original
# columns and this cell can be re-run without carrying state over.
entidades_copy = entidades.copy()
entidades_copy['grupo'] = nits_a_revisar['grupo']
entidades_copy.to_csv('../data/gold/entidades_agrupadas_por_nits.csv', index=False)

In [None]:
# Embed every NOMBRE and NIT of the full dataset. Gradient tracking is switched off
# because this is inference only; otherwise an autograd graph is built for both batches.
with torch.no_grad():
    nombres_logits = model(**tokenizer(entidades['NOMBRE'].to_list(), padding=True, truncation=True, return_tensors="pt")).logits.numpy()
    nits_logits = model(**tokenizer(entidades['NIT'].to_list(), padding=True, truncation=True, return_tensors="pt")).logits.numpy()

# Standardize each part with its own scaler, place them side by side (names first,
# NITs after) and project with the PCA of the combined representation.
entidades_a_revisar = pd.DataFrame(entidades_pca.transform(np.concatenate((
    scaler_names.transform(nombres_logits).astype(float),
    scaler_nits.transform(nits_logits).astype(float)), axis=1)))

for i, row in entidades_a_revisar.iterrows():
    # Turn the row into a single-row DataFrame, the shape the voting function expects.
    single_row_df = row.to_frame().T
    entidades_a_revisar.at[i, 'grupo'] = predict_with_entities_trees(single_row_df)

# Attach the predicted group to a COPY, so `entidades` itself keeps only its original
# columns and this cell can be re-run without carrying state over.
entidades_copy = entidades.copy()
entidades_copy['grupo'] = entidades_a_revisar['grupo']
entidades_copy.to_csv('../data/gold/entidades_agrupadas_por_nombres_y_nits.csv', index=False)

In [None]:
# Quick check with a single name: tokenize -> logits -> standardize -> project -> vote.
sample_tokens = tokenizer(['sena'], padding=True, truncation=True, return_tensors="pt")
sample_logits = model(**sample_tokens).logits.detach().numpy()
sample_standardized = scaler_names.transform(sample_logits).astype(float)
val = pd.DataFrame(names_pca.transform(sample_standardized))
classe = predict_with_names_trees(val)

In [None]:
# Show the group returned by predict_with_names_trees for the sample name above.
print(classe)

In [None]:
# Model persistence: one .joblib file per tree, grouped in a directory per representation.
import joblib
from pathlib import Path

for tree_kind, trees in (
    ('names', names_decision_trees),
    ('nits', nits_decision_trees),
    ('entities', entities_decision_trees),
):
    target_dir = Path('../data/gold/model_weights/decision_trees') / tree_kind
    # joblib.dump does not create missing directories; make sure the target exists so
    # the export also works on a fresh checkout.
    target_dir.mkdir(parents=True, exist_ok=True)
    for i, tree in enumerate(trees):
        joblib.dump(tree, str(target_dir / f'{tree_kind}_tree_{i}.joblib'))

In [None]:
# The model and its tokenizer are written to the same directory.
pretrained_dir = '../data/gold/model_weights/tokenizer'
model.save_pretrained(pretrained_dir)
tokenizer.save_pretrained(pretrained_dir)

In [None]:
# Persist the scalers. joblib.dump does not create missing directories, so the target
# folder is created first to make the export work on a fresh checkout.
import os

os.makedirs('../data/gold/model_weights/scalers', exist_ok=True)
joblib.dump(scaler_names, '../data/gold/model_weights/scalers/scaler_names.joblib')
joblib.dump(scaler_nits, '../data/gold/model_weights/scalers/scaler_nits.joblib')

In [None]:
# Persist the PCA projections. joblib.dump does not create missing directories, so the
# target folder is created first to make the export work on a fresh checkout.
import os

os.makedirs('../data/gold/model_weights/pca', exist_ok=True)
joblib.dump(names_pca, '../data/gold/model_weights/pca/names_pca.joblib')
joblib.dump(nits_pca, '../data/gold/model_weights/pca/nits_pca.joblib')
joblib.dump(entidades_pca, '../data/gold/model_weights/pca/entidades_pca.joblib')