In [1]:
import pandas as pd
import numpy as np
import random
import time
from scipy.stats import mode
import sys
import os

# Put the directory two levels above the notebook on the import path
# (the later `functions.*` imports are presumably resolved from there).
sys.path.append(os.path.join(os.pardir, os.pardir))

# Word Embedding
from sentence_transformers import SentenceTransformer

# Seed every global random number generator the notebook can reach.
# Python's `random` alone does not cover NumPy-based randomness (for example
# estimators that draw from NumPy's global RNG when no explicit seed is given);
# presumably this matters for the clustering inside AC_PLT - confirm.
random.seed(0)
np.random.seed(0)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from hydra import initialize, compose

# Compose the project configuration from `main.yaml` in the Hydra config
# directory; the following cells read paths and parameters from `cfg`.
with initialize(config_path="../../conf", version_base=None):
    cfg = compose(config_name="main.yaml")

# Word Embedding

## Word2Vec

In [46]:
# Dataset selection and loading.
# The dataset path is looked up in the config; change the two keys below for your dataset.

# Options are "cpn27" and "cpn120"
dataset_name = "cpn27"

# Options are "raw", "normalize", "normalize_wo_stop", "lemmatize", and "lemmatize_wo_stop"
type_standardization = "lemmatize_wo_stop"
# Sub-folder name used in the output path of the Word2Vec result files.
folder_standardization = "lemmatize"

# Read the dataset and replace missing values with empty strings.
data = pd.read_csv(
    cfg.path[type_standardization][dataset_name],
    delimiter=",",
).fillna(value='')


In [47]:
import gensim
import spacy

# Spanish Billion Words embeddings, stored in binary word2vec format at the
# path given in the config.
model = gensim.models.KeyedVectors.load_word2vec_format(
    cfg.path_embedding.word2vec,
    binary=True,
)

# spaCy pipeline whose name is read from the config.
nlp = spacy.load(cfg.enviroment.nlp)

In [48]:
from functions.Vectorize import to_vector

# Classification Model
from functions.AC_PLT import AC_PLT
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

# ---------------------------------------------------------------------------
# Vectorisation (timed)
# ---------------------------------------------------------------------------
start = time.time()

# Zero matrix holding one embedding per observation
descriptions_matrix = np.zeros(
    (
        data.shape[0],                      # the number of data points
        cfg.params.vector_length.word2vec   # the number of components of the word embedding
    )
)

# Fill each row with the vector of the description found in the second column of `data`
for i, description in enumerate(data.iloc[:, 1]):
    descriptions_matrix[i, :] = to_vector(description, model, cfg.params.vector_length.word2vec)

# Embedding columns first, followed by the original columns of each observation
data_matrix = np.concatenate([descriptions_matrix, data], axis=1)

# Drop the observations whose embedding contains a missing value
data_matrix = data_matrix[~pd.isnull(data_matrix[:, :cfg.params.vector_length.word2vec]).any(axis=1)]

end = time.time()
print("Execution time:", end-start)

# ---------------------------------------------------------------------------
# Stratified cross-validation with top-k accuracy
# ---------------------------------------------------------------------------
n_splits = 5
tops = (1, 3, 5)

# One row per fold, one column per value in `tops`
accuracies_tops_test = np.zeros((n_splits, len(tops)))
accuracies_tops_train = np.zeros((n_splits, len(tops)))

cross_validation = StratifiedKFold(n_splits=n_splits)

# Features are the embedding columns; the target is the column at offset +2
# after them (i.e. the third column of `data`).
X = data_matrix[:, :cfg.params.vector_length.word2vec]
y = data_matrix[:, cfg.params.vector_length.word2vec+2]

for c, (train_index, test_index) in enumerate(cross_validation.split(X, y)):

    X_train, y_train = X[train_index, :], y[train_index]
    X_test, y_test = X[test_index, :], y[test_index]

    classification_model = AC_PLT(n_clusters=cfg.params.kmeans.n_cluster[dataset_name])
    classification_model.fit(X_train, y_train)

    # Ranked suggestions, as many per observation as the largest k evaluated
    pred_ranking_test = classification_model.suggestions(X_test, n_codes=max(tops))
    pred_ranking_train = classification_model.suggestions(X_train, n_codes=max(tops))

    # Top-k accuracy: share of observations whose true label is among the first k suggestions
    for accuracies, labels, ranking in (
        (accuracies_tops_test, y_test, pred_ranking_test),
        (accuracies_tops_train, y_train, pred_ranking_train),
    ):
        accuracies[c] = [
            np.mean([label in ranking[j, :k] for j, label in enumerate(labels)])
            for k in tops
        ]

print(accuracies_tops_train.mean(axis=0))
print(accuracies_tops_test.mean(axis=0))

# ---------------------------------------------------------------------------
# Persist mean / standard deviation over the folds for each split
# ---------------------------------------------------------------------------
for split_name, accuracies in (("test", accuracies_tops_test), ("train", accuracies_tops_train)):
    output_dir = f'../../../data/experiment-top5/experiment_1/{split_name}/{folder_standardization}'
    # Create the results folder up front so the write cannot fail on a missing directory
    os.makedirs(output_dir, exist_ok=True)
    pd.DataFrame(
        np.array([[f'top-{k}' for k in tops], accuracies.mean(axis=0), accuracies.std(axis=0)]).T,
        columns=['top position', 'mean accuracy', 'standar desviation']
    ).to_csv(f'{output_dir}/word2vec_kmeans_{dataset_name}_{type_standardization}_top5.csv', index=False)

Execution time: 0.27165865898132324




[0.49584843 0.54728613 0.56267693]
[0.39590978 0.462333   0.48622949]


# Sentence Embedding

In [49]:
# Dataset selection and loading for the sentence-embedding experiments.
# The dataset path is looked up in the config; change the two keys below for your dataset.

# Options are "cpn27" and "cpn120"
dataset_name = "cpn27"

# Options are "raw", "normalize", "normalize_wo_stop", and "lemmatize"
type_standardization = "lemmatize"

# Read the dataset and replace missing values with empty strings.
data = pd.read_csv(
    cfg.path[type_standardization][dataset_name],
    delimiter=",",
).fillna(value='')

## LaBSE

In [50]:
# Classification Model
from functions.AC_PLT import AC_PLT
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

model = SentenceTransformer('sentence-transformers/LaBSE')

# ---------------------------------------------------------------------------
# Vectorisation (timed)
# ---------------------------------------------------------------------------
start = time.time()

# Zero matrix holding one embedding per observation
descriptions_matrix = np.zeros(
    (
        len(data),                                       # the number of data points
        cfg.params.vector_length.sentence_embedding      # the number of components of the sentence embedding
    )
)

# Matrix filling: the descriptions are read from the second column of `data`.
# Change the column to the one holding the descriptions of your dataset.
for i, description in enumerate(data.iloc[:, 1]):
    descriptions_matrix[i, :] = model.encode(description)

# Embedding columns first, followed by the original columns of each observation
data_matrix = np.concatenate([descriptions_matrix, data], axis=1)

# Drop the observations whose embedding contains a missing value
data_matrix = data_matrix[~pd.isnull(data_matrix[:, :cfg.params.vector_length.sentence_embedding]).any(axis=1)]

end = time.time()
print("Exceution time:", end-start)

# ---------------------------------------------------------------------------
# Stratified cross-validation with top-k accuracy
# ---------------------------------------------------------------------------
n_splits = 5
tops = (1, 3, 5)

# One row per fold, one column per value in `tops`
accuracies_tops_test = np.zeros((n_splits, len(tops)))
accuracies_tops_train = np.zeros((n_splits, len(tops)))

cross_validation = StratifiedKFold(n_splits=n_splits)

# Features are the embedding columns; the target is the column at offset +2
# after them (i.e. the third column of `data`).
X = data_matrix[:, :cfg.params.vector_length.sentence_embedding]
y = data_matrix[:, cfg.params.vector_length.sentence_embedding+2]

for c, (train_index, test_index) in enumerate(cross_validation.split(X, y)):

    X_train, y_train = X[train_index, :], y[train_index]
    X_test, y_test = X[test_index, :], y[test_index]

    classification_model = AC_PLT(n_clusters=cfg.params.kmeans.n_cluster[dataset_name])
    classification_model.fit(X_train, y_train)

    # Ranked suggestions, as many per observation as the largest k evaluated
    pred_ranking_test = classification_model.suggestions(X_test, n_codes=max(tops))
    pred_ranking_train = classification_model.suggestions(X_train, n_codes=max(tops))

    # Top-k accuracy: share of observations whose true label is among the first k suggestions
    for accuracies, labels, ranking in (
        (accuracies_tops_test, y_test, pred_ranking_test),
        (accuracies_tops_train, y_train, pred_ranking_train),
    ):
        accuracies[c] = [
            np.mean([label in ranking[j, :k] for j, label in enumerate(labels)])
            for k in tops
        ]

print(accuracies_tops_train.mean(axis=0))
print(accuracies_tops_test.mean(axis=0))

# ---------------------------------------------------------------------------
# Persist mean / standard deviation over the folds for each split
# ---------------------------------------------------------------------------
for split_name, accuracies in (("test", accuracies_tops_test), ("train", accuracies_tops_train)):
    output_dir = f'../../../data/experiment-top5/experiment_1/{split_name}/{type_standardization}'
    # Create the results folder up front so the write cannot fail on a missing directory
    os.makedirs(output_dir, exist_ok=True)
    pd.DataFrame(
        np.array([[f'top-{k}' for k in tops], accuracies.mean(axis=0), accuracies.std(axis=0)]).T,
        columns=['top position', 'mean accuracy', 'standar desviation']
    ).to_csv(f'{output_dir}/LaBSE_kmeans_{dataset_name}_{type_standardization}_top5.csv', index=False)



Exceution time: 140.41847324371338




[0.4442085  0.50091147 0.52293461]
[0.31733651 0.39712764 0.43135211]


## BETO

In [51]:
# Classification Model
from functions.AC_PLT import AC_PLT
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

# This checkpoint is not a native sentence-transformers model: the library
# wraps it with MEAN pooling (see the warning printed in the cell output).
model = SentenceTransformer('dccuchile/bert-base-spanish-wwm-cased')

# ---------------------------------------------------------------------------
# Vectorisation (timed)
# ---------------------------------------------------------------------------
start = time.time()

# Zero matrix holding one embedding per observation
descriptions_matrix = np.zeros(
    (
        len(data),                                       # the number of data points
        cfg.params.vector_length.sentence_embedding      # the number of components of the sentence embedding
    )
)

# Matrix filling: the descriptions are read from the second column of `data`.
# Change the column to the one holding the descriptions of your dataset.
for i, description in enumerate(data.iloc[:, 1]):
    descriptions_matrix[i, :] = model.encode(description)

# Embedding columns first, followed by the original columns of each observation
data_matrix = np.concatenate([descriptions_matrix, data], axis=1)

# Drop the observations whose embedding contains a missing value
data_matrix = data_matrix[~pd.isnull(data_matrix[:, :cfg.params.vector_length.sentence_embedding]).any(axis=1)]

end = time.time()
print("Exceution time:", end-start)

# ---------------------------------------------------------------------------
# Stratified cross-validation with top-k accuracy
# ---------------------------------------------------------------------------
n_splits = 5
tops = (1, 3, 5)

# One row per fold, one column per value in `tops`
accuracies_tops_test = np.zeros((n_splits, len(tops)))
accuracies_tops_train = np.zeros((n_splits, len(tops)))

cross_validation = StratifiedKFold(n_splits=n_splits)

# Features are the embedding columns; the target is the column at offset +2
# after them (i.e. the third column of `data`).
X = data_matrix[:, :cfg.params.vector_length.sentence_embedding]
y = data_matrix[:, cfg.params.vector_length.sentence_embedding+2]

for c, (train_index, test_index) in enumerate(cross_validation.split(X, y)):

    X_train, y_train = X[train_index, :], y[train_index]
    X_test, y_test = X[test_index, :], y[test_index]

    classification_model = AC_PLT(n_clusters=cfg.params.kmeans.n_cluster[dataset_name])
    classification_model.fit(X_train, y_train)

    # Ranked suggestions, as many per observation as the largest k evaluated
    pred_ranking_test = classification_model.suggestions(X_test, n_codes=max(tops))
    pred_ranking_train = classification_model.suggestions(X_train, n_codes=max(tops))

    # Top-k accuracy: share of observations whose true label is among the first k suggestions
    for accuracies, labels, ranking in (
        (accuracies_tops_test, y_test, pred_ranking_test),
        (accuracies_tops_train, y_train, pred_ranking_train),
    ):
        accuracies[c] = [
            np.mean([label in ranking[j, :k] for j, label in enumerate(labels)])
            for k in tops
        ]

print(accuracies_tops_train.mean(axis=0))
print(accuracies_tops_test.mean(axis=0))

# ---------------------------------------------------------------------------
# Persist mean / standard deviation over the folds for each split
# ---------------------------------------------------------------------------
for split_name, accuracies in (("test", accuracies_tops_test), ("train", accuracies_tops_train)):
    output_dir = f'../../../data/experiment-top5/experiment_1/{split_name}/{type_standardization}'
    # Create the results folder up front so the write cannot fail on a missing directory
    os.makedirs(output_dir, exist_ok=True)
    pd.DataFrame(
        np.array([[f'top-{k}' for k in tops], accuracies.mean(axis=0), accuracies.std(axis=0)]).T,
        columns=['top position', 'mean accuracy', 'standar desviation']
    ).to_csv(f'{output_dir}/BETO_kmeans_{dataset_name}_{type_standardization}_top5.csv', index=False)

No sentence-transformers model found with name C:\Users\dra98/.cache\torch\sentence_transformers\dccuchile_bert-base-spanish-wwm-cased. Creating a new one with MEAN pooling.
Some weights of BertModel were not initialized from the model checkpoint at C:\Users\dra98/.cache\torch\sentence_transformers\dccuchile_bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Exceution time: 142.71896767616272




[0.36750736 0.39241634 0.40689587]
[0.22904417 0.26630713 0.29182264]


## E5

In [52]:
# Classification Model
from functions.AC_PLT import AC_PLT
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

# NOTE(review): the E5 model card reportedly expects every input to start with
# "query: " or "passage: "; the descriptions are encoded without a prefix here.
# Confirm that this is intended before comparing against the other models.
model = SentenceTransformer('intfloat/e5-base-v2')

# ---------------------------------------------------------------------------
# Vectorisation (timed)
# ---------------------------------------------------------------------------
start = time.time()

# Zero matrix holding one embedding per observation
descriptions_matrix = np.zeros(
    (
        len(data),                                       # the number of data points
        cfg.params.vector_length.sentence_embedding      # the number of components of the sentence embedding
    )
)

# Matrix filling: the descriptions are read from the second column of `data`.
# Change the column to the one holding the descriptions of your dataset.
for i, description in enumerate(data.iloc[:, 1]):
    descriptions_matrix[i, :] = model.encode(description)

# Embedding columns first, followed by the original columns of each observation
data_matrix = np.concatenate([descriptions_matrix, data], axis=1)

# Drop the observations whose embedding contains a missing value
data_matrix = data_matrix[~pd.isnull(data_matrix[:, :cfg.params.vector_length.sentence_embedding]).any(axis=1)]

end = time.time()
print("Exceution time:", end-start)

# ---------------------------------------------------------------------------
# Stratified cross-validation with top-k accuracy
# ---------------------------------------------------------------------------
n_splits = 5
tops = (1, 3, 5)

# One row per fold, one column per value in `tops`
accuracies_tops_test = np.zeros((n_splits, len(tops)))
accuracies_tops_train = np.zeros((n_splits, len(tops)))

cross_validation = StratifiedKFold(n_splits=n_splits)

# Features are the embedding columns; the target is the column at offset +2
# after them (i.e. the third column of `data`).
X = data_matrix[:, :cfg.params.vector_length.sentence_embedding]
y = data_matrix[:, cfg.params.vector_length.sentence_embedding+2]

for c, (train_index, test_index) in enumerate(cross_validation.split(X, y)):

    X_train, y_train = X[train_index, :], y[train_index]
    X_test, y_test = X[test_index, :], y[test_index]

    classification_model = AC_PLT(n_clusters=cfg.params.kmeans.n_cluster[dataset_name])
    classification_model.fit(X_train, y_train)

    # Ranked suggestions, as many per observation as the largest k evaluated
    pred_ranking_test = classification_model.suggestions(X_test, n_codes=max(tops))
    pred_ranking_train = classification_model.suggestions(X_train, n_codes=max(tops))

    # Top-k accuracy: share of observations whose true label is among the first k suggestions
    for accuracies, labels, ranking in (
        (accuracies_tops_test, y_test, pred_ranking_test),
        (accuracies_tops_train, y_train, pred_ranking_train),
    ):
        accuracies[c] = [
            np.mean([label in ranking[j, :k] for j, label in enumerate(labels)])
            for k in tops
        ]

print(accuracies_tops_train.mean(axis=0))
print(accuracies_tops_test.mean(axis=0))

# ---------------------------------------------------------------------------
# Persist mean / standard deviation over the folds for each split
# ---------------------------------------------------------------------------
for split_name, accuracies in (("test", accuracies_tops_test), ("train", accuracies_tops_train)):
    output_dir = f'../../../data/experiment-top5/experiment_1/{split_name}/{type_standardization}'
    # Create the results folder up front so the write cannot fail on a missing directory
    os.makedirs(output_dir, exist_ok=True)
    pd.DataFrame(
        np.array([[f'top-{k}' for k in tops], accuracies.mean(axis=0), accuracies.std(axis=0)]).T,
        columns=['top position', 'mean accuracy', 'standar desviation']
    ).to_csv(f'{output_dir}/E5_kmeans_{dataset_name}_{type_standardization}_top5.csv', index=False)

Exceution time: 160.47151398658752




[0.51255598 0.5705247  0.59052264]
[0.40178371 0.4896755  0.51822888]


## CANINE

In [53]:
# Open the CSV that holds the precomputed numeric vectors.

# Options are "cpn27" and "cpn120"
dataset_name = "cpn27"

# Options are "raw", "normalize", "normalize_wo_stop", and "lemmatize"
type_standardization = "lemmatize"

# Only the "lemmatize" file is read with its "Unnamed: 0" column as the index;
# every other file is read with pandas' default (no index column).
data_matrix = pd.read_csv(
    cfg.path_embedding_canine[dataset_name][type_standardization],
    delimiter=",",
    index_col="Unnamed: 0" if type_standardization == "lemmatize" else None,
).to_numpy()


In [54]:

# Classification Model
from functions.AC_PLT import AC_PLT
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

# ---------------------------------------------------------------------------
# Stratified cross-validation with top-k accuracy
# ---------------------------------------------------------------------------
n_splits = 5
tops = (1, 3, 5)

# One row per fold, one column per value in `tops`
accuracies_tops_test = np.zeros((n_splits, len(tops)))
accuracies_tops_train = np.zeros((n_splits, len(tops)))

cross_validation = StratifiedKFold(n_splits=n_splits)

# Features are the leading embedding columns of the loaded matrix; the target
# is the column at offset +2 after them.
X = data_matrix[:, :cfg.params.vector_length.sentence_embedding]
y = data_matrix[:, cfg.params.vector_length.sentence_embedding+2]

for c, (train_index, test_index) in enumerate(cross_validation.split(X, y)):

    X_train, y_train = X[train_index, :], y[train_index]
    X_test, y_test = X[test_index, :], y[test_index]

    classification_model = AC_PLT(n_clusters=cfg.params.kmeans.n_cluster[dataset_name])
    classification_model.fit(X_train, y_train)

    # Ranked suggestions, as many per observation as the largest k evaluated
    pred_ranking_test = classification_model.suggestions(X_test, n_codes=max(tops))
    pred_ranking_train = classification_model.suggestions(X_train, n_codes=max(tops))

    # Top-k accuracy: share of observations whose true label is among the first k suggestions
    for accuracies, labels, ranking in (
        (accuracies_tops_test, y_test, pred_ranking_test),
        (accuracies_tops_train, y_train, pred_ranking_train),
    ):
        accuracies[c] = [
            np.mean([label in ranking[j, :k] for j, label in enumerate(labels)])
            for k in tops
        ]

print(accuracies_tops_train.mean(axis=0))
print(accuracies_tops_test.mean(axis=0))

# ---------------------------------------------------------------------------
# Persist mean / standard deviation over the folds for each split
# ---------------------------------------------------------------------------
for split_name, accuracies in (("test", accuracies_tops_test), ("train", accuracies_tops_train)):
    output_dir = f'../../../data/experiment-top5/experiment_1/{split_name}/{type_standardization}'
    # Create the results folder up front so the write cannot fail on a missing directory
    os.makedirs(output_dir, exist_ok=True)
    pd.DataFrame(
        np.array([[f'top-{k}' for k in tops], accuracies.mean(axis=0), accuracies.std(axis=0)]).T,
        columns=['top position', 'mean accuracy', 'standar desviation']
    ).to_csv(f'{output_dir}/canine_kmeans_{dataset_name}_{type_standardization}_top5.csv', index=False)



[0.30052671 0.30579202 0.31065221]
[0.16464443 0.17375722 0.18104857]


In [55]:
# Display the number of clusters configured for the current dataset
# (the value passed as `n_clusters` to AC_PLT in the experiments above).
cfg.params.kmeans.n_cluster[dataset_name]

550