In [1]:
import pandas as pd
import numpy as np
import random
import time
from scipy.stats import mode
import sys
import os

sys.path.append(os.path.join(".."))
# Word Embedding
from sentence_transformers import SentenceTransformer

# Created functions
from functions.Experimentations import ParamSearch
random.seed(0)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from collections import Counter 

def most_frequent(self, List:list) -> list: 
    """
    Recives a list of words, and return the word most frequente of
    the list
    """
    # counter of occurence of a code in a list
    occurence_count = Counter(List) 

    # Return the first code with more occurence
    return occurence_count.most_common

In [3]:
vector_length = 768

In [4]:
# Data import
# Relative Path of the dataset, change for your dataset
dataset_name = "cpn27"
file_path = '../data/raw_dataset/normalize/'

if dataset_name == "cpn27":
    # data = pd.read_csv(file_path+'CPN27_lemma.csv', delimiter=",")
    data = pd.read_csv(file_path+'CPN27_normalize.csv', delimiter=",")
    
elif dataset_name == "cpn120":
    data = pd.read_csv(file_path+'CPN120_normalize.csv', delimiter=",")

# else:
#     data = pd.read_csv(r'your-path/your-file.csv', delimiter=",")

data.fillna(value='')


Unnamed: 0,Palabra (concepto),Descripción,Codificación
0,Compasión,sentimiento,sentimiento
1,Compasión,por lo que se le perdona la vida a alguien en ...,perdón
2,plan,necesario para cumplir objetivos,objetivos
3,plan,organizar recursos o personas,organización
4,plan,estrategia,estrategia
...,...,...,...
4933,Obligación,imposicion,imposición
4934,Obligación,carácter extricto,extricto
4935,Obligación,normas,normas
4936,Obligación,deberes,deber


In [20]:
# model = SentenceTransformer('intfloat/multilingual-e5-base')
model = SentenceTransformer('intfloat/e5-base-v2')

In [21]:
# Timer
start = time.time()

descriptions_matrix = np.zeros( # creatign an empty matrix
    (
        len(data),          # the number of data points
        vector_length       # the number of components of the word embedding
    )
)

# Matrix filling 
# Change to the name of the descriptions of your dataset.
for i,description in enumerate(data.iloc[:,1]):
    vector = model.encode(description)
    descriptions_matrix[i,] = vector

# Concatenate the matrix with the data of each observation
data_matrix = np.concatenate([descriptions_matrix,data], axis=1)


# Remove of the 'Nan' data
data_matrix = data_matrix[~pd.isnull(data_matrix[:,:vector_length]).any(axis=1)]


end = time.time()
print("Exceution time:", end-start)

Exceution time: 151.56394481658936


In [22]:
def acc_top_n(pred_val, true_val, top = 1):
    acc = 0
    for i in range(len(true_val)):
        occurence_count = Counter(pred_val[i]) 
        for code in occurence_count.most_common(top):
            if code[0] == true_val[i]:
                acc += 1
                break
            else:
                pass
    return acc

In [23]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold

accuracies = np.zeros((5,3))

cross_validation = StratifiedKFold(n_splits = 5)

X = data_matrix[:, :vector_length] 
y = data_matrix[:, vector_length+2]


for c, (train_index, test_index) in enumerate(cross_validation.split(X, y)):


    X_train = X[train_index, :]
    y_train = y[train_index]

    X_test = X[test_index, :]
    y_test = y[test_index]

    knn = KNeighborsClassifier(n_neighbors=10)
    knn.fit(X_train, y_train)

    distance, index = knn.kneighbors(X=X_test, n_neighbors=10)

    pred_ranking = y_train[index]

        
    accuracies[c][0]=acc_top_n(pred_ranking, y_test, 1)/len(y_test)
    accuracies[c][1]=acc_top_n(pred_ranking, y_test, 3)/len(y_test)
    accuracies[c][2]=acc_top_n(pred_ranking, y_test, 5)/len(y_test)

    # break



In [24]:
pd.DataFrame(
    np.array([['top-1', 'top-3', 'top-5'], accuracies.mean(axis=0), accuracies.std(axis=0)]).T,
    columns=['top position', 'mean accuracy', 'standar desviation']
    ).to_csv(f'../data/experiment-top5/{dataset_name}_top5.csv', index=False)