# CPN generator

In [1]:
import pandas as pd
import numpy as np
import spacy
import gensim
from collections import Counter 
import time
import sklearn.cluster
import random

# Seed both generators: the random baselines in this notebook draw with
# np.random.choice, which uses NumPy's global generator and is not affected
# by random.seed.
random.seed(0)
np.random.seed(0)

In [2]:
# data import
# NOTE(review): the two paths differ in case ('../data' vs '../Data'); on a
# case-sensitive filesystem only one of them can exist - confirm which is right.
DATA_PATH = r'../data/Datos Codificados Estudio v2a.xlsx'
EMBEDDINGS_PATH = r'../Data/SBW-vectors-300-min5.bin'  # Spanish Billion Words embeddings

# Coded study data, read into a DataFrame
data = pd.read_excel(DATA_PATH)

# Pre-trained word2vec vectors; the file is stored in binary format
model = gensim.models.KeyedVectors.load_word2vec_format(EMBEDDINGS_PATH, binary=True)

In [3]:
# spaCy configuration: load the Spanish pipeline used by clean_text
SPACY_MODEL = 'es_core_news_sm'
nlp = spacy.load(SPACY_MODEL)
# Extra stop words could be registered like this:
# nlp.Defaults.stop_words.add("my_new_stopword")

## Funciones Importantes

In [4]:

def clean_text(text):
    """
    Lowercase a string, run it through the module-level spaCy pipeline
    `nlp`, and return the resulting tokens that are not punctuation.

    Returns a list of token objects (not plain strings), in document order.
    """
    doc = nlp(text.lower())
    # drop every token that spaCy flags as punctuation
    return list(filter(lambda token: not token.is_punct, doc))


def normalize(tokens):
    """
    Receive an iterable of tokens (objects exposing `is_stop` and `orth_`)
    and return a single space-separated string made of the original text of
    every token that is not a stop word.
    """
    kept = []
    for token in tokens:
        if token.is_stop:
            # stop words are left out of the result
            continue
        kept.append(token.orth_)

    # glue the surviving words back together
    return " ".join(kept)


# TODO: redo the lemmatization process
def lematize(tokens):
    """
    Receive an iterable of tokens (objects exposing `is_stop` and `lemma_`)
    and return a single space-separated string made of the lemma of every
    token that is not a stop word.
    """
    # lemma of each non-stop-word token, joined in the original order
    return " ".join(
        token.lemma_
        for token in tokens
        if not token.is_stop
    )

def to_vector(texto,model):
    """ 
    Receives a sentence string along with a word embedding model and 
    returns the unit-length vector representation of the sentence.

    The sentence is split on whitespace; the vectors of every word found in
    `model` are summed and the sum is divided by its Euclidean norm.

    Parameters
    ----------
    texto : str
        Sentence to embed.
    model : mapping-like
        Must support `word in model` and `model[word]`. When it exposes a
        `vector_size` attribute, that value is used as the embedding
        dimension; otherwise 300 is assumed.

    Returns
    -------
    numpy.ndarray
        1-D vector of the embedding dimension. When no word of the sentence
        is in the model (zero norm) the vector is filled with NaN, so callers
        can detect and drop such sentences.
    """
    dim = getattr(model, 'vector_size', 300) # embedding dimension
    vec = np.zeros(dim) # accumulator for the sum of word vectors
    for word in texto.split(): # iterates over the whitespace-separated words
        if word in model: # words missing from the embedding are ignored
            vec += model[word]

    norm = np.linalg.norm(vec)
    if norm == 0:
        # Same NaN result that 0/0 would give, without the RuntimeWarning
        return np.full(dim, np.nan)
    return vec / norm # divides the vector by its norm


# Preprocesamiento de los datos

In [5]:
# Kept so the pandas settings seen by later cells stay unchanged; the
# assignment below no longer relies on chained indexing.
pd.options.mode.chained_assignment = None

# Normalization of the concepts, descriptions and codifications.
# Each column is replaced as a whole: writing through data[col][i] is chained
# assignment, which may modify a temporary copy instead of `data`.
for column in ('Palabra (concepto)', 'Descripción', 'Codificación'):
    data[column] = data[column].map(lambda text: normalize(clean_text(text)))

## Generación de Matriz de los datos

In [6]:
# Creation of the matrix used by the clustering process:
# one row per observation, one column per embedding component.
embedding_dim = len(model['hola']) # the number of components of the word embedding
descriptions_matrix = np.zeros((len(data), embedding_dim))

# matrix filling: one sentence vector per description
for i, description in enumerate(data['Descripción']):
    vector = to_vector(description, model)
    descriptions_matrix[i, :] = vector

# Concatenate the embeddings with every column of `data`
# (the columns of `data` start at position `embedding_dim`)
data_matrix = np.concatenate([descriptions_matrix, data], axis=1)

#--------------------------------------------------------------------------------------
# This is the important matrix
# Remove the rows whose embedding contains NaN
has_nan_embedding = pd.isnull(data_matrix[:, :embedding_dim]).any(axis=1)
data_matrix_without_nan = data_matrix[~has_nan_embedding]
#--------------------------------------------------------------------------------------

# Reduce the matrix to the embedding plus three columns of `data`, selected
# by position (3rd, 7th and 4th column of `data`).
# NOTE(review): the column meanings below come from the original comments;
# confirm them against the spreadsheet layout.
data_matrix_without_nan = np.concatenate(
    (
        data_matrix_without_nan[:, :embedding_dim],        # Vector of the description
        data_matrix_without_nan[:, [embedding_dim + 2]],   # Cue/Concept
        data_matrix_without_nan[:, [embedding_dim + 6]],   # Codification
        data_matrix_without_nan[:, [embedding_dim + 3]],   # Description of the cue
    ),
    axis=1
)


## Testing random generations

In [8]:
# Generation of test-train data
from sklearn.model_selection import KFold

# 5-fold splitter without shuffling (shuffle=False is the KFold default)
kfold = KFold(n_splits=5, shuffle=False)

In [13]:
# Baseline: for every test row, guess a codification drawn uniformly from all
# codifications seen in the training fold, and record the hit rate per fold.
rand_acc = []
for train_index, test_index in kfold.split(data_matrix_without_nan):
    train = data_matrix_without_nan[train_index, :]
    test = data_matrix_without_nan[test_index, :]

    # distinct values of column 301 in the training fold
    train_codes = np.unique(train[:, 301])

    len_test = len(test)
    # one independent random draw per test row, in row order
    hits = sum(
        test[row, 301] == np.random.choice(train_codes)
        for row in range(len_test)
    )
    rand_acc.append(hits / len_test)

# mean and standard deviation of the per-fold hit rate
print(np.mean(rand_acc), np.std(rand_acc), sep="\n")


0.0012545743104104092
0.0010250251045891628


In [11]:
# Baseline: for every test row, guess a codification drawn uniformly from the
# codifications that the training fold has for the same value of column 300.
rand_concept_acc = []
for train_index, test_index in kfold.split(data_matrix_without_nan):
    train = data_matrix_without_nan[train_index, :]
    test = data_matrix_without_nan[test_index, :]

    # column-300 value -> distinct column-301 values observed with it in training
    concept_cluster = {}
    for concept in np.unique(train[:, 300]):
        same_concept = train[:, 300] == concept
        concept_cluster[concept] = np.unique(train[same_concept, 301])

    len_test = len(test)
    # one independent random draw per test row, in row order
    hits = sum(
        test[row, 301] == np.random.choice(concept_cluster[test[row, 300]])
        for row in range(len_test)
    )
    rand_concept_acc.append(hits / len_test)

# mean and standard deviation of the per-fold hit rate
print(np.mean(rand_concept_acc), np.std(rand_concept_acc), sep="\n")


0.008152219059736013
0.001386527089724892


## Model Creation

In [15]:
# Class creation
class PLT_processor:
    """
    K-Means based classifier that assigns a codification to a description
    vector.

    Train and test matrices are 2-D arrays where columns [0, N_FEATURES) hold
    the description vector, column CONCEPT_COL holds the concept and column
    LABEL_COL holds the human codification.

    Typical use: `PLT_processor().process(train, test)`.
    """

    # Column layout of the train/test matrices
    N_FEATURES = 300
    CONCEPT_COL = 300
    LABEL_COL = 301

    def __init__(self, method = 0):
        """
        Receives the type of method to use, 
        - 0: Classify using K-Means (Simple)
        - 1: Classify to the codification with the same concept 

        With method 1, model_train additionally records which clusters each
        concept falls into; the prediction steps of this class do not use
        that information yet.
        """
        self.method = method # Method of processing the data
        self.KMeans_dict = {}        # cluster id -> list of human codifications in it
        self.KMeans_categories = {}  # cluster id -> most frequent codification

        # Filled by model_train when method == 1
        self.concept_cluster = {}          # concept -> sorted cluster ids it appears in
        self.inverse_concept_cluster = {}  # concept -> boolean mask, True where the cluster is NOT used

        # Filled by get_distances / set_labels. Nothing here depends on
        # notebook globals, so the class can be instantiated on a fresh kernel.
        self.topk = None
        self.topKS = None

    def most_frequent(self, List): 
        """
        Receives a list of words and returns the most frequent one.

        Ties are resolved by Counter.most_common, i.e. the value that was
        counted first wins. Raises IndexError on an empty list.
        """
        # ? Add the phrase 'PENDIENTE' when several codifications
        # ? share the same frequency ?
        occurence_count = Counter(List) 
        return occurence_count.most_common(1)[0][0] 


    def model_train(self, train, n_clusters):
        """
        Receives the train dataset and the number of clusters, fits the
        k-means model on the description vectors and labels every cluster
        with the most frequent human codification among its members.

        Sets `kemean`, `KMeans_dict`, `KMeans_categories` and
        `train_accuracy` (fraction of training rows whose cluster label
        equals their human codification).
        """
        self.kemean = sklearn.cluster.KMeans(n_clusters=n_clusters, random_state=0)
        self.kemean.fit(train[:, :self.N_FEATURES])

        # One row per training observation
        df = pd.DataFrame({
            'Human': train[:, self.LABEL_COL],
            'KMeans': self.kemean.labels_,
            'Concept': train[:, self.CONCEPT_COL],
        })

        self.KMeans_dict = df.groupby(by='KMeans')['Human'].apply(list).to_dict()
        # Rebuilt from scratch so entries from a previous training never survive
        self.KMeans_categories = {
            key: self.most_frequent(val) for key, val in self.KMeans_dict.items()
        }
        df['KM_Prediction'] = df['KMeans'].map(self.KMeans_categories)
        self.train_accuracy = (df['Human'] == df['KM_Prediction']).mean()

        if self.method == 1:
            cluster_ids = np.arange(n_clusters)
            # clusters in which each concept appears
            self.concept_cluster = {
                concept: np.unique(clusters.to_numpy())
                for concept, clusters in df.groupby('Concept')['KMeans']
            }
            # complementary mask: True for clusters the concept never falls into
            self.inverse_concept_cluster = {
                concept: ~np.isin(cluster_ids, clusters)
                for concept, clusters in self.concept_cluster.items()
            }

    
    def get_distances(self, test):
        """
        Receives the test data and computes the Euclidean distance of every
        row to every cluster center. Stores in `self.topk` a matrix
        (rows x clusters) with the cluster ids of each row sorted from
        nearest to farthest.
        """
        centers = self.kemean.cluster_centers_
        # Cast once, outside the loop (the matrix may have object dtype)
        features = test[:, :self.N_FEATURES].astype(float)

        # Distance matrix of each test point to each cluster center
        distance_matrix = np.zeros((test.shape[0], centers.shape[0]))
        for i, center in enumerate(centers):
            # Euclidean distance of each point to the i-th cluster center
            distance_matrix[:, i] = np.sqrt(np.sum((features - center) ** 2, axis=1))

        # Sorting distances
        self.topk = np.argsort(distance_matrix, axis=1)

    
    def set_labels(self):
        """
        Builds `self.topKS`, a DataFrame with the same shape as `self.topk`
        where every cluster id is replaced by the codification assigned to
        that cluster (None for a cluster id without codification).
        """
        n_clusters = self.topk.shape[1]
        # Lookup indexed by cluster id, so it does not depend on dict order
        label_by_cluster = np.array(
            [self.KMeans_categories.get(cluster) for cluster in range(n_clusters)],
            dtype=object,
        )
        self.topKS = pd.DataFrame(label_by_cluster[self.topk])

        # To add for Experiment 1
        # # Remove of the clusters that not share the same concept
        # topKS.iloc[i,~np.isin(topk[i,:],data_test[0].map(concept_cluster).iloc[i])] = np.nan

    def get_accuracies(self, test):
        """
        Receives the test matrix and returns a 1-D array whose k-th entry is
        the fraction of test rows whose human codification appears among the
        labels of their k+1 nearest clusters.
        """
        n_rows, n_ranks = self.topKS.shape
        truth = np.asarray(test[:n_rows, self.LABEL_COL], dtype=object).reshape(-1, 1)

        # matches[i, k] is True when the k-th nearest cluster of row i has the right label
        matches = np.asarray(self.topKS.to_numpy() == truth, dtype=bool)
        found = matches.any(axis=1)
        first_hit = matches.argmax(axis=1)  # only meaningful where `found`

        # A row counts as correct at rank k when its first hit is at rank <= k
        hits_per_rank = np.bincount(first_hit[found], minlength=n_ranks)
        accuracies = np.cumsum(hits_per_rank) / n_rows

        return accuracies

    def process(self, train, test, n_clusters=500):
        """
        Trains on `train` with `n_clusters` clusters, ranks the clusters for
        every row of `test` and returns the accuracies array described in
        get_accuracies.
        """
        self.model_train(train, n_clusters)
        self.get_distances(test)
        self.set_labels()
        return self.get_accuracies(test)


In [19]:
from sklearn.model_selection import KFold

kfold=KFold(n_splits=5)

# One slot per fold; sized from the splitter so it cannot drift from n_splits
method0_acc = np.zeros(kfold.get_n_splits())

for i, (train_index, test_index) in enumerate(kfold.split(data_matrix_without_nan)):
    train = data_matrix_without_nan[train_index, :]
    test = data_matrix_without_nan[test_index, :]

    proc = PLT_processor(0)
    acc = proc.process(train, test)

    # acc[0] is the accuracy when only the nearest cluster is considered
    method0_acc[i] = acc[0]

K-means training process: 19.020500659942627
Accuracy training:  0.7481713688610241
Time calculating distances and sorting indexes: 25.88062810897827
Changing numerics values to the label: 32.96566295623779
Calculating accuracies: 8.335387945175171
K-means training process: 13.669280529022217
Accuracy training:  0.7366771159874608
Time calculating distances and sorting indexes: 25.39811873435974
Changing numerics values to the label: 30.53159189224243
Calculating accuracies: 8.151011228561401
K-means training process: 12.505334615707397
Accuracy training:  0.7429467084639498
Time calculating distances and sorting indexes: 23.760058879852295
Changing numerics values to the label: 30.592565059661865
Calculating accuracies: 7.9168572425842285
K-means training process: 13.809251546859741
Accuracy training:  0.7189132706374086
Time calculating distances and sorting indexes: 22.428674459457397
Changing numerics values to the label: 29.6474928855896
Calculating accuracies: 7.359650373458862
K

In [20]:
# mean and standard deviation of the per-fold accuracy, one per line
print(method0_acc.mean(), method0_acc.std(), sep="\n")

0.2919095034756893
0.011061183762064618


In [14]:
from sklearn.model_selection import train_test_split

# Fixed random_state so the hold-out split is the same on every run
train, test = train_test_split(data_matrix_without_nan, random_state=0)

proc = PLT_processor()
acc = proc.process(train, test)