# CPN generator

In [34]:
import pandas as pd
import numpy as np
import spacy
import gensim
from collections import Counter 
import time
import sklearn.cluster
import random

# Seed every random generator the notebook draws from. The random baselines
# further down sample with np.random.choice, which random.seed() does not
# affect, so NumPy's global generator has to be seeded as well.
random.seed(0)
np.random.seed(0)

In [2]:
# Data import: the coded study spreadsheet and the pre-trained word embeddings.
# NOTE(review): the two paths differ in case ('../data' vs '../Data'); on a
# case-sensitive filesystem only one of them can resolve - confirm the real
# directory name.
data = pd.read_excel(r'../data/Datos Codificados Estudio v2a.xlsx')
model = gensim.models.KeyedVectors.load_word2vec_format( 
    r'../Data/SBW-vectors-300-min5.bin', # using the spanish billion words embeddings
    binary=True # the model is in binary format
)

In [3]:
# spaCy configuration: load the small Spanish pipeline used by the text helpers.
nlp = spacy.load('es_core_news_sm')
# Example of how an extra stop word could be registered (left disabled):
# nlp.Defaults.stop_words.add("my_new_stopword")

In [4]:
# TODO: definitely show this list to the professor and decide what to do with it.
# Prints the pipeline's default stop-word set for inspection.
print(nlp.Defaults.stop_words)

{'breve', 'mejor', 'consiguen', 'mas', 'primer', 'se', 'existe', 'igual', 'usas', 'aquello', 'estado', 'buen', 'hacen', 'va', 'fue', 'muy', 'todo', 'despacio', 'entonces', 'grandes', 'cosas', 'enfrente', 'pasada', 'aún', 'cierto', 'hace', 'míos', 'ciertos', 'señaló', 'trabajas', 'primeros', 'claro', 'cuanta', 'decir', 'explicó', 'habla', 'mí', 'usar', 'primero', 'solas', 'bien', 'mias', 'alguna', 'usamos', 'dejó', 'manera', 'excepto', 'nosotros', 'nuestro', 'vuestras', 'nuestras', 'él', 'soyos', 'algo', 'trata', 'ambos', 'temprano', 'éste', 'cuántas', 'día', 'tenemos', 'largo', 'una', 'algún', 'misma', 'aquél', 'un', 'era', 'dan', 'horas', 'han', 'cuenta', 'pueda', 'detras', 'seis', 'estuvo', 'tuyas', 'esas', 'partir', 'último', 'dieron', 'dar', 'estaba', 'ninguno', 'encuentra', 'solamente', 'vaya', 'bajo', 'mientras', 'sabe', 'detrás', 'aquéllos', 'también', 'mediante', 'menos', 'dias', 'todas', 'usa', 'sabes', 'total', 'estará', 'ayer', 'consigue', 'fueron', 'quiere', 'según', 'dia',

## Funciones Importantes

In [5]:
def Normalize(text):
    """
    Receive a string and return it lower-cased, without punctuation tokens
    and without stop words. The surviving tokens are joined by single spaces.
    """
    kept = []
    # Parse the text with the module-level spaCy pipeline and walk its tokens.
    for token in nlp(text):
        # Skip punctuation and stop words.
        if token.is_punct or token.is_stop:
            continue
        kept.append(token.orth_.lower())

    return " ".join(kept)

def Lematize(text):
    """
    Receive a string and return it lemmatized.

    The text is parsed with the module-level spaCy pipeline ``nlp`` and the
    lemma of every token is joined with single spaces.
    """
    doc = nlp(text)

    # Iterate over the parsed document; the previous version looped over an
    # undefined name (`tokens`) and raised NameError on every call.
    lemas = [t.lemma_ for t in doc]

    return " ".join(lemas)

def to_vector(texto,model):
    """
    Build the sentence vector of `texto`: the sum of the embeddings of its
    whitespace-separated words that exist in `model`, divided by the norm
    of that sum.

    When no word of the sentence is in `model` the sum stays all-zero and the
    division by its zero norm yields a vector of NaN.
    """
    accumulated = np.zeros(300)  # 300-dimensional accumulator
    # Collect the embeddings of the known words, in sentence order.
    known_embeddings = [model[palabra] for palabra in texto.split() if palabra in model]
    for embedding in known_embeddings:
        # in-place addition, same as `accumulated += embedding`
        np.add(accumulated, embedding, out=accumulated)
    length = np.linalg.norm(accumulated)
    return accumulated / length


# Function that returns the most frequent value of each cluster


In [32]:
Normalize("Hola a todos los últimos!!")

'hola a'

In [6]:
pd.options.mode.chained_assignment = None # kept so warnings stay silenced notebook-wide
# Normalize the concepts, descriptions and codifications.
# Each column is replaced in one assignment instead of writing cell by cell
# through chained indexing (data[col][i] = ...), which is slow and does not
# write back to `data` when pandas Copy-on-Write is active.
for column in ('Palabra (concepto)', 'Descripción', 'Codificación'):
    data[column] = data[column].map(Normalize)

## Generación de Matriz de los datos

In [7]:
# Creation of the matrix used by the clustering process
n_dims = len(model['hola'])  # number of components of the word embedding
descriptions_matrix = np.zeros((len(data), n_dims))  # one row per data point

# Matrix filling: one sentence vector per description
for i, description in enumerate(data['Descripción']):
    vector = to_vector(description, model)
    descriptions_matrix[i, :] = vector

# Concatenate the vectors with the original columns of each observation
data_matrix = np.concatenate([descriptions_matrix, data], axis=1)

#--------------------------------------------------------------------------------------
# This is the important matrix
# Drop the rows whose vector contains NaN (to_vector divides by the norm of the
# summed embeddings, which is 0 when no word of the description is in the model)
data_matrix_without_nan = data_matrix[~pd.isnull(data_matrix[:, :n_dims]).any(axis=1)]
#--------------------------------------------------------------------------------------

# Reduce the matrix to the columns that matter. The offsets are positions of
# the original `data` columns after the n_dims vector components.
selected_columns = list(range(n_dims)) + [
    n_dims + 2,  # Cue/Concept
    n_dims + 6,  # Codification
    n_dims + 3,  # Description of the cue
]
# Resulting layout: [0, n_dims) vector, n_dims concept, n_dims + 1 codification,
# n_dims + 2 description. The rest of the notebook indexes these as 300 and 301,
# i.e. it assumes 300-dimensional embeddings.
data_matrix_without_nan = data_matrix_without_nan[:, selected_columns]


## Model Creation

In [9]:
# Class creation
class PLT_processor:
    """
    K-means based labelling pipeline.

    Input matrices are expected in the layout used throughout the notebook:
    columns [0, 300) hold the sentence vector, column 300 the concept (cue)
    and column 301 the human codification.

    Typical use is ``PLT_processor(method).process(test, train)``, which
    trains K-means on `train`, ranks the clusters for every row of `test`,
    translates cluster ids into the majority human label of each cluster and
    returns the cumulative top-k accuracy curve.
    """

    VECTOR_DIMS = 300   # number of leading columns holding the sentence vector
    CONCEPT_COL = 300   # column with the concept / cue
    LABEL_COL = 301     # column with the human codification

    def __init__(self, method):
        self.method = method # Method of processing the data
        self.KMeans_dict = {}        # cluster id -> list of human labels in it
        self.KMeans_categories = {}  # cluster id -> most frequent human label
        # Filled by model_train only when method == 1.
        self.concept_cluster = {}          # concept -> sorted cluster ids seen
        self.inverse_concept_cluster = {}  # concept -> [True if cluster NOT seen]
        self.kemean = None  # fitted KMeans estimator (set by model_train)
        # Ranking of clusters per test row. It used to be pre-allocated from a
        # global `test`, which raised NameError unless that global existed.
        self.topk = None
        self.topKS = None   # same ranking expressed as labels (set by set_labels)

    def most_frequent(self, List):
        """
        Receive a list of words and return the most frequent one.

        Ties are resolved in favour of the word that appears first.
        Raises IndexError when the list is empty.
        """
        # Open question from the author: add the phrase 'PENDIENTE'?
        occurence_count = Counter(List)
        return occurence_count.most_common(1)[0][0]

    def random_labels(self, train, test):
        """
        Placeholder: receives two data sets and currently returns None.
        """
        return

    def model_train(self, train, n_clusters):
        """
        Fit K-means with `n_clusters` clusters on the vectors of `train` and
        assign to every cluster the most frequent human label among its
        members. Prints the training time and the training accuracy.

        When ``self.method == 1`` it also records, per concept, which clusters
        contain that concept (``self.concept_cluster``) and the complementary
        boolean mask over all clusters (``self.inverse_concept_cluster``).
        """
        start = time.time()
        self.kemean = sklearn.cluster.KMeans(n_clusters=n_clusters, random_state=0)
        self.kemean.fit(train[:, :self.VECTOR_DIMS].astype(float))
        end = time.time()
        print("K-means training process:",end - start)

        # One row per training point: human label, assigned cluster, concept.
        df = pd.DataFrame({
            'Human': train[:, self.LABEL_COL],
            'KMeans': self.kemean.labels_,
            'Concept': train[:, self.CONCEPT_COL],
        })

        self.KMeans_dict = df.groupby(by='KMeans')['Human'].apply(list).to_dict()
        # Rebuilt from scratch so a second training does not keep stale clusters.
        self.KMeans_categories = {
            cluster: self.most_frequent(labels)
            for cluster, labels in self.KMeans_dict.items()
        }
        df['KM_Prediction'] = df['KMeans'].map(self.KMeans_categories)
        print('Accuracy training: ',sum(df['Human'] == df['KM_Prediction'])/ len(df['Human']))

        if self.method == 1:
            start = time.time()
            all_clusters = np.arange(n_clusters)
            # Clusters in which each concept appears (sorted, unique).
            self.concept_cluster = {
                concept: np.unique(clusters.to_numpy())
                for concept, clusters in df.groupby('Concept')['KMeans']
            }
            # True for every cluster that does NOT contain the concept.
            self.inverse_concept_cluster = {
                concept: (~np.isin(all_clusters, clusters)).tolist()
                for concept, clusters in self.concept_cluster.items()
            }
            end = time.time()
            print("Time getting the clusters of each cue:",end - start)

    def get_distances(self, test):
        """
        Compute the Euclidean distance from every row of `test` to every
        cluster centre and return, per row, the cluster ids sorted from
        nearest to farthest (also stored in ``self.topk``).
        """
        start = time.time()
        centers = self.kemean.cluster_centers_
        # Convert once, outside the loop. The builtin float replaces np.float,
        # which was removed in NumPy 1.24.
        features = test[:, :self.VECTOR_DIMS].astype(float)

        # The number of columns follows the fitted model instead of assuming 500.
        distance_matrix = np.zeros((features.shape[0], centers.shape[0]))
        for i, center in enumerate(centers):
            # Euclidean distance
            distance_matrix[:, i] = np.sqrt(np.sum((features - center) ** 2, axis=1))

        # Sorting distances
        self.topk = np.argsort(distance_matrix, axis=1)
        end = time.time()
        print("Time calculating distances and sorting indexes:",end - start)

        return self.topk

    def set_labels(self):
        """
        Translate the cluster ids in ``self.topk`` into the label of each
        cluster and store the result in ``self.topKS`` (a DataFrame with the
        same shape). Clusters without a label map to None.

        Raises RuntimeError if get_distances() has not been called.
        """
        if self.topk is None:
            raise RuntimeError("get_distances() must be called before set_labels()")
        start = time.time()
        # Look labels up by cluster id rather than by dictionary order.
        labels_by_cluster = np.full(self.topk.shape[1], None, dtype=object)
        for cluster, label in self.KMeans_categories.items():
            labels_by_cluster[int(cluster)] = label
        # Change of the numeric value to the codification, all columns at once.
        self.topKS = pd.DataFrame(labels_by_cluster[self.topk])

        # Pending for Experiment 1: remove the clusters that do not share the
        # concept of the test row (see self.concept_cluster).

        end = time.time()
        print("Changing numerics values to the label:",end - start)

    def get_accuracies(self, test):
        """
        Return the cumulative top-k accuracy: element k is the fraction of
        rows of `test` whose human label appears among the labels of their
        k + 1 nearest clusters.
        """
        start = time.time()
        true_labels = np.asarray(test[:, self.LABEL_COL]).reshape(-1, 1)
        # hits[i, j] is True when the j-th nearest cluster carries the right label.
        hits = np.asarray(self.topKS.to_numpy() == true_labels, dtype=bool)
        # A row counts as solved from its first hit onwards.
        solved_by_rank = np.logical_or.accumulate(hits, axis=1)
        accuracies = solved_by_rank.sum(axis=0) / solved_by_rank.shape[0]
        end = time.time()
        print("Calculating accuracies:",end - start)

        return accuracies

    def process(self, test, train, n_clusters=500):
        """
        Run the whole pipeline (train, rank clusters, label, score) and return
        the accuracies computed by get_accuracies. `n_clusters` defaults to
        the 500 clusters used so far.
        """
        self.model_train(train, n_clusters)
        self.get_distances(test)
        self.set_labels()
        return self.get_accuracies(test)


In [15]:
# Run the whole pipeline with method 0 and keep the resulting accuracies.
# NOTE(review): `test` and `train` are not defined anywhere above this cell;
# they are assigned in the k-fold cells further down (the execution counts are
# out of order), so this cell depends on those having been run first.
proc = PLT_processor(0)
acc = proc.process(test, train)

K-means training process: 46.72390794754028
Accuracy training:  0.4980402403971779
Time calculating distances and sorting indexes: 5.512334823608398
Changing numerics values to the label: 7.82807731628418
Calculating accuracies: 1.7740018367767334


## Test-Train Generation

In [8]:
# Generation of test-train data

from sklearn.model_selection import KFold

# 5-fold splitter shared by the baseline cells below. No shuffle/random_state
# is passed, so scikit-learn's defaults apply.
# NOTE(review): if the rows are ordered (e.g. by concept), consider shuffling.
kfold=KFold(n_splits=5)

In [45]:
# Baseline: accuracy of guessing, for every test row, a codification drawn
# uniformly from all codifications present in the training fold.
rand_acc = []
for train_index, test_index in kfold.split(data_matrix_without_nan):
    # Materialise the two partitions of this fold.
    train = data_matrix_without_nan[train_index, :]
    test = data_matrix_without_nan[test_index, :]

    # Every distinct codification seen in training is a candidate guess.
    train_codes = np.unique(train[:, 301])

    # One independent draw per test row; count the exact matches.
    len_test = len(test)
    counter = 0
    for true_code in test[:, 301]:
        counter += true_code == np.random.choice(train_codes)
    rand_acc.append(counter / len_test)

np.mean(rand_acc)


0.0012543557053728747

In [42]:
# Baseline: accuracy of guessing, for every test row, a codification drawn
# uniformly from the codifications that its concept received in the training fold.
rand_concept_acc = []
for train_index, test_index in kfold.split(data_matrix_without_nan):

    train = data_matrix_without_nan[train_index,:]
    test = data_matrix_without_nan[test_index,:]

    # concept -> distinct codifications observed for it in training
    concept_cluster = {
                i:np.unique(train[:,301][train[:,300]==i]) for i in np.unique(train[:,300])
                }

    len_test = len(test)
    counter = 0

    for i in range(len_test):
        candidates = concept_cluster.get(test[i,300])
        if candidates is None:
            # The concept never occurs in this training fold, so there is nothing
            # to draw from. Count it as a miss instead of raising KeyError.
            continue
        counter += test[i,301] == np.random.choice(candidates)
    rand_concept_acc.append(counter/len_test)

np.mean(rand_concept_acc)


0.009824766201912357