# CPN generator

In [18]:
import pandas as pd
import numpy as np
import spacy
import gensim
from collections import Counter 
import time
import sklearn

In [2]:
# data import
data = pd.read_excel(r'../data/Datos Codificados Estudio v2a.xlsx')
model = gensim.models.KeyedVectors.load_word2vec_format( 
    r'../Data/SBW-vectors-300-min5.bin', # using the spanish billion words embeddings
    binary=True # the model is in binary format
)

In [3]:
# Configuracion de spacy
nlp = spacy.load('es_core_news_sm')
# nlp.Defaults.stop_words.add("my_new_stopword")

In [7]:
# Definitivamente mostrar al profe y ver que se hace con esto
print(nlp.Defaults.stop_words)

{'cosas', 'pronto', 'sean', 'aun', 'empleas', 'tiempo', 'llegó', 'trabajar', 'queremos', 'verdad', 'habrá', 'aquél', 'pero', 'cuál', 'más', 'primeros', 'trabajo', 'veces', 'trabajais', 'sola', 'hablan', 'mio', 'dan', 'excepto', 'ella', 'menos', 'podrias', 'durante', 'manera', 'cuanta', 'sigue', 'mías', 'dice', 'debido', 'informó', 'señaló', 'ultimo', 'nuestros', 'sería', 'intentar', 'deben', 'se', 'entre', 'considera', 'esa', 'estoy', 'mia', 'mía', 'cuantos', 'esta', 'haciendo', 'vaya', 'ninguno', 'tercera', 'cuatro', 'además', 'soyos', 'ambos', 'tambien', 'ellas', 'llevar', 'propias', 'despacio', 'bastante', 'podriais', 'muy', 'quien', 'empleo', 'pueda', 'conseguimos', 'podriamos', 'podrán', 'le', 'saben', 'sobre', 'poner', 'aún', 'usan', 'parte', 'tendrán', 'tengo', 'sera', 'da', 'eras', 'van', 'nueva', 'ademas', 'expresó', 'míos', 'seis', 'hicieron', 'sabe', 'eramos', 'hubo', 'lugar', 'propio', 'usar', 'nuestra', 'dar', 'que', 'apenas', 'repente', 'vuestros', 'encuentra', 'fuera', '

## Funciones Importantes

In [4]:
def Normalize(text):
    """
    Recive a string and return the same string without punctions and 
    stop words
    """
     # Create a object spacy type nlp
    doc = nlp(text)

    # Remove all special caracters 
    words = [t.orth_ for t in doc if not (t.is_punct or t.is_stop)]
    tokens = [t.lower() for t in words]

    return(" ".join(tokens))

def Lematize(text):
    """
    Recive a string and return the string Lematized
    """
    doc = nlp(text)

    # Remove all special caracters 
    lemas = [t.lemma_ for t in tokens]

    return(" ".join(lemas))

def to_vector(texto,model):
    """ 
    Receives a sentence string along with a word embedding model and 
    returns the vector representation of the sentence
    """
    tokens = texto.split() # splits the text by space and returns a list of words
    vec = np.zeros(300) # creates an empty vector of 300 dimensions
    for word in tokens: # iterates over the sentence
        if word in model: # checks if the word is both in the word embedding
            vec += model[word] # adds every word embedding to the vector
    return vec / np.linalg.norm(vec) # divides the vector by their normal


# Function that count the most frecuency value of each cluster  


In [32]:
Normalize("Hola a todos los últimos!!")

'hola a'

In [5]:
pd.options.mode.chained_assignment = None # Para evitar que se muestre warnings
# Normalizacion de los conceptos, descripciones y codificaciones
for i in range(len(data)):
    data['Palabra (concepto)'][i] = Normalize(data['Palabra (concepto)'][i])
    data['Descripción'][i] = Normalize(data['Descripción'][i]) 
    data['Codificación'][i] = Normalize(data['Codificación'][i])

## Generación de Matriz de los datos

In [8]:
# Creation of the matrix to make the clustering process
descriptions_matrix = np.zeros( # creatign an empty matrix
    (
        len(data), # the number of data points
        len(model['hola']) # the number of components of the word embedding
    )
)
# matrix filling 
for i,description in enumerate(data['Descripción']):
    vector = to_vector(description,model)
    descriptions_matrix[i,] = vector

# Concatenate the matrix with the data of each observation
data_matrix = np.concatenate([descriptions_matrix,data], axis=1)

#--------------------------------------------------------------------------------------
# This is the important matrix
# Remove of the 'Nan' data
data_matrix_without_nan = data_matrix[~pd.isnull(data_matrix[:,:300]).any(axis=1)]
#--------------------------------------------------------------------------------------

# reduce the matrix to the importan
data_matrix_without_nan = np.concatenate(
    (data_matrix_without_nan[:,:300],                       # Vector of the description
    np.reshape(data_matrix_without_nan[:,302], (-1, 1)),    # Cue/Concept
    np.reshape(data_matrix_without_nan[:,306], (-1, 1)),    # Codification
    np.reshape(data_matrix_without_nan[:,303], (-1, 1))     # Description of the cue
    ),
    axis = 1
)

  return vec / np.linalg.norm(vec) # divides the vector by their normal


## Test-Train Generation

In [9]:
# Generation of test-train data
#? Es necesario hacer solo el split o un cross-validation?
from sklearn.model_selection import train_test_split
# from sklearn.model_selection import KFold

train, test = train_test_split(data_matrix_without_nan, test_size = 0.2)

## Model Creation

In [None]:
#----------------------------------------------------------------------------------------
# Training the model over the dataset
start = time.time()
kemean = sklearn.cluster.KMeans(n_clusters = 500,random_state=0)
kemean.fit(train[:,:300])
end = time.time()
print("K-means training process:",end - start)
#----------------------------------------------------------------------------------------

#----------------------------------------------------------------------------------------
# Creating a dataframe with real classification => Human
# The assigned cluster => Kmeans
# The original concept => Concepts
start = time.time()
df = pd.DataFrame(
    np.concatenate([
        np.reshape(train[:,301], (-1, 1)), 
        np.reshape(kemean.labels_, (-1, 1)),
        np.reshape(train[:,300], (-1, 1))
        ], axis=1), 
    columns=['Human', 'KMeans', 'Concept'])

# Gathering all classes corrsponding to cluster
KMeans_dict = df.groupby(by='KMeans')['Human'].apply(list).to_dict()
KMeans_categories = {}  

# Creación del data frame usando los datos de entrenamiento
# El data frame tiene predicción humana, cluster asociado, concepto, y predicción k-means
for key, val in KMeans_dict.items():
    KMeans_categories[key] = most_frequent(val)
df['KM_Prediction'] = df['KMeans'].map(KMeans_categories)
print('Accuracy training: ',sum(df['Human'] == df['KM_Prediction'])/ len(df['Human']))
end = time.time()
print("Data frames and others:",end - start)
#----------------------------------------------------------------------------------------

# Solo para metodo 1
#----------------------------------------------------------------------------------------
# Proceso de obtencion de los conceptos
start = time.time()
# dictionary of clusters for each concept
inverse_concept_cluster = {
    i:[False if j in np.unique(df['KMeans'][df['Concept']==i]) else True for j in range(500)] 
    for i in np.unique(df['Concept'])
    }
concept_cluster = {
    i:np.unique(df['KMeans'][df['Concept']==i]) for i in np.unique(df['Concept'])
    }
end = time.time()
print("Time getting the clusters of each cue:",end - start)
#----------------------------------------------------------------------------------------


#----------------------------------------------------------------------------------------
start = time.time()
#Calculating the distance maatrix of each data test point to each cluster
distance_matrix = np.zeros((test.shape[0],500))
index=0
for i in range(distance_matrix.shape[1]):    
    # Euclidian distance
    distance_matrix[:,i]=np.sqrt(np.sum(
        (test[:,:300].astype(np.float)-kemean.cluster_centers_[i,:])**2,axis=1
        ))

# Sorting distances
topk=np.argsort(distance_matrix,axis=1)
end = time.time()
print("Time calculating distances and sorting indexes:",end - start)
#----------------------------------------------------------------------------------------

#----------------------------------------------------------------------------------------
start = time.time()
# Change of the numeric value to the codification 
topKS=pd.DataFrame(topk)
tempData=np.array(list(KMeans_categories.values()))
for j in range(topKS.shape[1]):
    topKS.iloc[:,j]=tempData[topk[:,j]]

    # Agregar para Experimento 1
    # # Remove of the clusters that not share the same concept
    # topKS.iloc[i,~np.isin(topk[i,:],data_test[0].map(concept_cluster).iloc[i])] = np.nan

end = time.time()
print("Changing numerics values to the label:",end - start)
#----------------------------------------------------------------------------------------

#----------------------------------------------------------------------------------------
start = time.time()    
#Creating the accuracy table to check each data point
testLabel=np.zeros(topKS.shape)
indexes_method0=pd.DataFrame(np.zeros((topKS.shape[0],2)), columns=['index', 'value']) 

#For each data point
for i in range(testLabel.shape[0]):
    #Checking if some of the cluster is able to classify it right
    boolClass=topKS.iloc[i,:]==test[i,301]
    if sum(boolClass)>0:
        getIndex=boolClass.idxmax()
        indexes_method0.iloc[i,0] = getIndex
        indexes_method0.iloc[i,1] = topKS.iloc[i,getIndex]
        #Setting the rest of the data point as 1
        testLabel[i,getIndex:]=1
    else:
        indexes_method0.iloc[i,0] = np.nan
        indexes_method0.iloc[i,1] = np.nan
accuracies_m0=testLabel.sum(axis=0)/testLabel.shape[0]
end = time.time()
print("Calculating accuracies:",end - start)
#----------------------------------------------------------------------------------------

ggplot()+aes(x=range(1,501),y=accuracies_m0)+geom_line()

In [29]:
# Class creation
class PLT_processor:

    def __init__(self, method):
        self.method = method # Method of Processing the data

    def most_frequent(self, List): 
        """
        Recives a list of words, and return the word most frequente of
        the list
        """
        occurence_count = Counter(List) 
        return occurence_count.most_common(1)[0][0] 

    def modelTrain(self, train, n_clusters):
        start = time.time()
        self.kemean = sklearn.cluster.KMeans(n_clusters=n_clusters, random_state=0)
        self.kemean.fit(train[:,:300])
        end = time.time()
        print("K-means training process:",end - start)

        # Dataframe for 
        df = pd.DataFrame(
            np.concatenate([
                np.reshape(train[:,301], (-1, 1)), 
                np.reshape(self.kemean.labels_, (-1, 1)),
                np.reshape(train[:,300], (-1, 1))
                ], axis=1), 
            columns=['Human', 'KMeans', 'Concept'])
            
        KMeans_dict = df.groupby(by='KMeans')['Human'].apply(list).to_dict()
        KMeans_categories = {}  
        for key, val in KMeans_dict.items():
            KMeans_categories[key] = self.most_frequent(val)
        df['KM_Prediction'] = df['KMeans'].map(KMeans_categories)
        print('Accuracy training: ',sum(df['Human'] == df['KM_Prediction'])/ len(df['Human']))

        if self.method == 1:
            start = time.time()
            # dictionary of clusters for each concept
            inverse_concept_cluster = {
                i:[False if j in np.unique(df['KMeans'][df['Concept']==i]) else True for j in range(500)] 
                for i in np.unique(df['Concept'])
                }
            concept_cluster = {
                i:np.unique(df['KMeans'][df['Concept']==i]) for i in np.unique(df['Concept'])
                }
            end = time.time()
            print("Time getting the clusters of each cue:",end - start)

        return KMeans_categories, KMeans_dict
    
    def get_distances(self, test):
        ...


In [30]:
proc = PLT_processor(0)
cat, dictionary = proc.modelTrain(train, 500)
dictionary

K-means training process: 26.542444467544556
Accuracy training:  0.5071857852103475


{0: ['correr', 'oscuridad', 'advertencia', 'despedida', 'marginal'],
 1: ['cargante', 'interno', 'sutil', 'ocultar'],
 2: ['principio', 'entrenamiento', 'mejorar'],
 3: ['entendimiento',
  'ayuda',
  '',
  'a',
  'gratitud',
  'discapacidad',
  'manipulación',
  'placer',
  'aspiraciones',
  'características',
  'incerteza',
  'o partes',
  'herramienta',
  'ayuda',
  'ayuda',
  '',
  'agobiar',
  'ayuda',
  'personal',
  '',
  'ayuda',
  'agradable',
  'logro',
  'capacidad',
  '',
  'gratitud',
  'riesgo'],
 4: ['dolor',
  'sensibilidad',
  'subconciente',
  'compasión',
  'sufrimiento',
  'alivio',
  'dolor',
  'empatía',
  'sudor'],
 5: ['diario',
  'anticipación',
  'proyecto',
  'idea',
  'importante',
  'idea',
  'apropiado',
  'culmine'],
 6: ['idea',
  'consentimiento',
  'idea',
  'preferencia',
  'idea',
  'idea',
  'propuesta'],
 7: ['constricción',
  'nerviosismo',
  'nerviosismo',
  'nerviosismo',
  'nerviosismo',
  'nerviosismo'],
 8: ['cambio corporal',
  'corazón',
  '

In [31]:
cat

{0: 'correr',
 1: 'cargante',
 2: 'principio',
 3: 'ayuda',
 4: 'dolor',
 5: 'idea',
 6: 'idea',
 7: 'nerviosismo',
 8: '',
 9: 'virtud',
 10: 'futuro',
 11: 'rápidez',
 12: 'riesgo',
 13: 'gratitud',
 14: 'pensamiento',
 15: 'alegría',
 16: 'empatía',
 17: 'aceptación',
 18: 'crítica',
 19: 'desafío',
 20: 'cualidad',
 21: 'deseo',
 22: 'angustia',
 23: 'cerebro',
 24: 'búsqueda',
 25: 'negativo',
 26: 'concejo',
 27: 'trato',
 28: 'justificación',
 29: 'tratado',
 30: 'común',
 31: 'advertencia',
 32: 'cuidado',
 33: 'posibilidad',
 34: 'meta',
 35: '',
 36: 'logro',
 37: 'deber',
 38: 'lógica',
 39: 'tranquilidad',
 40: 'estrategia',
 41: 'destreza',
 42: 'explicación',
 43: '',
 44: 'facilidad',
 45: 'personal grupal',
 46: 'solución',
 47: 'mentir',
 48: 'transparencia',
 49: 'estrés',
 50: 'idea',
 51: 'protección',
 52: 'objetivos',
 53: 'razonamiento',
 54: 'estructura',
 55: 'justificación',
 56: 'mente',
 57: 'confianza',
 58: 'tranquilidad',
 59: 'trastorno',
 60: 'impacienc