# CPN generator

## Table of contents
1. [Libraries](#libraries)
2. [Important Variables](#important-variables)
3. [Language Setting](#language-setting)
4. [Important functions](#important-functions)
5. [Data Cleaning](#data-cleaning)
6. [Word Embedding](#word-embedding)
7. [Model](#model)
8. [Codification Suggested](#codification-suggested)

## [Libraries](#libraries)

In [1]:
import pandas as pd
import numpy as np
import spacy
import gensim
from collections import Counter 
import sklearn.cluster
from sklearn.model_selection import KFold
import random
from scipy.spatial import distance
import time

# Seed the global random number generators so the notebook is reproducible.
# `random.seed` only affects Python's own generator; NumPy's global generator
# (used by scikit-learn whenever no `random_state` is given) is seeded separately.
# The k-means model itself is seeded through its `random_state` argument.
random.seed(0)
np.random.seed(0)

## [Important Variables](#important-variables)

In [36]:
# User configuration: edit these values before running the notebook.
pathTrainData = r'../your-route/train_data.csv' # Path of the CSV with the training data (already codified)
pathData = r'../your-route/properties-without-code.csv' # Path of the CSV with the data to codify
numberCluster = 500 # Number of clusters for the k-means model
numberCodes = 3 # Number of codes to suggest for each description
outputFile = r'../your-route/suggest_codification.csv' # Path of the CSV where the AC-PLT suggestions are written

## [Language Setting](#language-setting)

In [6]:
# NOTE: the CSV files must use ',' as delimiter.

# Data import
# Relative Path of the dataset, change for your dataset
data_train = pd.read_csv(pathTrainData, delimiter=",")

## Import of the data to codify
# NOTE(review): `data_to_code` is used by later cells; uncomment the next line
# when running on your own data (the example cell defines it otherwise).
# data_to_code = pd.read_csv(pathData, delimiter=",")


# Import of the model of the spanish billion words embeddings
model = gensim.models.KeyedVectors.load_word2vec_format( 
    r'../data/SBW-vectors-300-min5.bin.gz', # Relative path of the vector
    binary=True # The model is in binary format
)

# Import of the model of the word2vec-google-news-300 for English dataset
# model = gensim.models.KeyedVectors.load_word2vec_format( 
#     r'../../data/word2vec-google-news-300.model', # Relative path of the vector
#     binary=False # Set according to the format of the file
# )

In [53]:
# Example run on the CPN120 dataset. Skip this cell when you use your own data.

from sklearn.model_selection import train_test_split

# Paths of the example. They are assigned BEFORE the data is read so that a
# fresh "Restart & Run All" loads the CPN120 file instead of the placeholder
# path set in the configuration cell.
pathTrainData = r'../data/CPN120/CPN120.csv'
outputFile = r'../data/CPN120/suggest_CPN120.csv'

# Data import
data = pd.read_csv(pathTrainData, delimiter=",")

# Import of the model of the spanish billion words embeddings
model = gensim.models.KeyedVectors.load_word2vec_format(
    r'../data/SBW-vectors-300-min5.bin.gz', # Relative path of the vector
    binary=True # The model is in binary format
)

# Hold out 20% of the rows as "data to codify".
# A fixed random_state makes the split identical on every run.
data_train, data_to_code = train_test_split(data, test_size=0.2, random_state=0)

In [7]:
# The three important columns are Concept, Feature and Code.
# Code is needed for the training of the model.
data.head(5)

Unnamed: 0,Concepto,Respuesta,Codigo
0,granito,tierra,terrestre
1,granito,rugoso,texturas
2,granito,construcción,material_construccion
3,granito,desagradable al tacto,texturas
4,granito,raspa,lastimar


Installation of the spaCy language pipeline. Download the one that matches the language of your data.

More information is available on the spaCy website.

In [8]:
# Download the pipeline that matches your language

## English
# !python -m spacy download en_core_web_sm

## Spanish
# !python -m spacy download es_core_news_sm


Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 245.8 kB/s eta 0:00:52
     --------------------------------------- 0.0/12.8 MB 245.8 kB/s eta 0:00:52
     --------------------------------------- 0.0/12.8 MB 245.8 kB/s eta 0:00:52
     --------------------------------------- 0.1/12.8 MB 385.0 kB/s eta 0:00:33
     --------------------------------------- 0.1/12.8 MB 385.0 kB/s eta 0:00:33
     --------------------------------------- 0.1/12.8 MB 312.9 kB/s eta 0:00:41
      -------------------------------------- 0.2/12.8 MB 452.9 kB/s eta 0:00:28
      -------------------------------------- 0.2

In [8]:
# Configuration of spacy for Spanish
nlp = spacy.load('es_core_news_sm')


# Configuration of spacy for English
# nlp = spacy.load('en_core_web_sm')

## [Important functions](#important-functions)

In [9]:
def clean_text(text):
    """
    Lowercase the given string, run it through the spaCy pipeline `nlp`
    and return the list of resulting tokens that are not punctuation
    """
    document = nlp(text.lower())

    kept_tokens = []
    for token in document:
        # punctuation tokens are discarded, everything else is kept in order
        if not token.is_punct:
            kept_tokens.append(token)
    return kept_tokens


def normalize(tokens):
    """
    Take a sequence of tokens and return one string made of the original
    text of every token that is not a stop word, separated by spaces
    """
    kept_words = []
    for token in tokens:
        # stop words are skipped
        if token.is_stop:
            continue
        kept_words.append(token.orth_)

    # single space-separated string
    return " ".join(kept_words)

# Optional alternative to `normalize`: call this one instead if you want lemmas
def lematize(tokens):
    """
    Take a sequence of tokens and return one string made of the lemma of
    every token that is not a stop word, separated by spaces
    """
    kept_lemmas = []
    for token in tokens:
        # stop words are skipped
        if token.is_stop:
            continue
        kept_lemmas.append(token.lemma_)

    # single space-separated string
    return " ".join(kept_lemmas)


def to_vector(texto, model):
    """ 
    Receives a sentence string along with a word embedding model and 
    returns the vector representation of the sentence.

    The sentence is split on whitespace, the embeddings of the words found
    in `model` are summed and the sum is scaled to unit length. Words that
    are not in the model are ignored; if none is found the zero vector is
    returned.

    texto: sentence to embed (already cleaned)
    model: mapping from word to vector supporting `word in model` and
           `model[word]`. Its `vector_size` attribute gives the embedding
           dimension; when the attribute is absent 300 is assumed.
    """
    # Embedding size comes from the model when available, so that models
    # with a dimension other than 300 also work
    n_dims = getattr(model, "vector_size", 300)

    vec = np.zeros(n_dims) # accumulator for the sum of word embeddings
    for word in texto.split(): # iterates over the words of the sentence
        if word in model: # only words known by the embedding contribute
            vec += model[word]

    norm = np.linalg.norm(vec) # computed once and reused
    # the zero vector is returned as is to avoid dividing by zero
    return vec / norm if norm > 0 else vec

## [Data Cleaning](#data-cleaning)

In [56]:
start = time.time()
# Clean and normalize the feature column (position 1) of both datasets,
# one row at a time
for frame in (data_train, data_to_code): # Change for the name of your dataset
    for row in range(len(frame)):
        raw_text = frame.iloc[row, 1]
        frame.iloc[row, 1] = normalize(clean_text(raw_text))
end = time.time()
print("Exceution time:", end-start)

Exceution time: 102.67801928520203


## [Word Embedding](#word-embedding)

In [57]:
# Timer
start = time.time()

# Number of components of the word embedding, read from the model itself so
# that no probe word (such as 'hola') has to exist in the vocabulary
n_dims = model.vector_size

# Creation of the matrix to make the clustering process:
# one row per training data point, one column per embedding component
descriptions_matrix = np.zeros((len(data_train), n_dims))

# Matrix filling 
# Change to the name of the descriptions of your dataset.
for i, description in enumerate(data_train.iloc[:, 1]):
    descriptions_matrix[i, :] = to_vector(description, model)

# Concatenate the matrix with the data of each observation
data_matrix = np.concatenate([descriptions_matrix, data_train], axis=1)

# Remove the rows whose embedding contains 'NaN' values
data_matrix = data_matrix[~pd.isnull(data_matrix[:, :n_dims]).any(axis=1)]

end = time.time()
print("Exceution time:", end-start)

Exceution time: 1.8939287662506104


In [58]:
# Timer
start = time.time()

# Number of components of the word embedding, read from the model itself so
# that no probe word (such as 'hola') has to exist in the vocabulary
n_dims = model.vector_size

# Creation of the matrix with the embeddings of the data to codify:
# one row per data point, one column per embedding component
descriptions_matrix = np.zeros((len(data_to_code), n_dims))

# Matrix filling 
# Change to the name of the descriptions of your dataset.
for i, description in enumerate(data_to_code.iloc[:, 1]):
    descriptions_matrix[i, :] = to_vector(description, model)

# Concatenate the matrix with the data of each observation
matrix_to_code = np.concatenate([descriptions_matrix, data_to_code], axis=1)

# Remove the rows whose embedding contains 'NaN' values
matrix_to_code = matrix_to_code[~pd.isnull(matrix_to_code[:, :n_dims]).any(axis=1)]

end = time.time()
print("Exceution time:", end-start)

Exceution time: 0.5245153903961182


## [Model](#model)

We create a class for the text classification, where you first have to set the number of clusters you want to use for your model.

In [59]:
class AC_PLT:
    """
    Text classifier that clusters sentence embeddings with k-means and gives
    every cluster the most frequent human code among its training members.

    The matrices received by `fit`, `transform` and `suggestions` hold the
    embedding in the first `EMBEDDING_DIM` columns, followed by the concept,
    the description and (for `fit` and `transform`) the human code.
    """

    # Number of leading columns that hold the sentence embedding
    EMBEDDING_DIM = 300

    def __init__(self, n_clusters = 500, random_state=0):
        """
        n_clusters: number of cluster in the k-Means model
        random_state: seed handed to the k-Means model
        """
        
        self.n_clusters = n_clusters # number of clusters
        self.KMeans_dict = {} # dictionary of all the human codifications for each cluster
        self.KMeans_categories = {} # dictionary with the most frequent code of each cluster
        self.km = sklearn.cluster.KMeans(           # creates the k-means object
            n_clusters=self.n_clusters, 
            random_state=random_state,
            n_init='auto'
        ) 
        
        
    def most_frequent(self, List): 
        """
        Receives a list of codes and returns the most frequent one. On a tie
        the code that appears first in the list wins.
        """
        # counter of occurrences of each code in the list
        occurence_count = Counter(List) 
        
        # Return the first code with the highest count
        return occurence_count.most_common(1)[0][0] 


    def fit(self, train):
        """
        Receives the train matrix, trains the k-means model on its embedding
        columns and stores, for each cluster, the list of human codes of its
        members (`KMeans_dict`) and the most frequent one (`KMeans_categories`).
        """
        dim = self.EMBEDDING_DIM

        # Train the k-means algorithm on the embedding columns
        self.km.fit(train[:, :dim].astype(float))

        # Human codification next to the cluster assigned to each row
        df = pd.DataFrame({
            'Human': train[:, dim + 2],
            'KMeans': self.km.labels_,
        })

        # dictionary of all the human codifications for each cluster
        self.KMeans_dict = df.groupby(by='KMeans')['Human'].apply(list).to_dict()

        # Rebuilt from scratch so that a second call to fit leaves no stale entries
        self.KMeans_categories = {
            cluster: self.most_frequent(codes)
            for cluster, codes in self.KMeans_dict.items()
        }


    def get_distances(self, test):
        """
        Receives the test matrix and stores in `self.topk`, for every row,
        the cluster indexes sorted from the nearest to the farthest centroid
        """
        
        # Distance matrix of each test point to each cluster center
        distance_matrix = distance.cdist(
            test[:, :self.EMBEDDING_DIM].astype(float),
            self.km.cluster_centers_,
            'euclidean'
        )
        
        # Cluster indexes sorted by increasing distance
        self.topk = np.argsort(distance_matrix, axis=1)
        
    
    def set_labels(self):
        """
        Stores in `self.topKS` a DataFrame with the same shape as `self.topk`
        where every cluster index is replaced by the code of that cluster
        """
        n_centers = self.topk.shape[1]

        # Code of each cluster addressed by its index. A cluster without
        # training members gets None instead of shifting the other codes.
        cluster_codes = np.empty(n_centers, dtype=object)
        for cluster in range(n_centers):
            cluster_codes[cluster] = self.KMeans_categories.get(cluster)

        # The whole table is built in one step: writing the codes column by
        # column into an integer DataFrame would mix incompatible dtypes
        self.topKS = pd.DataFrame(cluster_codes[self.topk])


    def get_accuracies(self, test):
        """
        Receives the test matrix and returns an array whose position k is the
        fraction of rows whose human code is among the codes of their k+1
        nearest clusters
        """
        n_rows = self.topKS.shape[0]

        predicted = self.topKS.to_numpy(dtype=object)
        truth = np.asarray(test[:n_rows, self.EMBEDDING_DIM + 2], dtype=object).reshape(-1, 1)

        # True where the code of the ranked cluster equals the human code
        matches = np.asarray(predicted == truth, dtype=bool)

        # Once a row is matched at some rank it counts as a hit for every later rank
        hits = np.logical_or.accumulate(matches, axis=1)

        accuracies = hits.sum(axis=0) / n_rows

        return accuracies

    
    def transform(self, test):
        """
        Receives the test matrix and returns the accuracies of the model
        (see `get_accuracies`)
        """
        self.get_distances(test)
        self.set_labels()
        return self.get_accuracies(test)
    
    def suggestions(self, test, n_codes):
        """
        Receives the matrix to codify and returns a DataFrame with the
        concept, the description and the codes of the `n_codes` nearest
        clusters of every row, nearest first.
        """
        self.get_distances(test)
        self.set_labels()

        dim = self.EMBEDDING_DIM
        top_codes = self.topKS.iloc[:, :n_codes]

        # Column names follow the number of suggestions actually returned
        # (`n_codes`, or fewer when there are not that many clusters)
        columns = ['Concept', 'Description'] + [
            'top-{} suggestion'.format(i+1) for i in range(top_codes.shape[1])
        ]

        return pd.DataFrame(
            np.concatenate([
                np.reshape(test[:, dim], (-1, 1)), 
                np.reshape(test[:, dim + 1], (-1, 1)), 
                top_codes.to_numpy()],
                axis=1
                ), 
            columns=columns
            )
        
    

## [Codification Suggested](#codification-suggested)

In [62]:
# Instance of the model, with the number of clusters set in the configuration cell
proc = AC_PLT(n_clusters=numberCluster)

# Train process (timed): k-means on the training matrix
start = time.time()
proc.fit(data_matrix)
end = time.time()
print("Train process time:", end-start)

# Creation of the DataFrame with the suggested codes (timed):
# one row per description to codify, with its `numberCodes` suggestions
start = time.time()
df = proc.suggestions(matrix_to_code, n_codes=numberCodes)
end = time.time()
print("Clasification process time:", end-start)

# Saving the dataframe into a csv file, without the index column
df.to_csv(outputFile, index=False)

Train process time: 5.755473375320435
Clasification process time: 3.127873420715332
