In [1]:
import pandas as pd
import numpy as np
import spacy
import random
# import es_core_news_sm
from hunspell import Hunspell
import treetaggerwrapper
import regex
from sklearn.model_selection import KFold


# Seed both random number generators used in Python code here. The evaluation
# cells draw with np.random.choice, which is NOT affected by random.seed, so
# NumPy's global generator has to be seeded explicitly for reproducible runs.
random.seed(0)
np.random.seed(0)

  punct2find_re = re.compile("([^ ])([[" + ALONEMARKS + "])",
  DnsHostMatch_re = re.compile("(" + DnsHost_expression + ")",
  UrlMatch_re = re.compile(UrlMatch_expression, re.VERBOSE | re.IGNORECASE)
  EmailMatch_re = re.compile(EmailMatch_expression, re.VERBOSE | re.IGNORECASE)


In [2]:
# Load the tab-separated property/code table into a DataFrame.
data = pd.read_csv(
    r'../../data/nueva_data/propiedades_codigo.csv',
    sep="\t",
)


In [3]:
# Reorder the raw columns by position:
#   0 -> cue / concept, 2 -> codification, 1 -> description of the cue.
data_matrix = pd.concat(
    [data.iloc[:, position] for position in (0, 2, 1)],
    axis=1,
)

In [4]:
# Display the reordered frame (cue/concept, codification, description).
data_matrix

Unnamed: 0,Concepto,Codigo,Respuesta
0,granito,terrestre,tierra
1,granito,texturas,rugoso
2,granito,material_construccion,construcción
3,granito,texturas,desagradable al tacto
4,granito,lastimar,raspa
...,...,...,...
31859,aptitud,inteligencia,inteligencia
31860,aptitud,necesario,condicion necesaria
31861,aptitud,personalidad,caracter
31862,aptitud,personalidad,personalidad


In [7]:
class BuchananProcessor:
    """
    Text preprocessing pipeline for the response column of a DataFrame.

    The response text is read from the third column (position 2); after
    lemmatization the processed values are stored in the 'Respuesta' column.
    `process()` runs the steps in order: spell checking, lemmatization,
    stop-word removal and multi-word sequence extraction.
    """

    def __init__(self, data):
        """
        Store a deep copy of `data` (the caller's frame is never modified) and
        load the spaCy model, the Hunspell dictionary and the TreeTagger.
        """
        self.data = data.copy(deep=True)
        self.nlp = spacy.load('es_core_news_sm')
        self.h = Hunspell('es_CL', hunspell_data_dir='../../dict')
        self.tagger = treetaggerwrapper.TreeTagger(TAGLANG='es')

    def _correctWord(self, token):
        """
        Return `token` unchanged when Hunspell accepts it; otherwise return the
        first suggestion, or `token` itself when there are no suggestions.
        """
        if self.h.spell(token):
            return token
        # Ask for suggestions only once per word: the lookup is repeated for
        # every misspelled token in the corpus.
        suggestions = self.h.suggest(token)
        return suggestions[0] if len(suggestions) > 0 else token

    def _sentenceCheck(self, text):
        """
        Receives a string and returns it lower-cased, without punctuation
        tokens or single-space tokens, and with each misspelled word replaced
        by its first Hunspell suggestion. Words are joined by single spaces.
        """
        # Drop punctuation and whitespace tokens, lower-case the rest.
        tokens = [
            t.orth_.lower()
            for t in self.nlp(text)
            if not (t.is_punct or t.orth_ == " ")
        ]

        # Re-join the (possibly corrected) words in one string.
        return " ".join(self._correctWord(token) for token in tokens)

    def _lemmatize(self, text):
        """
        Receives a string and returns a list of tags, one per word; each tag
        holds the word, its POS tag and its lemma (in that order).
        """
        return list(
            treetaggerwrapper.make_tags(self.tagger.tag_text(text), exclude_nottags=True)
        )

    def spellCheck(self):
        """
        Spell-check every value of the third column (position 2) in place.
        """
        checked = [self._sentenceCheck(text) for text in self.data.iloc[:, 2]]
        # Single positional column write instead of one .iloc write per row.
        self.data.iloc[:, 2] = checked

    def lemmatization(self):
        """
        Replace the 'Respuesta' column with, for each row, the list of tags
        produced by `_lemmatize` for the text in the third column.
        """
        self.data['Respuesta'] = [self._lemmatize(text) for text in self.data.iloc[:, 2]]

    def stopWordRemoval(self):
        """
        Remove from every tag list in 'Respuesta' the tags whose lemma is a
        stop word (judged on the first spaCy token of the lemma).
        """
        # Memoize per distinct lemma: the same lemma occurs in many rows and
        # running the spaCy pipeline is the expensive part of this step.
        stop_cache = {}

        def is_stop(lemma):
            if lemma not in stop_cache:
                stop_cache[lemma] = self.nlp(lemma)[0].is_stop
            return stop_cache[lemma]

        # Assign the whole column at once. The previous chained form
        # (self.data['Respuesta'][i] = ...) is a chained assignment: it is
        # label-based (fails on a non 0..n-1 index) and does not update the
        # frame when pandas copy-on-write is active.
        self.data['Respuesta'] = [
            [tag for tag in tags if not is_stop(tag[2])]
            for tags in self.data['Respuesta']
        ]

    def multiWordSequence(self):
        """
        Replace every tag list in 'Respuesta' with a list of strings: first the
        lemma of each tag, then the space-joined lemmas of every adjacent tag
        sequence whose POS tags match one of these patterns:
         * adverb-adjective
         * verb-noun
         * verb-adjective-noun
        Two-word sequences are appended before three-word sequences.
        """
        patterns = {
            "adv-adj": (r'ADV', r'ADJ'),
            "verb-noun": (r'V\w+', r'^(N(P|C))'),
            "verb-adj-noun": (r'V\w+', r'ADJ', r'^(N(P|C))')
        }

        def matches(tags, start, pattern):
            # True when the POS tags starting at `start` match `pattern`
            # element by element (regex.match anchors at the tag's beginning).
            return all(
                regex.match(expr, tags[start + offset][1])
                for offset, expr in enumerate(pattern)
            )

        def expand(tags):
            units = [tag[2] for tag in tags]

            # Two-word sequences; the range is empty for fewer than two tags.
            for i in range(len(tags) - 1):
                for name in ("adv-adj", "verb-noun"):
                    if matches(tags, i, patterns[name]):
                        units.append(" ".join((tags[i][2], tags[i + 1][2])))

            # Three-word sequences; the range is empty for fewer than three tags.
            for i in range(len(tags) - 2):
                if matches(tags, i, patterns["verb-adj-noun"]):
                    units.append(" ".join((tags[i][2], tags[i + 1][2], tags[i + 2][2])))

            return units

        # Whole-column assignment instead of chained per-row assignment (see
        # stopWordRemoval for the reason).
        self.data['Respuesta'] = [expand(tags) for tags in self.data['Respuesta']]

    def process(self):
        """
        Run the full pipeline on `self.data`: spell check, lemmatization,
        stop-word removal and multi-word sequence extraction, in that order.
        """
        self.spellCheck()
        self.lemmatization()
        self.stopWordRemoval()
        self.multiWordSequence()

In [8]:
# Run the full preprocessing pipeline (the processor works on its own deep
# copy of data_matrix) and display the processed frame.
P = BuchananProcessor(data_matrix)
P.process()
P.data

Unnamed: 0,Concepto,Codigo,Respuesta
0,granito,terrestre,[tierra]
1,granito,texturas,[rugoso]
2,granito,material_construccion,[construcción]
3,granito,texturas,"[desagradable, tacto]"
4,granito,lastimar,[raspa]
...,...,...,...
31859,aptitud,inteligencia,[inteligencia]
31860,aptitud,necesario,"[condición, necesario]"
31861,aptitud,personalidad,[carácter]
31862,aptitud,personalidad,[personalidad]


In [9]:
# Random-draw baseline: for each held-out row, draw one element at random from
# the processed response list (column 2) and count a hit when it equals the
# row's code (column 1). The score of a fold is hits / rows in the fold.
kfold = KFold(n_splits=5)
buchanan_acc = np.zeros(5)
for fold, (train_index, test_index) in enumerate(kfold.split(data_matrix)):
    test = P.data.iloc[test_index, :]
    hits = 0

    for code, candidates in zip(test.iloc[:, 1], test.iloc[:, 2]):
        # Rows with an empty response list are skipped but still count in the
        # denominator.
        if len(candidates) > 0:
            hits += code == np.random.choice(candidates)
    buchanan_acc[fold] = hits / len(test)

In [10]:
# Mean and standard deviation of the per-fold values stored in buchanan_acc.
print(buchanan_acc.mean())
print(buchanan_acc.std())

0.1611538851374812
0.005434623644154913


In [11]:
# Same baseline with up to 3 random draws per row: a row is a hit as soon as
# one draw equals its code. Each draw is a separate np.random.choice call, so
# the same element can be drawn more than once.
kfold = KFold(n_splits=5)
buchanan_acc_top3 = np.zeros(5)
for fold, (train_index, test_index) in enumerate(kfold.split(data_matrix)):
    test = P.data.iloc[test_index, :]
    hits = 0

    for code, candidates in zip(test.iloc[:, 1], test.iloc[:, 2]):
        if len(candidates) > 0:
            # any() stops drawing at the first match.
            hits += any(code == np.random.choice(candidates) for attempt in range(3))
    buchanan_acc_top3[fold] = hits / len(test)

In [12]:
# Mean and standard deviation of the per-fold values stored in buchanan_acc_top3.
print(buchanan_acc_top3.mean())
print(buchanan_acc_top3.std())

0.1695018039951778
0.005360331345924188


In [13]:
# Baseline with up to 5 random draws per row: a row is a hit as soon as one
# draw equals its code. Each draw is a separate np.random.choice call, so the
# same element can be drawn more than once.
kfold = KFold(n_splits=5)
buchanan_acc_top5 = np.zeros(5)
for fold, (train_index, test_index) in enumerate(kfold.split(data_matrix)):
    test = P.data.iloc[test_index, :]
    hits = 0

    for code, candidates in zip(test.iloc[:, 1], test.iloc[:, 2]):
        if len(candidates) > 0:
            # any() stops drawing at the first match.
            hits += any(code == np.random.choice(candidates) for attempt in range(5))
    buchanan_acc_top5[fold] = hits / len(test)

# Mean and standard deviation across the folds.
print(buchanan_acc_top5.mean())
print(buchanan_acc_top5.std())

0.17304815247233873
0.0050296334506746265


In [14]:
# Baseline with up to 10 random draws per row: a row is a hit as soon as one
# draw equals its code. Each draw is a separate np.random.choice call, so the
# same element can be drawn more than once.
kfold = KFold(n_splits=5)
buchanan_acc_top10 = np.zeros(5)
for fold, (train_index, test_index) in enumerate(kfold.split(data_matrix)):
    test = P.data.iloc[test_index, :]
    hits = 0

    for code, candidates in zip(test.iloc[:, 1], test.iloc[:, 2]):
        if len(candidates) > 0:
            # any() stops drawing at the first match.
            hits += any(code == np.random.choice(candidates) for attempt in range(10))
    buchanan_acc_top10[fold] = hits / len(test)

# Mean and standard deviation across the folds.
print(buchanan_acc_top10.mean())
print(buchanan_acc_top10.std())

0.17543330310339966
0.005190773294644802


In [15]:
# Baseline with up to 50 random draws per row: a row is a hit as soon as one
# draw equals its code. Each draw is a separate np.random.choice call, so the
# same element can be drawn more than once.
kfold = KFold(n_splits=5)
buchanan_acc_top50 = np.zeros(5)
for fold, (train_index, test_index) in enumerate(kfold.split(data_matrix)):
    test = P.data.iloc[test_index, :]
    hits = 0

    for code, candidates in zip(test.iloc[:, 1], test.iloc[:, 2]):
        if len(candidates) > 0:
            # any() stops drawing at the first match.
            hits += any(code == np.random.choice(candidates) for attempt in range(50))
    buchanan_acc_top50[fold] = hits / len(test)

# Mean and standard deviation across the folds.
print(buchanan_acc_top50.mean())
print(buchanan_acc_top50.std())

0.1760609756181647
0.005045704353592308


In [16]:
# Restrict the baseline to rows whose processed response list has exactly one
# element; the per-fold score is still divided by all rows of the fold.
kfold = KFold(n_splits=5)
n1 = np.zeros(5)
for fold, (train_index, test_index) in enumerate(kfold.split(data_matrix)):
    test = P.data.iloc[test_index, :]
    hits = 0

    for code, candidates in zip(test.iloc[:, 1], test.iloc[:, 2]):
        if len(candidates) == 1:
            hits += code == np.random.choice(candidates)
    n1[fold] = hits / len(test)

In [17]:
# Mean of the per-fold values stored in n1.
n1.mean()

0.15020108963692463

In [18]:
# Standard deviation of the per-fold values stored in n1.
n1.std()

0.0061520292154100395

In [19]:
# Reference score on the unprocessed data: a row is a hit when its raw
# response (column 2 of data_matrix) is exactly equal to its code (column 1).
kfold = KFold(n_splits=5)
equal_acc = np.zeros(5)
for fold, (train_index, test_index) in enumerate(kfold.split(data_matrix)):
    original_test = data_matrix.iloc[test_index, :]
    hits = 0

    for code, response in zip(original_test.iloc[:, 1], original_test.iloc[:, 2]):
        if len(response) > 0:
            hits += code == response
    equal_acc[fold] = hits / len(original_test)

In [20]:
# Mean of the per-fold values stored in equal_acc.
equal_acc.mean()

0.12518766149842167

In [21]:
# Standard deviation of the per-fold values stored in equal_acc.
equal_acc.std()

0.013137870161923722