In [2]:
import pandas as pd
import numpy as np
import spacy
import random
import es_core_news_sm

# Seed both global generators: later cells draw from np.random, which
# random.seed() alone does not affect.
random.seed(0)
np.random.seed(0)

In [3]:
# Load the coded study responses from the Excel workbook.
data = pd.read_excel(io=r'../data/Datos Codificados Estudio v2a.xlsx')

In [4]:
# Keep only the three columns used downstream, in this order:
#   position 2 -> cue/concept
#   position 6 -> codification
#   position 3 -> description of the cue
data_matrix = pd.concat(
    [data.iloc[:, position] for position in (2, 6, 3)],
    axis=1,
)

In [5]:
# Display the assembled cue / codification / description table.
data_matrix

Unnamed: 0,Palabra (concepto),Codificación,Descripción
0,Compasión,sentimiento,sentimiento
1,Compasión,perdón,por lo que se le perdona la vida a alguien en ...
2,plan,objetivos,necesario para cumplir objetivos
3,plan,organización,organizar recursos o personas
4,plan,estrategia,estrategia
...,...,...,...
4933,Obligación,imposición,imposicion
4934,Obligación,extricto,carácter extricto
4935,Obligación,normas,normas
4936,Obligación,deber,deberes


In [32]:
from sklearn.model_selection import KFold

# Split the rows into 5 folds (KFold does not shuffle by default) and write
# each held-out fold to test1.csv ... test5.csv.
kfold = KFold(n_splits=5)
for counter, (train_index, test_index) in enumerate(kfold.split(data_matrix), start=1):
    test = data_matrix.iloc[test_index, :]
    test.to_csv(r"../data/data_buchanan/test{}.csv".format(counter))


In [43]:
# Random-guess baseline: for every fold, compare each human codification
# against a term drawn uniformly at random from that fold's result file and
# record the fraction of exact matches.
rng = np.random.default_rng(0)  # local seeded generator so the score is reproducible

buchanan_acc = []
for fold in range(1, 6):
    test = pd.read_csv(r'../data/data_buchanan/test{}.csv'.format(fold))
    test_result = pd.read_csv(r'../data/data_buchanan/resultados/multi.nostop.lemmas_test{}.csv'.format(fold))

    # Work on plain arrays so the comparison is positional and does not
    # depend on the index labels of the CSV that was read back.
    truth = test['Codificación'].to_numpy()
    guesses = rng.choice(test_result['combined.lemmas'].to_numpy(), size=len(truth))
    buchanan_acc.append(float(np.mean(truth == guesses)))

np.mean(buchanan_acc)

0.0006076976401724442

In [6]:
# Fix the Buchanan pipeline:
# derive the codifications directly from each concrete description.
# E.g. description: "sentimiento"  =>  codification: "sentimiento"
from hunspell import Hunspell
import treetaggerwrapper
import regex


  punct2find_re = re.compile("([^ ])([[" + ALONEMARKS + "])",
  DnsHostMatch_re = re.compile("(" + DnsHost_expression + ")",
  UrlMatch_re = re.compile(UrlMatch_expression, re.VERBOSE | re.IGNORECASE)
  EmailMatch_re = re.compile(EmailMatch_expression, re.VERBOSE | re.IGNORECASE)


In [12]:
class BuchananProcessor:
    """
    Pre-processing pipeline for the description column of the study data:
    spell checking, lemmatization, stop-word removal and extraction of
    multi-word sequences.

    The description text is read from the third column (position 2) of the
    frame and the processed values are stored in the 'Descripción' column.
    """

    # Every row produced by multiWordSequence is padded with "" up to this
    # many entries; rows that need more entries simply grow beyond it.
    _MIN_SLOTS = 15

    def __init__(self, data):
        """
        Keeps a deep copy of `data` (the caller's frame is never modified)
        and loads the spaCy model, the Hunspell dictionary and the TreeTagger
        used by the processing steps.
        """
        self.data = data.copy(deep=True)
        self.nlp = spacy.load('es_core_news_sm')
        self.h = Hunspell('es_CL', hunspell_data_dir='../dict')
        self.tagger = treetaggerwrapper.TreeTagger(TAGLANG='es')

    def _sentenceCheck(self, text):
        """
        Receives a string and returns it lower-cased, without punctuation
        tokens, and with every misspelled word replaced by the first Hunspell
        suggestion (words without suggestions are kept unchanged).
        """
        # Drop punctuation and bare-space tokens, lower-case the rest
        tokens = [
            t.orth_.lower()
            for t in self.nlp(text)
            if not (t.is_punct or t.orth_ == " ")
        ]

        corrected = []
        for token in tokens:
            if self.h.spell(token):
                corrected.append(token)
                continue
            # Ask for suggestions only once per misspelled token
            suggestions = self.h.suggest(token)
            corrected.append(suggestions[0] if suggestions else token)

        # Re-join the words in one string
        return " ".join(corrected)

    def _lemmatize(self, text):
        """
        Receives a string and returns a list with one tag per word; each tag
        holds the word, its POS tag and its lemma (in that order).
        """
        return list(treetaggerwrapper.make_tags(self.tagger.tag_text(text), exclude_nottags=True))

    def _isStopWord(self, lemma):
        """
        Returns True when the first spaCy token of `lemma` is a stop word.
        A lemma that yields no tokens (e.g. an empty string) is not a stop word.
        """
        doc = self.nlp(lemma)
        return len(doc) > 0 and doc[0].is_stop

    def spellCheck(self):
        """
        Check the spelling of each description (third column), in place.
        """
        for row in range(len(self.data)):
            self.data.iloc[row, 2] = self._sentenceCheck(self.data.iloc[row, 2])

    def lemmatization(self):
        """
        Replace the column 'Descripción' with the list of tags of each
        description (third column).
        """
        self.data['Descripción'] = [self._lemmatize(text) for text in self.data.iloc[:, 2]]

    def stopWordRemoval(self):
        """
        Remove every tag whose lemma is a stop word from the column
        'Descripción'.
        """
        # Build the whole column and assign it once: chained assignment
        # (data[col][i] = ...) is unreliable and looks rows up by label.
        self.data['Descripción'] = [
            [tag for tag in tags if not self._isStopWord(tag[2])]
            for tags in self.data['Descripción']
        ]

    def multiWordSequence(self):
        """
        Replace each tag list in 'Descripción' with a list of strings: the
        lemmas, followed by the multi-word sequences found via POS patterns,
        padded with "" to at least 15 entries.
        patterns:
         * adverb-adjective
         * verb-noun
         * verb-adjective-noun
        """
        patterns = {
            "adv-adj": (r'ADV',r'ADJ'),
            "verb-noun":(r'V\w+', r'^(N(P|C))'),
            "verb-adj-noun":(r'V\w+', r'ADJ',r'^(N(P|C))')
        }
        # Checked in this order at every position, so the output order of
        # the two-word sequences stays adv-adj before verb-noun.
        pair_patterns = (patterns['adv-adj'], patterns['verb-noun'])
        triple = patterns['verb-adj-noun']

        new_column = []
        for tags in self.data['Descripción']:
            # Each tag is (word, pos, lemma): start with the plain lemmas
            entries = [tag[2] for tag in tags]

            # Two-word sequences
            for i in range(len(tags) - 1):
                for first, second in pair_patterns:
                    if regex.match(first, tags[i][1]) and regex.match(second, tags[i+1][1]):
                        entries.append(" ".join((tags[i][2], tags[i+1][2])))

            # Three-word sequences
            for i in range(len(tags) - 2):
                if (regex.match(triple[0], tags[i][1]) and
                    regex.match(triple[1], tags[i+1][1]) and
                    regex.match(triple[2], tags[i+2][1])
                ):
                    entries.append(" ".join((tags[i][2], tags[i+1][2], tags[i+2][2])))

            # Pad short rows; a negative count adds nothing, so long rows
            # keep all their entries instead of overflowing a fixed buffer.
            entries.extend([""] * (self._MIN_SLOTS - len(entries)))
            new_column.append(entries)

        self.data['Descripción'] = new_column

    def process(self):
        """
        Run the full pipeline: spell check, lemmatization, stop-word removal
        and multi-word sequence extraction, in that order.
        """
        self.spellCheck()
        self.lemmatization()
        self.stopWordRemoval()
        self.multiWordSequence()

In [13]:
# Run the full pipeline (spell check -> lemmatization -> stop-word removal ->
# multi-word sequences) on a copy of data_matrix and show the processed frame.
P = BuchananProcessor(data_matrix)
P.process()
P.data

Unnamed: 0,Palabra (concepto),Codificación,Descripción
0,Compasión,sentimiento,"[sentimiento, , , , , , , , , , , , , , ]"
1,Compasión,perdón,"[él|le, perdonar, vida, a, alguien, película, ..."
2,plan,objetivos,"[necesario, cumplir, objetivo, cumplir objetiv..."
3,plan,organización,"[organizar, recurso, o, persona, organizar rec..."
4,plan,estrategia,"[estrategia, , , , , , , , , , , , , , ]"
...,...,...,...
4933,Obligación,imposición,"[imposición, , , , , , , , , , , , , , ]"
4934,Obligación,extricto,"[carácter, extracto, , , , , , , , , , , , , ]"
4935,Obligación,normas,"[norma, , , , , , , , , , , , , , ]"
4936,Obligación,deber,"[deber, , , , , , , , , , , , , , ]"


In [None]:
# Show the processor's working DataFrame.
P.data

Unnamed: 0,Palabra (concepto),Codificación,Descripción
0,Compasión,sentimiento,sentimiento
1,Compasión,perdón,por lo que se le perdona la vida a alguien en ...
2,plan,objetivos,necesario para cumplir objetivos
3,plan,organización,organizar recursos o personas
4,plan,estrategia,estrategia
...,...,...,...
4933,Obligación,imposición,imposicion
4934,Obligación,extricto,carácter extricto
4935,Obligación,normas,normas
4936,Obligación,deber,deberes


In [89]:
# Scratch check of the lemma extraction: for each row, collect the lemma
# (third field of every tag) into a list padded with "" to at least 15 slots.
# NOTE(review): relies on a 'lemma' column in P.data that no cell shown here
# creates — confirm it exists before re-running this cell.
for tags in P.data['lemma']:
    temp = [tag[2] for tag in tags]
    # A negative count adds nothing, so rows with more than 15 lemmas are kept whole
    temp.extend([""] * (15 - len(temp)))
    print(temp)

['sentimiento', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['él|le', 'perdonar', 'vida', 'a', 'alguien', 'película', '', '', '', '', '', '', '', '', '']
['necesario', 'cumplir', 'objetivo', '', '', '', '', '', '', '', '', '', '', '', '']
['organizar', 'recurso', 'o', 'persona', '', '', '', '', '', '', '', '', '', '', '']
['estrategia', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['capacidad', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['diferenciar', 'persona', '', '', '', '', '', '', '', '', '', '', '', '', '']
['permitir', 'cumplir', 'meta', '', '', '', '', '', '', '', '', '', '', '', '']
['blando', 'o', 'duro', '', '', '', '', '', '', '', '', '', '', '', '']
['generar', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['utilidad', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['diferencia', 'costo', 'y', 'precio', 'venta', '', '', '', '', '', '', '', '', '', '']
['contrario', 'a', 'mentira', '', '', '', '', '', '', '', '', '', '

In [None]:
from sklearn.model_selection import KFold

kfold=KFold(n_splits=5)
for train_index, test_index in kfold.split(data_matrix):
    