In [1]:
import nltk
import numpy as np
import pandas as pd
import spacy
import sys
import os

from collections import defaultdict
from nltk import sent_tokenize
from tqdm import tqdm_notebook
from preprocessing1 import build_cooccurrence_matrix

#nltk.download("punkt")

In [2]:
def conll_iterator(file):
    context = []
    with open(file, "r") as fh:
        for line in fh:
            if line.strip() == "":
                yield context
                context = []
            elif line.strip() == ("=" * 80):
                continue
            else:
                (token, lemma, pos, dep, head,verificador_entidad) = line.strip().split()
                
                context.append({
                    "token": token,
                    "lemma": lemma,
                    "pos": pos,
                    "dep": dep,
                    "head": head,
                    "verificador_entidad": verificador_entidad
                })

In [3]:
def group_list_of_entities(list_of_entities):
    entity_group = []
    
    for entity in list_of_entities:
        if len(entity_group) == 0:
            # Si no hay nadie como parte del grupo, entonces agregar
            # la entidad actual como primer elemento
            entity_group.append(entity)
        elif ((entity[0] - entity_group[-1][0] == 1) and
              (entity[1]["verificador_entidad"] == entity_group[-1][1]["verificador_entidad"])):
            # Si el indice en el contexto de la palabra actual es continuo al de la
            # ultima palabra del grupo de entidades, y son del mismo tipo de entidad
            # entonces se agrega la palabra al grupo
            entity_group.append(entity)
        else:
            # En caso contrario hay un cambio de entidades, devuelvo el grupo
            # actual y lo reseteo con la entidad actual que es nueva
            yield entity_group
            entity_group = [entity]
    
    if len(entity_group) != 0:
        # Si al finalizar el recorrido de las entidades todavia hay acumulada
        # informacion de un grupo de entidades, entonces devolverla
        yield entity_group

In [4]:
entities_features = {}

for fname in os.listdir("infoleg_anotaciones"):
    # Para cada archivo conll de mi directorio corro el iterador
    conll_file = os.path.join("infoleg_anotaciones", fname)

    for context in conll_iterator(conll_file):
        # Se busca la lista de entidades
        list_of_entities = [
            (idx, word) for idx, word in enumerate(context) 
            if word["verificador_entidad"] != "O"
        ]

        if len(list_of_entities) == 0:
            # Si el contexto no tiene entidades, se lo ignora
            continue

        for entity_group in group_list_of_entities(list_of_entities):
            # Por cada grupo de entidades (formado por 1 o mas palabras)
            # genero features para dicha entidad
            
            # Lo primero es buscar el nombre de la entidad en mi diccionario
            # de features de entidades, y si no existe crearlo
            entity_lemma = ""
            for idx, word in entity_group:
                entity_lemma += word["lemma"] + " "
            entity_lemma = entity_lemma.strip().lower()

            if entity_lemma in entities_features:
                entity_features = entities_features[entity_lemma]
            else:
                entity_features = {}

            # Por cada una de las palabras de mi entidad genero una
            # serie de features con informacion de esas palabras
            features = []
            for idx, word in enumerate(entity_group):
                features.append("token__{}__{}".format(idx, word[1]["token"]))
                features.append("pos__{}__{}".format(idx, word[1]["pos"]))
                features.append("dep__{}__{}__{}".format(idx, word[1]["dep"], word[1]["head"]))

            # Por cada uno de los features creados, le sumo 1 a la cantidad de ocurrencias
            for feature in features:
                if feature not in entity_features:
                    entity_features[feature] = 0
                entity_features[feature] += 1

            start_idx = entity_group[0][0]
            end_idx = entity_group[-1][0]

            # Si la entidad no esta al inicio de la oracion busco informacion
            # de la palabra inmediatamente anterior al inicio de la entidad
            if start_idx > 0:
                previous_word = context[start_idx-1]
                previous_word_lemma = context[start_idx-1]["lemma"]
                features = [
                    "previous_token__{}".format(previous_word["token"]),
                    "previous_pos__{}".format(previous_word["pos"]),
                    "previous_lemma__{}".format(previous_word_lemma)
                ]

                for feature in features:
                    if feature not in entity_features:
                        entity_features[feature] = 0
                    entity_features[feature] += 1

            # Similar al caso anterior, pero con la palabra inmediatamente posterior
            if end_idx < len(context) - 1:
                next_word = context[end_idx+1]
                next_word_lemma = context[end_idx+1]["lemma"]
                features = [
                    "next_token__{}".format(next_word["token"]),
                    "next_pos__{}".format(next_word["pos"]),
                    "next_lemma__{}".format(next_word_lemma)
                ]

                for feature in features:
                    if feature not in entity_features:
                        entity_features[feature] = 0
                    entity_features[feature] += 1

            # Actualizo mi diccionario de features de entidades
            entities_features[entity_lemma] = entity_features

In [5]:
entity_to_index = {entity_name: index 
                   for index, entity_name in enumerate(sorted(entities_features))}

In [6]:
list_of_entities_features = [
    entities_features[entity_name] for entity_name in sorted(entities_features)
]

In [7]:
list_of_entities_features[entity_to_index["24.043"]] == entities_features["24.043"]

True

In [8]:
# Llamar al dict vectorizer sobre list_of_entities_features
from sklearn.feature_extraction import DictVectorizer

vectorizer = DictVectorizer()
matrix_of_entities = vectorizer.fit_transform(list_of_entities_features)

In [9]:
# Acceder a la representacion vectorial de la entidad "24.043"
matrix_of_entities[entity_to_index["24.043"]].toarray()

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 

In [10]:
import vsm1 as vsm

infoleg_oe = vsm.observed_over_expected(matrix_of_entities)


In [11]:
infoleg_ppmi = vsm.pmi(matrix_of_entities, positive=True)

In [12]:
pd.DataFrame(infoleg_ppmi, index=entity_to_index.keys())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,405,406,407,408,409,410,411,412,413,414
. 56/09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24.043,5.436629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"24.411 ,",0.0,0.0,0.0,0.0,0.0,5.148947,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
art.1 ° de lo resolución n ° 3227/2005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.232656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
artículo . 1 ° de lo ley n ° 26.178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.050335,0.0,...,0.0,4.050335,0.0,0.0,0.0,0.0,4.050335,0.0,4.050335,0.0
codigo sanitario para los animales terrestres de lo organizacion mundial de sanidad animal ( oie ),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.702028,3.702028,0.0,0.0,0.0,0.0,3.702028
"código civil ,",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
decreto n ° 27/2018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
decreto n ° 565/2008,0.0,0.0,0.0,0.0,4.743482,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
decreto n ° 903/2014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
from sklearn.cluster import KMeans

def clustering2(k):
    kmeans = KMeans(n_clusters=k, random_state=0).fit(infoleg_ppmi)
    clusters = kmeans.predict(infoleg_ppmi)
    return clusters

In [14]:
kmeans = KMeans(n_clusters=10, random_state=0).fit(infoleg_oe)

kmeans.labels_

array([5, 8, 2, 1, 1, 1, 9, 1, 1, 1, 1, 1, 7, 4, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 6, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0], dtype=int32)

In [15]:
c10= clustering2(10)
printer = [word for word in entity_to_index if c10[entity_to_index[word]] == c10[entity_to_index["ley n ° 26.993"]]]
print(printer)

['. 56/09', '24.411 ,', 'código civil ,', 'decreto n ° 27/2018', 'decreto n ° 565/2008', 'decreto n ° 903/2014', 'decreto nacional nº 270/97', 'ley n ° 26.361', 'ley n ° 26.993', 'ley n ° 26.994', 'ley n ° 27.077', 'ley n ° 27.250', 'ley n ° 27.265', 'ley n ° 27.266', 'nota externa n ° 40/2009', 'nota externa n ° 63/2009', 'resolución general n ° 2040/2006', 'resolución general n ° 3397/2012', 'resolución general n ° 3880/2016', 'resolución general n ° 3984/2017', 'resolución general n ° 4035/2017', 'resolución general nº 2755/2010', 'resolución n ° 1/2018', 'resolución n ° 1427/2009', 'resolución n ° 15/2009', 'resolución n ° 17/2010', 'resolución n ° 30/2012', 'resolución n ° 80/2009', 'resolución n ° 91/2011', 'resolución nº 28/2011']


In [16]:

def clustering(k):
    kmeans = KMeans(n_clusters=k, random_state=0).fit(matrix_of_entities)
    clusters = kmeans.predict(matrix_of_entities)
    return clusters

In [17]:
kmeans = KMeans(n_clusters=10, random_state=0).fit(matrix_of_entities)
kmeans.labels_

array([8, 8, 8, 8, 8, 8, 8, 2, 2, 2, 3, 8, 8, 9, 1, 0, 0, 0, 2, 2, 2, 8,
       5, 5, 8, 8, 8, 6, 6, 8, 7, 4, 6, 6, 6, 6, 2, 2, 2, 2, 2, 2, 2, 2,
       8], dtype=int32)

In [18]:
c10= clustering(10)
printer = [word for word in entity_to_index if c10[entity_to_index[word]] == c10[entity_to_index["ley n ° 26.993"]]]
print(printer)

['ley n ° 26.993', 'ley n ° 26.994', 'ley n ° 27.077']


In [19]:
printer = [word for word in entity_to_index if c10[entity_to_index[word]] == c10[entity_to_index["artículo . 1 ° de lo ley n ° 26.178"]]]
print(printer)

['. 56/09', '24.043', '24.411 ,', 'art.1 ° de lo resolución n ° 3227/2005', 'artículo . 1 ° de lo ley n ° 26.178', 'codigo sanitario para los animales terrestres de lo organizacion mundial de sanidad animal ( oie )', 'código civil ,', 'decreto nacional nº 270/97', 'escritural nº 11 - 85 - 3807/80—', 'ley nº 24.568', 'ley nº 25.054 ,', 'ley nº 25.054 .', 'ley nº 25.660 .', 'partir 5.1.7 . 2071 ,', 'resolución nº 91/11 ( dga ) ,']


In [20]:
c20= clustering(20)
printer = [word for word in entity_to_index if c20[entity_to_index[word]] == c20[entity_to_index["ley n ° 26.993"]]]
print(printer)

['ley n ° 26.993']


In [21]:
printer = [word for word in entity_to_index if c20[entity_to_index[word]] == c20[entity_to_index["artículo . 1 ° de lo ley n ° 26.178"]]]
print(printer)

['artículo . 1 ° de lo ley n ° 26.178']


In [22]:
c5= clustering(5)
printer = [word for word in entity_to_index if c10[entity_to_index[word]] == c10[entity_to_index["ley n ° 26.993"]]]
print(printer)

['ley n ° 26.993', 'ley n ° 26.994', 'ley n ° 27.077']


In [23]:
kmeans = KMeans(n_clusters=20, random_state=0).fit(matrix_of_entities)
kmeans.labels_

array([12, 12, 12, 12, 19, 16, 11,  0,  0, 18,  2,  0, 12, 17,  1, 10,  7,
        3,  0,  0,  0,  0,  5, 13, 11, 11, 11, 14, 14, 11,  8,  4,  9,  9,
        9,  9, 15, 15,  6,  6, 15,  6,  6,  6,  6], dtype=int32)