In [1]:
import nltk
import numpy as np
import pandas as pd
import spacy
import sys
import os
import vsm1 as vsm

from collections import defaultdict
from nltk import sent_tokenize

nltk.download("punkt")
from tqdm import tqdm

[nltk_data] Downloading package punkt to /home/fernando/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from preprocessing1 import build_cooccurrence_matrix

In [3]:
#paso 1: Preprocesamiento del corpus
def corpus_iterator(corpus_file):
    document = {
        "title": None,
        "body": None
    }

    with open(corpus_file, "r") as fh:
        for line in fh:
            if line.strip() == "-":
                new_document = True
                document = {
                    "title": None,
                    "body": None
                }
            elif new_document:
                document["title"] = line.strip()
                new_document = False
            else:
                document["body"] = line.strip()
                yield document

In [4]:
#paso 1: Preprocesamiento del corpus
nlp = spacy.load("es")
nlp.Defaults.stop_words |= {"a","e",'y','o','u'}
with open("./lavozprocess.conll", "w") as fh:
    for document in tqdm(corpus_iterator("./lavoztextodump1.txt")):
        for sentence in sent_tokenize(document["body"], language="spanish"):
            for token in nlp(sentence):
                if len(token) == 1:
                    continue
                if token.is_stop:
                    continue
                if not token.is_alpha:
                    continue
                print(token.text.lower(), token.lemma_, token.pos_, token.dep_, token.head,
                      file=fh)
            print(file=fh)
        print("="*80, file=fh)
        
    

12947it [45:01,  4.92it/s]


In [5]:
#paso 1: Preprocesamiento del corpus
def conll_iterator(file):
    context = []
    with open(file, "r") as fh:
        for line in fh:
            if line.strip() == "":
                yield context
                context = []
            elif line.strip() == ("=" * 80):
                continue
            else:
                (token, lemma, pos, dep, head) = line.strip().split()
                context.append(token)

for corpus in conll_iterator("./lavozprocess.conll"):
    print(corpus)
    break

['clara', 'crespo', 'rodolfo', 'martínez', 'imaginan']


In [7]:
context = []
with open("./lavozprocess.conll", "r") as fh:
    for line in fh:
        if line.strip() == "":
            continue
        elif line.strip() == ("=" * 80):
            continue
        else:
            (token, lemma, pos, dep, head) = line.strip().split()
            context.append(token)

key_words = {}
wid = 0
for d in context:
    if len(d) > 0:
        key_words[d] = wid
        wid += 1

In [10]:
#paso 2: representación vectorial de las palabras y diseño de modelo de matriz
lavoz_5window_scaled = build_cooccurrence_matrix(
    corpus=conll_iterator("./lavozprocess.conll"),
    window_size=5,
    scale_factor="scaled",
    vocab_size=5000,
    unkown_vector=True
)



In [11]:
lavoz_5window_scaled.head()

Unnamed: 0,abajo,abandonar,abandono,abastecimiento,abierta,abiertas,abierto,abiertos,abogado,abogados,...,índices,ómnibus,órdenes,órgano,órganos,única,único,únicos,útil,UNK
abajo,0.5,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.25
abandonar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.25
abandono,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,1.0
abastecimiento,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.333333
abierta,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2


In [None]:
#paso 2: representación vectorial de las palabras y diseño de modelo de matriz
lavoz_20window_scaled = build_cooccurrence_matrix(
    corpus=conll_iterator("./lavozprocess.conll"),
    window_size=20,
    scale_factor="scaled",
    vocab_size=5000,
    unkown_vector=True
)

In [None]:
#paso 2: representación vectorial de las palabras y diseño de modelo de matriz

#fout= open('./lavozconlliterado.txt', 'w')
#for corpus in conll_iterator("./lavozprocess.conll"):
#    print(corpus, file=fout)


#from sklearn.feature_extraction import DictVectorizer
#v = DictVectorizer(sparse=False)
#matrix = v.fit_transform("./lavozconlliterado.txt")


In [None]:
#paso 3: medida de la distancia de las palabras 
#Explorar al vecino medainte distancia conseno
vsm.neighbors("Argentina", lavoz_5window_scaled, distfunc=vsm.cosine).head()

In [None]:
#paso 3: medida de la distancia de las palabras
#Explorar al vecino mediante diatancia euclídea
vsm.neighbors("Argentina", lavoz_5window_scaled, distfunc=vsm.euclidean).head()


In [None]:
#paso 3: medida de la distancia de las palabras
#método de reponderación observado/esperado
lavoz_oe = vsm.observed_over_expected(lavoz_5window_scaled)
vsm.neighbors("Argentina", lavoz_oe).head()

In [None]:
#paso 3: medida de la distancia de las palabras
#método de reponderación TfIDF
from sklearn.feature_extraction.text import TfidfVectorizer 

vectorizer = TfidfVectorizer(max_features=5000)
corpus = lavoz_5window_scaled
vectorized_corpus = vectorizer.fit_transform(corpus)

In [None]:
#vectorizer.vocabulary_

In [None]:
#paso 3: medida de la distancia de las palabras
#método de reponderación PMI
lavoz_ppmi = vsm.pmi(lavoz_5window_scaled, positive=True)
vsm.neighbors("Argentina", lavoz_ppmi).head()

In [13]:
#paso 3: medida de la distancia de las palabras
#método de reducción de dimensionalidad LSA
lavoz_lsa = vsm.lsa(lavoz_5window_scaled, k=100)
#vsm.neighbors("Argentina", lavoz_lsa).head()

In [14]:
from sklearn.cluster import KMeans

In [15]:
#paso 4: clusterizar

#from sklearn.cluster import KMeans


def clustering2(k):
    kmeans = KMeans(n_clusters=k, random_state=0).fit(lavoz_lsa)
    clusters = kmeans.predict(lavoz_lsa)
    return clusters


In [None]:
kmeans = KMeans(n_clusters=20, random_state=0).fit(lavoz_lsa)

kmeans.labels_

In [None]:
c10= clustering2(10)
printer = [word for word in key_words if c10[key_words[word]] == c10[key_words["abrir"]]]
print(printer)


In [None]:
#paso 4: clusterizar
def clustering(k):
    kmeans = KMeans(n_clusters=k, random_state=0).fit(vectorized_corpus.T)
    clusters = kmeans.predict(vectorized_corpus.T)
    return clusters


In [16]:
c50 =  clustering2(50)
c50

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [17]:
printer = [word for word in key_words if c50[key_words[word]] == c50[key_words["internacional"]]]
print(printer)

IndexError: index 2376352 is out of bounds for axis 0 with size 5001

In [None]:
c150 =  clustering(150)
c150

In [None]:
printer = [word for word in vectorizer.vocabulary_ if c150[vectorizer.vocabulary_[word]] == c150[vectorizer.vocabulary_["internacional"]]]
print(printer)

In [None]:
c300 =  clustering(300)
c300

In [None]:
printer = [word for word in vectorizer.vocabulary_ if c300[vectorizer.vocabulary_[word]] == c300[vectorizer.vocabulary_["internacional"]]]
print(printer)

In [None]:
printer = [word for word in vectorizer.vocabulary_ if c50[vectorizer.vocabulary_[word]] == c50[vectorizer.vocabulary_["argentina"]]]
print(printer)

In [None]:
printer = [word for word in vectorizer.vocabulary_ if c150[vectorizer.vocabulary_[word]] == c150[vectorizer.vocabulary_["argentina"]]]
print(printer)

In [None]:
printer = [word for word in vectorizer.vocabulary_ if c300[vectorizer.vocabulary_[word]] == c300[vectorizer.vocabulary_["argentina"]]]
print(printer)