# Analyzing Documents

## Importamos librerías

In [2]:
import Bonche
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from sklearn.cluster import KMeans

## Defino el problema a resolver

In [3]:
PROBLEMA = "problema1"

## Obtengo los datos tanto de prueba como validación

In [4]:
getdata = Bonche.ObtenerDatos(PROBLEMA)
clusters = getdata.ObtenerClusters(PROBLEMA)
X_train = getdata.Data_train(PROBLEMA)
X_valid = getdata.Data_valid(PROBLEMA)
Y_train = getdata.TargetTrain(PROBLEMA)
Y_valid = getdata.TargetValid(PROBLEMA)

## Imprimo forma del problema

In [9]:
print("Problema: ", PROBLEMA)
print("Num Clusters: ", clusters)
print("Num text for train: ", len(X_train))
print("Num text for valid: ", len(X_valid))
print("Num ground truth for train", len(Y_train))
print("Num ground truth for valid", len(Y_valid))

Problema:  problema1
Num Clusters:  35
Num text for train:  40
Num text for valid:  10
Num ground truth for train 40
Num ground truth for valid 10


## Inicializo el vectorizador

In [10]:
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,3), max_df=0.5, max_features=100, norm='l2', min_df=1, stop_words='english', use_idf=True)

## Cargo mis datos de entrenamiento al vectorizador

In [11]:
X = vectorizer.fit_transform(X_train)

## Inicializo K-means

In [12]:
km = KMeans(n_clusters=clusters, init='k-means++', max_iter=100, n_init=1, verbose=False)

## Cargo mis datos vectorizados a k-means

In [13]:
km.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=35, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=False)

## Imprimo métricas de rendimiento

In [14]:
print("\nHomogeneity: %0.3f" % metrics.homogeneity_score(Y_train, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(Y_train, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(Y_train, km.labels_))
print("Silhouette Coefficient: %0.3f\n" %metrics.silhouette_score(X, km.labels_))


Homogeneity: 0.942
Completeness: 0.866
V-measure: 0.902
Silhouette Coefficient: 0.042



## Obtengo elementos para mostrar datos de importancia

In [15]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

## Imprimo elemento relevantes

In [16]:
for i in range(clusters):
	print("Cluster %d: "%i, end='')
	for ind in order_centroids[i, :10]:
		print(' %s ' %terms[ind], end='')
	print()

Cluster 0:  cot  sco  tis  epe  sal  oni  emb  vot  sts  rli 
Cluster 1:  iba  ats  alk  nit  fri  t's  ela   ru  it'   mp 
Cluster 2:  n’t  ’t   ’s   bla  rli  mmi  ela  tia  epe  iat 
Cluster 3:  iba  nit  lls  ism  ail  bal  vot  pas  nfe  muc 
Cluster 4:  ’s   orn  loc  tua  ’t   n’t  201  tch  bat  bor 
Cluster 5:  gg   egg  n't  't   ib   t's   ru   ed   au  nit 
Cluster 6:  oal  coa  ib   muc  tia  rva  sts  nt,  vis  tru 
Cluster 7:  egg  tax  gg   ib   ibe  vie  sal  ax    fl  iat 
Cluster 8:  ibe  ocr  moc  coa  ats  oal  rva  iam  rsh  soc 
Cluster 9:  bla  ark  tia  n't  acc  't   cot  orn  epe  t's 
Cluster 10:  bat   mp  ben  lls  ocr  roo  moc  't   ker  n't 
Cluster 11:  tua  vis  nfe  lve  spi  r's  't   n't  ora  tis 
Cluster 12:  ib   vot  't   n't  dy    ru  ats  egg  nom  loc 
Cluster 13:  iam  rli  xt   tis  ’t   n’t  cot  usi  sco  sts 
Cluster 14:  mmi  t's  riv  acc   uk  icy  vie  tia  bat  sha 
Cluster 15:   uk   mp  ’s   roc  mmi  cro  vot  uth  bat  roo 
Cl