In [1]:
from __future__ import division
from sklearn.cluster import KMeans
from numbers import Number
from pandas import DataFrame
import sys, codecs, numpy



In [2]:
class autovivify_list(dict):
  '''Dict subclass that fills in an empty list for any missing key.

  Intended as a pickleable stand-in for collections.defaultdict(list).
  '''

  def __missing__(self, key):
    '''Store a new empty list under the absent key and return that list'''
    self[key] = fresh_list = []
    return fresh_list

  def __add__(self, x):
    '''Empty instance + number gives the number; anything else raises ValueError'''
    # Only an empty mapping may stand in for a numeric zero.
    if self or not isinstance(x, Number):
      raise ValueError
    return x

  def __sub__(self, x):
    '''Empty instance - number gives the negated number; anything else raises ValueError'''
    # Same rule as __add__: a populated mapping or non-number is rejected.
    if self or not isinstance(x, Number):
      raise ValueError
    return -1 * x

In [3]:
def build_word_vector_matrix(vector_file, n_words):
  '''Return the vectors and labels for the first n_words in vector file

  Each non-blank line of vector_file is split on whitespace: the first token
  is taken as the label and the remaining tokens are parsed as floats.

  vector_file: path to a UTF-8 text file, one entry per line
  n_words: maximum number of entries to read; fewer are returned if the
    file has fewer non-blank lines

  Returns a tuple (matrix, labels) where matrix is a numpy array built from
  the per-line vectors and labels is the list of first tokens in file order.

  Raises ValueError if a token after the label cannot be parsed as a float.
  '''
  numpy_arrays = []
  labels_array = []
  with codecs.open(vector_file, 'r', 'utf-8') as f:
    for r in f:
      # Check the limit before consuming the row so that exactly n_words
      # entries are returned (checking after the append yields n_words + 1).
      if len(labels_array) >= n_words:
        break
      sr = r.split()
      if not sr:
        # Blank lines carry no label or vector and do not count toward n_words.
        continue
      labels_array.append(sr[0])
      numpy_arrays.append( numpy.array([float(i) for i in sr[1:]]) )

  return numpy.array( numpy_arrays ), labels_array

In [4]:
def find_word_clusters(labels_array, cluster_labels):
  '''Group words by cluster: map each cluster label to the list of words assigned to it.

  The word at position k in labels_array is filed under cluster_labels[k].
  '''
  words_by_cluster = autovivify_list()
  for position, cluster_id in enumerate(cluster_labels):
    word = labels_array[position]
    # A missing cluster_id is auto-created as an empty list by autovivify_list.
    words_by_cluster[cluster_id].append(word)
  return words_by_cluster

In [6]:
# Driver: load the first n_words vectors, cluster them with k-means and
# print the words that fall into each cluster.
if __name__ == "__main__":
  input_vector_file = "glove.42B.300d.txt" # Vector file input (e.g. glove.6B.300d.txt)
  n_words = 10000 # Number of words to analyze
  reduction_factor = .1 # Ratio of clusters to words, in (0, 1]
  n_clusters = int( n_words * reduction_factor ) # Number of clusters to make
  random_seed = 0 # Fixed seed so rerunning the cell reproduces the same clustering

  # NOTE: despite its name, df is the numpy matrix returned by
  # build_word_vector_matrix, not a pandas DataFrame.
  df, labels_array = build_word_vector_matrix(input_vector_file, n_words)

  # Without random_state the k-means++ initialisation is drawn from the global
  # numpy random state, so the printed clusters would differ between runs.
  kmeans_model = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10,
                        random_state=random_seed)
  kmeans_model.fit(df)

  cluster_labels = kmeans_model.labels_
  cluster_inertia = kmeans_model.inertia_ # Not used below; kept for inspection
  cluster_to_words = find_word_clusters(labels_array, cluster_labels)

  # One list of words per cluster, each followed by blank lines.
  for c in cluster_to_words:
    print(cluster_to_words[c])
    print("\n")

[',', '.', 'and', '"', 'for', 'on', '-', 'with', 'by', 'at', '...', 'all', 'more', ';', "'", '&', 'home', 'work', '--', 'long', 'set', 'full', 'working', 'special', 'together', 'share', 'along', 'short', 'works', 'field', 'fine']


['the', 'of', 'a', 'in', 'is', "'s", 'this', 'from', 'an', 'my', 'one', 'new', 'his', 'their', 'our', 'first', 'into', 'its', 'best', 'world', 'part', 'each', 'same', 'own', 'every', 'top', 'another', 'place', 'between', 'point', 'whole', 'main', 'entire']


['to', 'up', 'out', 'me', 'get', 'them', 'us', 'then', 'back', 'go', 'him', 'right', 'take', 'off', 'down', 'come', 'again', 'give', 'let', 'put', 'away', 'left', 'save', 'stop', 'leave', 'turn', 'move', 'follow', 'bring', 'rest', 'break']


[':', 'information', 'reviews', 'details', 'info', 'map', 'description', 'profile', 'id', 'zip', 'testimonials']


['i', '?', "n't", 'do', 'what', 'like', 'just', 'would', 'how', 'see', 'know', 'did', 'could', "'m", 'think', 'does', 'really', 'want', "'ve", 'why', "'