In [6]:
from __future__ import division
from sklearn.cluster import KMeans
from numbers import Number
from pandas import DataFrame
import sys, codecs, numpy



In [7]:
class autovivify_list(dict):
  '''Dict whose missing keys are created on first access as empty lists.

  Intended as a pickle-friendly stand-in for collections.defaultdict(list).
  An empty instance can also appear on the left of `+` / `-` with a number,
  in which case it behaves like zero.
  '''

  def __missing__(self, key):
    '''Store a new empty list under `key` and hand that same list back'''
    fresh = []
    self[key] = fresh
    return fresh

  def __add__(self, x):
    '''Return x for `empty_instance + number`; raise ValueError otherwise'''
    # Only an empty mapping combined with a numeric operand is supported.
    if self or not isinstance(x, Number):
      raise ValueError
    return x

  def __sub__(self, x):
    '''Return the negation of x for `empty_instance - number`; raise ValueError otherwise'''
    if self or not isinstance(x, Number):
      raise ValueError
    return -1 * x

In [8]:
def build_word_vector_matrix(vector_file, n_words):
  '''Return the vectors and labels for the first n_words in vector file.

  Each non-blank line of `vector_file` (read as UTF-8) is split on
  whitespace: the first token is the label and the remaining tokens are
  parsed as floats to form that label's vector.

  Args:
    vector_file: path to the whitespace-delimited vector file.
    n_words: maximum number of rows to read. Zero or a negative value
      yields no rows; None reads the whole file.

  Returns:
    A tuple (matrix, labels_array): a numpy array with one row per word
    read, and the list of labels in file order. Fewer than n_words rows
    are returned when the file is shorter than that.

  Raises:
    ValueError: if a vector component cannot be parsed as a float.
  '''
  numpy_arrays = []
  labels_array = []
  with codecs.open(vector_file, 'r', 'utf-8') as f:
    for line in f:
      # Test the limit before consuming the line so exactly n_words rows
      # are returned (never n_words + 1), and n_words == 0 yields nothing.
      if n_words is not None and len(labels_array) >= n_words:
        break

      fields = line.split()
      if not fields:
        # Blank line (e.g. trailing newline at end of file): nothing to parse.
        continue

      labels_array.append(fields[0])
      numpy_arrays.append( numpy.array([float(v) for v in fields[1:]]) )

  return numpy.array( numpy_arrays ), labels_array

In [9]:
def find_word_clusters(labels_array, cluster_labels):
  '''Group words by the cluster each was assigned to.

  `cluster_labels[k]` is taken as the cluster id of `labels_array[k]`.
  Returns an autovivify_list mapping each cluster id to the list of its
  words, in the order they appear in `labels_array`.
  '''
  words_by_cluster = autovivify_list()
  for position, cluster_id in enumerate(cluster_labels):
    # Indexing a missing cluster id creates its empty list on the fly.
    members = words_by_cluster[ cluster_id ]
    members.append( labels_array[position] )
  return words_by_cluster

In [10]:
if __name__ == "__main__":
  # Cluster the leading word vectors of a GloVe-style file with k-means and
  # print the words that ended up in each cluster.
  input_vector_file = "glove.42B.300d.txt" # Vector file input (e.g. glove.6B.300d.txt)
  n_words = 10000 # Number of words to analyze
  reduction_factor = .1 # Clusters per word, in (0, 1]; .1 gives ~10 words per cluster
  random_seed = 0 # Fixed seed so re-running the cell reproduces the same clusters
  df, labels_array = build_word_vector_matrix(input_vector_file, n_words)

  # Size the model from the words actually loaded rather than the requested
  # n_words: a vector file shorter than n_words would otherwise ask KMeans
  # for more clusters than there are samples. Always keep at least one cluster.
  n_clusters = max( 1, int( len(labels_array) * reduction_factor ) ) # Number of clusters to make
  kmeans_model = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10,
                        random_state=random_seed)
  kmeans_model.fit(df)

  cluster_labels  = kmeans_model.labels_
  cluster_inertia   = kmeans_model.inertia_ # Not used below; kept available for inspection
  cluster_to_words  = find_word_clusters(labels_array, cluster_labels)

  # One printed list of words per cluster, separated by blank lines.
  for c in cluster_to_words:
    print(cluster_to_words[c])
    print("\n")

[',', '.', 'and', 'is', 'that', 'it', 'are', 'as', 'be', 'have', 'was', 'not', '...', 'we', 'but', 'they', 'has', 'so', 'there', ';', 'which', 'no', 'also', 'had', 'been', 'were', 'now', 'only', 'good', 'well', 'work', 'because', 'even', 'where', 'made', 'being', 'while', 'still', 'both', 'since', 'without', 'however', 'having', 'yet', 'already', 'called', 'although', 'fine']


['the', 'a', 'for', 'with', "'s", 'this', 'or', 'an', 'one', 'first', 'each', 'every', 'another', 'second', 'person', 'least', 'single']


['to', 'you', 'your', 'will', 'can', 'do', 'if', 'get', 'how', 'them', 'see', 'make', 'use', 'should', 'need', 'want', 'take', "'ll", 'please', 'must', 'give', 'let', 'add', 'check', 'try', 'needs']


['of', 'all', 'some', 'other', 'any', 'these', 'most', 'two', 'many', 'such', 'those', 'few', 'several', 'couple', 'certain', 'lots', 'plenty']


['in', 'on', 'by', 'from', 'their', 'our', 'its', 'used', 'part', 'same', 'own', 'set', 'show', 'full', 'number', 'between', 'name', 