In [1]:
documents = ["This little kitty came to play when I was eating at a restaurant.",
             "Merley has the best squooshy kitten belly.",
             "Google Translate app is incredible.",
             "If you open 100 tab in google you get a smiley face.",
             "Best cat photo I've ever taken.",
             "Climbing ninja cat.",
             "Impressed with google map feedback.",
             "Key promoter extension for Google Chrome."]

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
from sklearn.cluster import KMeans

In [4]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

In [12]:
true_k = 3
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=3, n_init=1, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [13]:
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()

for i in range(true_k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :5]:
        print(' %s' % terms[ind])

Top terms per cluster:
Cluster 0:
 cat
 ninja
 climbing
 ve
 photo
Cluster 1:
 google
 feedback
 map
 app
 impressed
Cluster 2:
 kitten
 belly
 squooshy
 merley
 eating


In [14]:
print("Prediction")

Y = vectorizer.transform(["chrome browser to open."])
prediction = model.predict(Y)
print(prediction)

Prediction
[1]


In [15]:
Y = vectorizer.transform(["My cat is hungry."])
prediction = model.predict(Y)
print(prediction)

[0]


In [17]:
model.cluster_centers_

array([[0.        , 0.        , 0.        , 0.19966128, 0.        ,
        0.45456823, 0.        , 0.30415658, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.30415658, 0.        , 0.2382371 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.2382371 , 0.        , 0.2382371 ],
       [0.10756238, 0.13554052, 0.        , 0.        , 0.        ,
        0.        , 0.11915496, 0.        , 0.        , 0.11915496,
        0.10756238, 0.13554052, 0.31564425, 0.13554052, 0.13554052,
        0.11915496, 0.        , 0.        , 0.        , 0.13554052,
        0.        , 0.        , 0.10756238, 0.        , 0.        ,
        0.11915496, 0.        , 0.10756238, 0.        , 0.10756238,
        0.        , 0.13554052, 0.        ],
       [0.        , 0.        , 0.23057456, 0.19323947, 0.20412415,
        0.        , 0.    

In [19]:
print(X[0])

  (0, 18)	0.408248290463863
  (0, 17)	0.408248290463863
  (0, 4)	0.408248290463863
  (0, 24)	0.408248290463863
  (0, 8)	0.408248290463863
  (0, 26)	0.408248290463863


In [29]:
from gensim.models import Word2Vec

In [23]:
sentences = []

for d in documents:
    d = d.lower()
    d = d.replace('.', '')
    sentences.append(d.split(' '))

In [33]:
sentences

[['this',
  'little',
  'kitty',
  'came',
  'to',
  'play',
  'when',
  'i',
  'was',
  'eating',
  'at',
  'a',
  'restaurant'],
 ['merley', 'has', 'the', 'best', 'squooshy', 'kitten', 'belly'],
 ['google', 'translate', 'app', 'is', 'incredible'],
 ['if',
  'you',
  'open',
  '100',
  'tab',
  'in',
  'google',
  'you',
  'get',
  'a',
  'smiley',
  'face'],
 ['best', 'cat', 'photo', "i've", 'ever', 'taken'],
 ['climbing', 'ninja', 'cat'],
 ['impressed', 'with', 'google', 'map', 'feedback'],
 ['key', 'promoter', 'extension', 'for', 'google', 'chrome']]

In [46]:
from gensim.models import Word2Vec
model = Word2Vec(sentences, min_count=1, size=4, window=5)

In [47]:
model.most_similar('google')

  """Entry point for launching an IPython kernel.


[('best', 0.9389210343360901),
 ('i', 0.8891955614089966),
 ('promoter', 0.8649388551712036),
 ('cat', 0.8393896818161011),
 ('face', 0.8013988137245178),
 ('impressed', 0.7743403911590576),
 ('the', 0.7255948185920715),
 ('when', 0.7229304313659668),
 ('kitty', 0.6918233036994934),
 ('ever', 0.6468441486358643)]

In [48]:
model.wv['google']

array([-0.02819589, -0.05960035, -0.06836702, -0.08240619], dtype=float32)