In [61]:
from collections import Counter

# Toy corpus: one short sentence per document.
traindoc = [
    'Julia loves me a lot more than Mary loves me',
    'Julia likes me more than Mary loves me',
    'Julian likes basketball more than baseball',
]

# Show the raw term frequencies of each document, splitting on whitespace only
# (case-sensitive, no punctuation handling).
for doc in traindoc:
    # Counter tallies an iterable directly and keeps first-seen order,
    # so no manual "+= 1" loop is needed.
    tf = Counter(doc.split())
    print(tf.items())

dict_items([('Julia', 1), ('loves', 2), ('me', 2), ('a', 1), ('lot', 1), ('more', 1), ('than', 1), ('Mary', 1)])
dict_items([('Julia', 1), ('likes', 1), ('me', 2), ('more', 1), ('than', 1), ('Mary', 1), ('loves', 1)])
dict_items([('Julian', 1), ('likes', 1), ('basketball', 1), ('more', 1), ('than', 1), ('baseball', 1)])


From the scikit-learn documentation:
Text Analysis is a major application field for machine learning algorithms. However the raw data, a sequence of symbols 
cannot be fed directly to the algorithms themselves as most of them expect numerical feature vectors with a fixed size 
rather than the raw text documents with variable length.
We will use CountVectorizer to "convert text into a matrix of token counts":

In [37]:

# Import vectorizers to turn text into numeric
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
# CountVectorizer was already imported in the previous cell; the repeated import is harmless.
from sklearn.feature_extraction.text import CountVectorizer
# Instantiate the vectorizer with its default parameters.
vect = CountVectorizer()

In [39]:
# Learn the vocabulary (the set of distinct tokens) from the training documents.
vect.fit(traindoc)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [40]:
# List the learned vocabulary terms; their order matches the matrix columns.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer versions use get_feature_names_out() instead.
vect.get_feature_names()

['baseball',
 'basketball',
 'julia',
 'julian',
 'likes',
 'lot',
 'loves',
 'mary',
 'me',
 'more',
 'than']

In [41]:
# Transform the training documents into a sparse document-term matrix:
# one row per document, one column per vocabulary term.
train1 = vect.transform(traindoc)
train1

<3x11 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

From the scikit-learn documentation:

In this scheme, features and samples are defined as follows:
Each individual token occurrence frequency (normalized or not) is treated as a feature.
The vector of all the token frequencies for a given document is considered a multivariate sample.
A corpus of documents can thus be represented by a matrix with one row per document and one column per token (e.g. word) 
occurring in the corpus.
We call vectorization the general process of turning a collection of text documents into numerical feature vectors. 
This specific strategy (tokenization, counting and normalization) is called the Bag of Words or "Bag of n-grams" representation.
Documents are described by word occurrences while completely ignoring the relative position information of the words 
in the document.

In [42]:
# Convert the sparse matrix to a dense NumPy array so every count is visible.
train1.toarray()

array([[0, 0, 1, 0, 0, 1, 2, 1, 2, 1, 1],
       [0, 0, 1, 0, 1, 0, 1, 1, 2, 1, 1],
       [1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1]], dtype=int64)

In [44]:
# Examine the vocabulary and the document-term matrix together by labelling
# each column with its vocabulary term.
import pandas as pd
pd.DataFrame(train1.toarray(), columns=vect.get_feature_names())

Unnamed: 0,baseball,basketball,julia,julian,likes,lot,loves,mary,me,more,than
0,0,0,1,0,0,1,2,1,2,1,1
1,0,0,1,0,1,0,1,1,2,1,1
2,1,1,0,1,1,0,0,0,0,1,1


In [45]:
# Print the sparse matrix: each line shows a (row, column) coordinate followed by
# the stored count; zero entries are omitted.
print(train1)

  (0, 2)	1
  (0, 5)	1
  (0, 6)	2
  (0, 7)	1
  (0, 8)	2
  (0, 9)	1
  (0, 10)	1
  (1, 2)	1
  (1, 4)	1
  (1, 6)	1
  (1, 7)	1
  (1, 8)	2
  (1, 9)	1
  (1, 10)	1
  (2, 0)	1
  (2, 1)	1
  (2, 3)	1
  (2, 4)	1
  (2, 9)	1
  (2, 10)	1


In [46]:
# Instead of raw counts, TfidfVectorizer weights each term by its inverse document
# frequency and, with the default norm='l2', scales every row to unit length, which
# makes documents of different lengths comparable.
vect2 = TfidfVectorizer()

In [47]:
# Learn the vocabulary and the per-term IDF weights from the training documents.
vect2.fit(traindoc)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [49]:
# NOTE(review): repeats the fit from the previous cell on the same data; it looks
# redundant and could be removed.
vect2.fit(traindoc)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [51]:
# List the vocabulary learned by the TF-IDF vectorizer.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer versions use get_feature_names_out() instead.
vect2.get_feature_names()

['baseball',
 'basketball',
 'julia',
 'julian',
 'likes',
 'lot',
 'loves',
 'mary',
 'me',
 'more',
 'than']

In [53]:
# Transform the training documents into a sparse matrix of TF-IDF weights.
train2 = vect2.transform(traindoc)
train2

<3x11 sparse matrix of type '<class 'numpy.float64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [54]:
# Dense view of the TF-IDF weights (floats rather than integer counts).
train2.toarray()

array([[ 0.        ,  0.        ,  0.27804511,  0.        ,  0.        ,
         0.36559591,  0.55609022,  0.27804511,  0.55609022,  0.21592683,
         0.21592683],
       [ 0.        ,  0.        ,  0.32957953,  0.        ,  0.32957953,
         0.        ,  0.32957953,  0.32957953,  0.65915906,  0.25594791,
         0.25594791],
       [ 0.48359121,  0.48359121,  0.        ,  0.48359121,  0.36778358,
         0.        ,  0.        ,  0.        ,  0.        ,  0.28561676,
         0.28561676]])

In [55]:
# A single unseen document, used below to exercise the fitted vectorizer and the clustering.
testdat=['Manju likes basketball too']

In [56]:
# Encode the test document with the already-fitted CountVectorizer. Tokens that are
# not in the learned vocabulary ('manju', 'too') are ignored, so only 'basketball'
# and 'likes' are counted.
test1 = vect.transform(testdat)
test1.toarray()

array([[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [59]:
import numpy as np
from sklearn.cluster import KMeans

# Cluster the count-vectorized training documents into two groups.
# random_state pins the centroid initialisation so the labels are reproducible
# across runs; the cluster ids themselves (0 vs 1) remain arbitrary.
k_means = KMeans(n_clusters=2, random_state=0)
k_means.fit(train1)
print(train1.shape)
print(k_means.labels_)

# Assign the test document (encoded with the same CountVectorizer) to its
# nearest cluster centre.
predictor = k_means.predict(test1)
print(predictor)

(3, 11)
[1 1 0]
[0]


In [60]:
# Repeat the clustering on the TF-IDF features.
# random_state pins the centroid initialisation so the labels are reproducible
# across runs; the cluster ids themselves (0 vs 1) remain arbitrary.
k_means = KMeans(n_clusters=2, random_state=0)
# Fit on train2 (the TF-IDF matrix built from vect2); the previous name
# `simple_train2` is not defined anywhere in this notebook and fails on a fresh kernel.
k_means.fit(train2)
print(train2.shape)
print(k_means.labels_)

# The test document must be encoded with the same TF-IDF vectorizer the model was
# trained on; feeding raw counts (test1) would mix two different feature scales.
test2 = vect2.transform(testdat)
predictor = k_means.predict(test2)
print(predictor)

(3, 11)
[1 1 0]
[0]
