In [111]:
# import modules
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [112]:
# load data
df = pd.read_csv('./description.csv', encoding='latin1')

In [113]:
df.head()

Unnamed: 0,book_id,name,description
0,4833.0,The Glass Castle,"A tender, moving tale of unconditional love in..."
1,590.0,"Night (The Night Trilogy, #1)","Born into a Jewish ghetto in Hungary, as a chi..."
2,4264.0,"Angela's Ashes (Frank McCourt, #1)",Imbued on every page with Frank McCourt's asto...
3,3361.0,"Eat, Pray, Love","A celebrated writer's irresistible, candid, an..."
4,4535.0,Into Thin Air: A Personal Account of the Mount...,A bank of clouds was assembling on the not-so-...


In [114]:
# get documents to be clustered
documents = df['description']

In [115]:
documents

0      A tender, moving tale of unconditional love in...
1      Born into a Jewish ghetto in Hungary, as a chi...
2      Imbued on every page with Frank McCourt's asto...
3      A celebrated writer's irresistible, candid, an...
4      A bank of clouds was assembling on the not-so-...
                             ...                        
138    From Galileo to todays amateur astronomers, s...
139    Once you have had a wonderful dog, a life with...
140    From the wild salmon caught in the Yukon river...
141    This Edition of Donatelles text provides stud...
142    Chef AJ has a unique ability to create healthy...
Name: description, Length: 143, dtype: object

In [116]:
# drop all missing values
documents.dropna(how='all', inplace=True)

In [117]:
test_data = documents.tail()

In [118]:
train_data = documents.drop(test_data.index)

In [119]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(train_data)

In [120]:
terms = vectorizer.get_feature_names_out() # get terms

In [121]:
len(terms)

4063

In [122]:
k = 5

In [123]:
model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1) # create cluster model

In [124]:
model.fit(X)

In [125]:
order_centriods = model.cluster_centers_.argsort()[:, ::-1]

In [126]:
order_centriods

array([[1672,  449, 2471, ..., 2468, 2466,    0],
       [ 449, 2785, 1871, ..., 1988, 1987, 2031],
       [1394, 2761, 2966, ..., 2512, 2511, 2031],
       [2142, 1651,  596, ..., 2481, 2480,    0],
       [ 634,  636, 3485, ..., 1947, 1946, 4062]])

In [127]:
for i in range(len(order_centriods)):
    print("Cluster:", i)
    for j in order_centriods[i][:5]:
        print("\t", terms[j])
    print()

Cluster: 0
	 guide
	 book
	 need
	 issues
	 tracks

Cluster: 1
	 book
	 potato
	 includes
	 techniques
	 dyslexia

Cluster: 2
	 family
	 political
	 readers
	 extraordinary
	 blake

Cluster: 3
	 life
	 greatest
	 century
	 time
	 story

Cluster: 4
	 child
	 children
	 story
	 surgery
	 years



In [128]:
pred = model.predict((vectorizer.transform(test_data)))

In [129]:
pred

array([3, 3, 1, 0, 1], dtype=int32)