# Text clustering using TF-IDF vectorizer

## Step 1: Import the libraries

In [1]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter

## Step 2: Create the documents

In [2]:
dataset = ["I love playing football on the weekends", 
           "I enjoy hiking and camping in the mountains", 
           "I like to read books and watch movies", 
           "I prefer playing video games over sports", 
           "I love listening to music and going to concerts"]

## Step 3: Vectorize the dataset

In [4]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)

In [6]:
X

<5x27 sparse matrix of type '<class 'numpy.float64'>'
	with 33 stored elements in Compressed Sparse Row format>

## Step 4: Perform clustering

k = 2
km = KMeans(n_clusters=k)
km.fit(X)

## Predict the clusters for each document

In [7]:
y_pred = km.predict(X)

## Display the document and its predicted cluster in a table

In [8]:
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            1
I enjoy hiking and camping in the mountains                        1
I like to read books and watch movies                              0
I prefer playing video games over sports                           1
I love listening to music and going to concerts                    0


## Print top terms per cluster

In [9]:
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:,::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print("Cluster %d:" %i)
    for ind in order_centroids[i, :10]:
        print('%s' %terms[ind])
    print()


Top terms per cluster:
Cluster 0:
to
and
read
watch
movies
like
books
concerts
going
music

Cluster 1:
playing
the
weekends
on
football
video
sports
prefer
over
games



## Step 5: Evalute results

### Calculate purity

In [10]:
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.6
