### Text Document Clustering using K-means.

In [3]:
import collections, nltk, re
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.datasets import load_files
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

# Tokenization and Stemming of Words
def tokenize(text):
    tokens = word_tokenize(text)
    stems = [PorterStemmer().stem(item) for item in tokens]
    return stems

# Compute Confusion Matrix and Purity
def purity_score(y_true, y_pred):
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    #purity
    return np.sum(np.amax(contingency_matrix, axis = 1)) / np.sum(contingency_matrix)
    
# Import Dataset
documents = pd.read_csv("bbc-text.csv")

# Label Encoding
labelEncoder = preprocessing.LabelEncoder()
documents['category'] = labelEncoder.fit_transform(documents['category'])

df = pd.DataFrame(list(zip(documents['text'], documents['category']))
                  ,columns = ['text', 'label'])

# Creating TF/IDF Vectors from Text
tfidfvectorizer = TfidfVectorizer(tokenizer = tokenize, stop_words = 'english')
x = tfidfvectorizer.fit_transform(df.text.values)

# Clustering with Different K Values
for no_of_clusters in (3,5,7,9):
    # Build the K Means Model
    model = KMeans(n_clusters = no_of_clusters)

    # Train the Model 
    model.fit_transform(x)
    clusters = collections.defaultdict(list)
    for doc_id, label in enumerate(model.labels_):
        clusters[label].append(doc_id)
    purity = purity_score(y_true = df.label, y_pred = model.labels_)

    # Output: Performance Measures
    print("\nFor Number of Clusters = ", no_of_clusters)
    print("\n\tPurity: \t",purity)
    print("\tRecall: \t",metrics.recall_score(y_pred=model.labels_,
                                              y_true=df.label,average='macro'))
    print("\tPrecision: \t",metrics.precision_score(y_pred=model.labels_,
                                                    y_true=df.label,average='macro'))
    print("\tF-score: \t",metrics.f1_score(y_pred=model.labels_,
                                           y_true=df.label,average='macro'))


For Number of Clusters =  3

	Purity: 	 0.8943820224719101
	Recall: 	 0.22690365354775005
	Precision: 	 0.11377205343965438
	F-score: 	 0.151456872758047

For Number of Clusters =  5

	Purity: 	 0.8557303370786516
	Recall: 	 0.14892937200121675
	Precision: 	 0.206542005262187
	F-score: 	 0.16989975074055141

For Number of Clusters =  7

	Purity: 	 0.7051685393258427
	Recall: 	 0.1304491707522349
	Precision: 	 0.1469546456591608
	F-score: 	 0.1372836239997945

For Number of Clusters =  9

	Purity: 	 0.621123595505618
	Recall: 	 0.09267494898776624
	Precision: 	 0.2255795961904945
	F-score: 	 0.12925488103162971
