## Importing dataset and necessary libraries

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD 
from sklearn.pipeline import make_pipeline 
from sklearn.preprocessing import Normalizer    
from sklearn import metrics   
import numpy as np     

In [2]:
def load_dataset(a_set, cats):
    """Fetch one subset of the 20 Newsgroups corpus for the given categories.

    Parameters
    ----------
    a_set : str
        Passed as ``subset`` to ``fetch_20newsgroups`` (the notebook uses
        ``'train'`` and ``'test'``).
    cats : list of str
        Passed as ``categories`` to ``fetch_20newsgroups``.

    Returns
    -------
    The object returned by ``fetch_20newsgroups``; the notebook reads its
    ``data``, ``target`` and ``target_names`` attributes.
    """
    # Headers, footers and quoted replies are removed from every post,
    # and the posts are returned in shuffled order.
    return fetch_20newsgroups(
        subset=a_set,
        categories=cats,
        remove=('headers', 'footers', 'quotes'),
        shuffle=True,
    )

# The ten newsgroups used for the clustering experiment.
categories = [
    "comp.windows.x",
    "misc.forsale",
    "rec.autos",
    "rec.motorcycles",
    "rec.sport.baseball",
    "rec.sport.hockey",
    "sci.crypt",
    "sci.med",
    "sci.space",
    "talk.politics.mideast",
]

# Load both predefined splits; they are pooled together further down.
newsgroups_train = load_dataset(a_set='train', cats=categories)
newsgroups_test = load_dataset(a_set='test', cats=categories)

## Data Analysis

In [25]:
import random

# Fix the seed so the shuffle below yields the same order on every run.
random.seed(42)

# Pool the train and test splits into one list of (text, label) pairs.
all_news = list(zip(newsgroups_train.data, newsgroups_train.target))
all_news.extend(zip(newsgroups_test.data, newsgroups_test.target))

# Shuffle the pooled posts in place.
random.shuffle(all_news)

# Split the pairs back into parallel lists of contents and labels.
all_news_data = [pair[0] for pair in all_news]
all_news_labels = [pair[1] for pair in all_news]

# The number of distinct labels is the "true" number of clusters.
num_clusters = np.unique(all_news_labels).shape[0]

print("Data:")
print(str(len(all_news_data)) + " posts in "
      + str(num_clusters) + " categories\n")
print("Labels: ")
print(all_news_labels[:10])
print("Actual number of clusters: " + str(num_clusters))

Data:
9850 posts in 10 categories

Labels: 
[2, 6, 1, 9, 0, 5, 1, 2, 9, 0]
Actual number of clusters: 10


## Data Preprocessing

In [26]:


# Ignore every word that occurs in fewer than 2 documents or in more than
# 50% of the documents. Also remove the English stop words and apply
# inverse-document-frequency weights.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.5,
    use_idf=True,
)

def transform(data, vectorizer, dimensions, random_state=0):
    """Vectorize documents and project them onto a low-dimensional space.

    The documents are turned into a term matrix by fitting ``vectorizer`` on
    them; the matrix is then reduced with truncated SVD and every resulting
    row is rescaled by ``Normalizer``.

    Parameters
    ----------
    data
        Raw documents, handed unchanged to ``vectorizer.fit_transform``.
    vectorizer
        Object exposing ``fit_transform``. It is fitted as a side effect, so
        the caller's vectorizer holds the learned vocabulary afterwards.
    dimensions : int
        Number of SVD components to keep.
    random_state : int or None, default 0
        Seed forwarded to ``TruncatedSVD``. Without a fixed seed the
        randomized SVD solver can return different projections on each run,
        which makes the downstream clustering scores irreproducible. Pass
        ``None`` to restore the unseeded behaviour.

    Returns
    -------
    reduced_data
        The reduced, row-normalized matrix with ``dimensions`` columns.
    svd
        The fitted ``TruncatedSVD`` instance (needed later to map cluster
        centroids back to the term space).
    """
    trans_data = vectorizer.fit_transform(data)
    print("Transformed data contains: " + str(trans_data.shape[0]) +
          " with " + str(trans_data.shape[1]) + " features =>")

    # Reduce the dimensionality of the term matrix.
    svd = TruncatedSVD(n_components=dimensions, random_state=random_state)
    # Normalizer rescales each row to unit norm after the projection;
    # copy=False lets it work on the SVD output in place.
    pipe = make_pipeline(svd, Normalizer(copy=False))
    reduced_data = pipe.fit_transform(trans_data)

    return reduced_data, svd

# Vectorize all posts and keep 300 SVD components.
reduced_data, svd = transform(all_news_data, vectorizer, 300)
print("Reduced data contains: ", reduced_data.shape[0],
      " with ", reduced_data.shape[1], " features", sep="")

Transformed data contains: 9850 with 33976 features =>
Reduced data contains: 9850 with 300 features


## Model fitting

In [27]:
from sklearn.cluster import KMeans    

def cluster(data, num_clusters):
    """Fit a k-means model on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data
        Feature matrix handed to ``KMeans.fit``.
    num_clusters : int
        Number of clusters to form.

    Returns
    -------
    The fitted ``KMeans`` instance.
    """
    model = KMeans(
        n_clusters=num_clusters,
        init='k-means++',
        max_iter=100,
        random_state=0,  # fixed seed so the clustering is repeatable
    )
    # fit() returns the estimator itself, so this is the fitted model.
    return model.fit(data)

# Use the number of distinct labels found in the data-analysis cell instead of
# repeating the literal 10, so the model and the evaluation cannot drift apart.
km = cluster(reduced_data, num_clusters)

## Evaluation of the clusters

In [28]:
def evaluate(km, labels, svd, terms=None, top_n=50):
    """Print clustering scores and the highest-weighted terms of each cluster.

    Parameters
    ----------
    km
        Fitted clustering model exposing ``labels_`` and ``cluster_centers_``.
    labels
        Reference labels that ``km.labels_`` is scored against.
    svd
        Fitted decomposition whose ``inverse_transform`` maps the centroids
        from the reduced space back to the term space.
    terms : sequence of str, optional
        Vocabulary indexed by feature column. When omitted it is read from
        the notebook-level ``vectorizer``.
    top_n : int, default 50
        Number of terms printed per cluster.

    Returns
    -------
    None
        Everything is reported via ``print``.
    """
    print("Clustering report:\n")

    scorers = (
        ("Homogeneity", metrics.homogeneity_score),
        ("Completeness", metrics.completeness_score),
        ("V-measure", metrics.v_measure_score),
    )
    for title, scorer in scorers:
        print(f"* {title}: {scorer(labels, km.labels_)!s}")

    print("\nMost discriminative words per cluster:")
    # Map the centroids back to the original term space.
    original_space_centroids = svd.inverse_transform(km.cluster_centers_)
    # For every centroid, order the term indices by descending weight.
    order_centroids = original_space_centroids.argsort()[:, ::-1]

    if terms is None:
        # get_feature_names() was removed in scikit-learn 1.2; prefer its
        # replacement and only fall back on older installations.
        if hasattr(vectorizer, "get_feature_names_out"):
            terms = vectorizer.get_feature_names_out()
        else:
            terms = vectorizer.get_feature_names()

    # Iterate over the model's own centroids rather than a notebook global,
    # so the report always matches the number of clusters actually fitted.
    for i, ranked in enumerate(order_centroids):
        print("Cluster " + str(i) + ": ")
        cl_terms = "".join(terms[ind] + " " for ind in ranked[:top_n])
        print(cl_terms + "\n")
        
evaluate(km, all_news_labels, svd)

# List the real category names for comparison with the cluster keywords.
print("\nCategories:")
for category in newsgroups_train.target_names:
    print("*", category)

Clustering report:

* Homogeneity: 0.4356706249829362
* Completeness: 0.517119953001238
* V-measure: 0.47291394000106934

Most discriminative words per cluster:
Cluster 0: 
don like just know people space think time good use does ve right years make way long things problem thing going work really sure say new want better used did probably high said doctor ll lot need didn orbit cause nasa idea point little earth help launch medical actually day 

Cluster 1: 
bike ride bikes riding just like motorcycle dod don ve road miles good honda got rear helmet turn right know really advice thing dog make left rider new engine going way time little work countersteering need buying passenger gear ll want used sure insurance did think stop shaft fast thanks 

Cluster 2: 
thanks edu com does mail just new think list know like good got looking right did say ve heard ll cars want information tell post sure really let time address make article used engine dod use way need send probably read thought davi

