In [1]:
from sklearn.datasets import fetch_20newsgroups

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import DistanceMetric
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score, v_measure_score, homogeneity_score

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import string
import re

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [4]:
# The three politics-related newsgroups used in this ("hard") experiment.
categories = [
    "talk.politics.guns",
    "talk.politics.mideast",
    "talk.politics.misc",
]

# Excel workbooks that receive the K-Means and hierarchical result tables.
kmeans_res_path = "kmeans_res_20News_hard.xlsx"
hierarchy_res_path = "hierarchy_res_20News_hard.xlsx"

In [5]:
# Shared TF-IDF vectorizer. main() re-fits it with fit_transform for every
# text variant; max_features=5000 is passed to limit the vocabulary size.
vectorizer = TfidfVectorizer(max_features=5000)

In [6]:
def preprocess_text(text):
    """Normalise a raw document into a space-separated string of content words.

    The text is tokenised with NLTK's ``word_tokenize``. Tokens that are not
    purely alphanumeric are dropped, the remaining ones are lower-cased, and
    English stop words are removed.

    Args:
        text: Raw document text.

    Returns:
        The surviving tokens joined by single spaces (``""`` if none survive).
    """
    raw_tokens = word_tokenize(text)
    english_stop_words = set(stopwords.words('english'))

    kept_words = []
    for raw_token in raw_tokens:
        # Punctuation and mixed tokens such as "don't" fail isalnum().
        if not raw_token.isalnum():
            continue
        lowered = raw_token.lower()
        # Stop words are matched after lower-casing.
        if lowered in english_stop_words:
            continue
        kept_words.append(lowered)

    return ' '.join(kept_words)

In [7]:
def filter_text_by_pos(text, pos_to_keep):
    """Keep only the words of ``text`` whose part-of-speech tag is wanted.

    The text is tokenised with ``word_tokenize`` and tagged with ``pos_tag``.

    Args:
        text: Text to filter.
        pos_to_keep: Container of POS tags (e.g. ``['NN', 'NNS']``); a token
            is kept when its tag is a member of this container.

    Returns:
        The kept tokens, in their original order, joined by single spaces.
    """
    kept_words = []
    for word, tag in pos_tag(word_tokenize(text)):
        if tag in pos_to_keep:
            kept_words.append(word)

    return ' '.join(kept_words)

In [8]:
def cluster_kmeans(matrix, metrics, true_labels, num_clusters=2, num_iterations=50):
    """Run K-Means with several seeds and summarise external clustering scores.

    One ``KMeans`` model is fitted on ``matrix`` per iteration, using the
    iteration index as ``random_state``. Every function in ``metrics`` is
    called as ``metric(true_labels, predicted_labels)`` and its values are
    collected under ``metric.__name__``.

    Args:
        matrix: Data passed unchanged to ``KMeans.fit_predict``.
        metrics: Iterable of scoring callables taking ``(true, predicted)``.
        true_labels: Ground-truth labels handed to every metric.
        num_clusters: Number of clusters requested from K-Means.
        num_iterations: Number of seeded runs (seeds ``0 .. num_iterations-1``).

    Returns:
        A text report with the max, min and mean score per metric. The same
        text is printed.

    Raises:
        ValueError: Raised by NumPy when ``num_iterations`` is 0 and
            ``metrics`` is non-empty (reduction over an empty score list).
    """
    scores = {metric.__name__: [] for metric in metrics}

    for seed in range(num_iterations):
        model = KMeans(n_clusters=num_clusters, random_state=seed, n_init=10)
        predicted_labels = model.fit_predict(matrix)

        for metric in metrics:
            scores[metric.__name__].append(metric(true_labels, predicted_labels))

    kmeans_res = ""
    for metric_name, values in scores.items():
        # Adjacent f-strings are used instead of backslash continuations:
        # a continuation inside a string literal also embeds the next line's
        # source indentation, which padded every Max/Min line with spaces.
        kmeans_res += (
            f"\n{metric_name} "
            f"\nMax: {np.max(values)} "
            f"\nMin: {np.min(values)} "
            f"\nAVG: {np.mean(values)} \n"
        )

    print(kmeans_res)
    return kmeans_res

In [9]:
def cluster_hierarchy(matrix, metrics, true_labels, num_clusters=2):
    """Run agglomerative clustering for several linkages and report scores.

    The input is interpreted in one of two ways:

    * Feature matrix: any input exposing ``toarray`` (sparse matrices), or any
      non-square input. Sparse inputs are converted with ``toarray()``,
      clustering uses the ``"euclidean"`` metric, and ``"ward"`` linkage is
      evaluated in addition to the others.
    * Precomputed distances: a square input without ``toarray``. Clustering
      uses ``metric="precomputed"`` with complete, average and single linkage.

    Args:
        matrix: Feature matrix or square pairwise-distance matrix; must
            expose ``shape``.
        metrics: Iterable of scoring callables taking ``(true, predicted)``.
        true_labels: Ground-truth labels handed to every metric.
        num_clusters: Number of clusters to cut the hierarchy into.

    Returns:
        A text report with one section per linkage listing
        ``<metric name>: <score>``. The same text is printed.
    """
    linkages = ["complete", "average", "single"]

    is_sparse = hasattr(matrix, "toarray")
    # A sparse input is never a distance matrix produced by this notebook, so
    # it is treated as features even when it happens to be square.
    if is_sparse or matrix.shape[0] != matrix.shape[1]:
        affinity = "euclidean"
        # Only densify when needed; dense arrays have no toarray().
        if is_sparse:
            matrix = matrix.toarray()
        linkages.append("ward")
    else:
        affinity = "precomputed"

    hierarchy_res = ""
    for linkage in linkages:
        hierarchy_res += f"\n{linkage}"

        agg_clustering = AgglomerativeClustering(n_clusters=num_clusters, metric=affinity, linkage=linkage)
        predicted_labels = agg_clustering.fit_predict(matrix)

        for metric in metrics:
            score = metric(true_labels, predicted_labels)
            hierarchy_res += f"\n{metric.__name__}: {score}"

        hierarchy_res += "\n"

    print(hierarchy_res)
    return hierarchy_res

In [10]:
def _tag_documents(texts):
    """Tokenise and POS-tag each text once; returns a list of (token, tag) lists."""
    return [pos_tag(word_tokenize(text)) for text in texts]


def _join_tokens_with_pos(tagged_tokens, pos_to_keep):
    """Join with single spaces the tokens whose POS tag is in ``pos_to_keep``."""
    return ' '.join(token for token, pos in tagged_tokens if pos in pos_to_keep)


def _build_clustering_input(text_vectors, distance):
    """Return the TF-IDF matrix itself ("none") or its pairwise distance matrix."""
    if distance == "none":
        return text_vectors
    if distance == "euclidean":
        return euclidean_distances(text_vectors)
    if distance == "jaccard":
        return DistanceMetric.get_metric(distance).pairwise(text_vectors)
    if distance == "cosine":
        return cosine_distances(text_vectors)
    # Fail loudly instead of silently reusing a matrix from a previous iteration.
    raise ValueError(f"Unsupported distance: {distance!r}")


def main(dataset):
    """Cluster a newsgroup dataset under several text variants and distances.

    Four variants of the corpus are built: all preprocessed words, nouns only,
    adjectives only, and nouns plus adjectives. Each variant is vectorised
    with the module-level ``vectorizer`` (re-fitted per variant). For every
    entry of ``distances`` the resulting matrix, or its pairwise distance
    matrix, is passed to ``cluster_kmeans`` and ``cluster_hierarchy``.

    Args:
        dataset: Object exposing ``data`` (iterable of texts), ``target``
            (true labels) and ``target_names`` (its length is used as the
            number of clusters).

    Returns:
        ``(kmeans_data, hierarchy_data)``: two DataFrames indexed by variant
        name with one column per distance, each cell holding the text report
        returned by the corresponding clustering function.

    Raises:
        ValueError: If ``distances`` contains a name with no implementation.
    """
    true_labels = dataset.target
    distances = ["none", "euclidean", "jaccard", "cosine"]
    metrics = [normalized_mutual_info_score, adjusted_rand_score, v_measure_score, homogeneity_score]
    num_clusters = len(dataset.target_names)

    kmeans_data = pd.DataFrame(columns=distances)
    hierarchy_data = pd.DataFrame(columns=distances)

    preprocessed_data = [preprocess_text(text) for text in dataset.data]
    print(preprocessed_data[0])

    # POS tagging is the expensive step, so every document is tagged once and
    # the same tags are filtered three ways. The results are identical to
    # tagging separately for each POS subset.
    tagged_data = _tag_documents(preprocessed_data)
    noun_tags = ['NN', 'NNS']
    adj_tags = ['JJ', 'JJR', 'JJS']

    noun_data = [_join_tokens_with_pos(tagged, noun_tags) for tagged in tagged_data]
    adj_data = [_join_tokens_with_pos(tagged, adj_tags) for tagged in tagged_data]
    noun_adj_data = [_join_tokens_with_pos(tagged, noun_tags + adj_tags) for tagged in tagged_data]

    list_of_data = {"ALL": preprocessed_data, "NOUNS": noun_data, "ADJ": adj_data, "NOUNS and ADJ": noun_adj_data}

    for name, data in list_of_data.items():
        text_vectors = vectorizer.fit_transform(data)

        for distance in distances:
            distance_matrix = _build_clustering_input(text_vectors, distance)

            kmeans_data.loc[name, distance] = cluster_kmeans(distance_matrix, metrics, true_labels, num_clusters=num_clusters)

            hierarchy_data.loc[name, distance] = cluster_hierarchy(distance_matrix, metrics, true_labels, num_clusters=num_clusters)

    return kmeans_data, hierarchy_data

In [11]:
# Load the training split of the selected newsgroups, with headers, footers
# and quoted replies stripped from every post.
newsgroups = fetch_20newsgroups(
    subset="train",
    categories=categories,
    remove=("headers", "footers", "quotes"),
)

In [12]:
# Run the whole experiment. main() returns the K-Means and hierarchical result
# tables; the clustering helpers also print every report as they go.
kmeans_data, hierarchy_data = main(newsgroups)

bit unfair call blame clinton administration alone initiative underway bush administration basically bipartisan effort establishment demopublicans republicrats bipartisan effort brought l scandal bcci etc

normalized_mutual_info_score 
Max: 0.2575767580781821                                     
Min: 0.07477041758813775                                     
AVG: 0.17853997716852646 

adjusted_rand_score 
Max: 0.11900313214863209                                     
Min: 0.013939403502231081                                     
AVG: 0.06466545561004997 

v_measure_score 
Max: 0.2575767580781821                                     
Min: 0.07477041758813775                                     
AVG: 0.17853997716852646 

homogeneity_score 
Max: 0.2065823530670431                                     
Min: 0.06727287868859523                                     
AVG: 0.1457907704522163 


complete
normalized_mutual_info_score: 0.015575683471545494
adjusted_rand_score: 0.01750161477345245
v_me

In [13]:
from IPython.display import display

# Centre the cell text and preserve the line breaks embedded in each report.
cell_properties = {
    'text-align': 'center',
    'white-space': 'pre-wrap',
}
styled_kmeans = kmeans_data.style.set_properties(**cell_properties)
display(styled_kmeans)

Unnamed: 0,none,euclidean,jaccard,cosine
ALL,normalized_mutual_info_score Max: 0.2575767580781821 Min: 0.07477041758813775 AVG: 0.17853997716852646 adjusted_rand_score Max: 0.11900313214863209 Min: 0.013939403502231081 AVG: 0.06466545561004997 v_measure_score Max: 0.2575767580781821 Min: 0.07477041758813775 AVG: 0.17853997716852646 homogeneity_score Max: 0.2065823530670431 Min: 0.06727287868859523 AVG: 0.1457907704522163,normalized_mutual_info_score Max: 0.008726519194393061 Min: 0.006924104886369169 AVG: 0.0077889499663855695 adjusted_rand_score Max: 0.004622426554444271 Min: 0.003031830803614357 AVG: 0.003624708951221679 v_measure_score Max: 0.008726519194393063 Min: 0.00692410488636917 AVG: 0.0077889499663855695 homogeneity_score Max: 0.007389679885176083 Min: 0.005880202385983358 AVG: 0.006603508214778622,normalized_mutual_info_score Max: 0.005864057672236101 Min: 0.004637807876769599 AVG: 0.005303272180510014 adjusted_rand_score Max: 0.0026209081768038688 Min: 0.001822985378228042 AVG: 0.0022924753098073213 v_measure_score Max: 0.005864057672236101 Min: 0.004637807876769598 AVG: 0.005303272180510016 homogeneity_score Max: 0.004534677917639503 Min: 0.0035720648358511394 AVG: 0.0040884570335681936,normalized_mutual_info_score Max: 0.0997083504325801 Min: 0.0662606208038536 AVG: 0.07232698689040756 adjusted_rand_score Max: 0.021892511071235726 Min: 0.007822388463516648 AVG: 0.010302343709197333 v_measure_score Max: 0.09970835043258008 Min: 0.06626062080385359 AVG: 0.07232698689040756 homogeneity_score Max: 0.08935009875123848 Min: 0.058529003167999444 AVG: 0.06402886077633346
NOUNS,normalized_mutual_info_score Max: 0.20698421110730272 Min: 0.0739563367594459 AVG: 0.10888355549145924 adjusted_rand_score Max: 0.08384912173679739 Min: 0.00925545366419512 AVG: 0.03262336955920671 v_measure_score Max: 0.2069842111073027 Min: 0.0739563367594459 AVG: 0.10888355549145924 homogeneity_score Max: 0.18768532299353816 Min: 0.06421897074613185 AVG: 0.09638288404317978,normalized_mutual_info_score Max: 0.012053930872211033 Min: 0.010450751953214982 AVG: 0.011161415185852182 adjusted_rand_score Max: 0.006420385443499765 Min: 0.005291733755045739 AVG: 0.00558960474621667 v_measure_score Max: 0.012053930872211033 Min: 0.010450751953214984 AVG: 0.011161415185852182 homogeneity_score Max: 0.01019411675303967 Min: 0.008859477732728037 AVG: 0.009429654048499728,normalized_mutual_info_score Max: 0.006832936351753703 Min: 0.0017598031189546015 AVG: 0.005111950791520465 adjusted_rand_score Max: 0.0023536697735660377 Min: -0.0012051061372905026 AVG: 0.0014195924654210965 v_measure_score Max: 0.006832936351753702 Min: 0.0017598031189546013 AVG: 0.005111950791520465 homogeneity_score Max: 0.004686120453616787 Min: 0.0013099019551942833 AVG: 0.00350593622539261,normalized_mutual_info_score Max: 0.17105084186635555 Min: 0.07983624267062173 AVG: 0.15879281071874005 adjusted_rand_score Max: 0.06774854189643939 Min: 0.01359386102736153 AVG: 0.05925955885854489 v_measure_score Max: 0.17105084186635555 Min: 0.07983624267062173 AVG: 0.15879281071874005 homogeneity_score Max: 0.15712449461285322 Min: 0.07060817228739939 AVG: 0.14530988777386267
ADJ,normalized_mutual_info_score Max: 0.20827438668026307 Min: 0.07249196893072975 AVG: 0.20082292673614355 adjusted_rand_score Max: 0.04841697036752408 Min: 0.01605596944113831 AVG: 0.04482069927705345 v_measure_score Max: 0.2082743866802631 Min: 0.07249196893072975 AVG: 0.20082292673614355 homogeneity_score Max: 0.1527364366372096 Min: 0.06342005864377581 AVG: 0.14673162129640538,normalized_mutual_info_score Max: 0.07691524500742701 Min: 0.0743475868000522 AVG: 0.07650444859714925 adjusted_rand_score Max: 0.006303828875079575 Min: 0.005585343199140241 AVG: 0.006188170231106085 v_measure_score Max: 0.07691524500742701 Min: 0.07434758680005221 AVG: 0.07650444859714925 homogeneity_score Max: 0.05437545718065263 Min: 0.052382391919701565 AVG: 0.054056388951750495,normalized_mutual_info_score Max: 0.0025216886768706036 Min: 0.0008629347977612776 AVG: 0.0010483028014809184 adjusted_rand_score Max: -0.00045658823404982543 Min: -0.0014145652164944697 AVG: -0.0006430584419605051 v_measure_score Max: 0.002521688676870604 Min: 0.0008629347977612776 AVG: 0.0010483028014809184 homogeneity_score Max: 0.0015824376961436637 Min: 0.0005972365054639917 AVG: 0.0007222983878504649,normalized_mutual_info_score Max: 0.19408107175874562 Min: 0.1923235182201654 AVG: 0.1932853060334692 adjusted_rand_score Max: 0.039856340728896676 Min: 0.038864772649483056 AVG: 0.039401090623676624 v_measure_score Max: 0.19408107175874562 Min: 0.19232351822016544 AVG: 0.1932853060334692 homogeneity_score Max: 0.14009646180975754 Min: 0.13854112273214678 AVG: 0.13939507270476809
NOUNS and ADJ,normalized_mutual_info_score Max: 0.26420571641861385 Min: 0.07968811522144838 AVG: 0.22080179131459976 adjusted_rand_score Max: 0.23509182745516963 Min: 0.01718840073508627 AVG: 0.0788333890598263 v_measure_score Max: 0.2642057164186138 Min: 0.07968811522144838 AVG: 0.2208017913145998 homogeneity_score Max: 0.23859881554014944 Min: 0.07174660009978513 AVG: 0.1744260300547539,normalized_mutual_info_score Max: 0.04684072952132915 Min: 0.03183125789656244 AVG: 0.03751071380413923 adjusted_rand_score Max: 0.03748816600882167 Min: 0.025187329254870123 AVG: 0.03107794134350395 v_measure_score Max: 0.046840729521329144 Min: 0.031831257896562444 AVG: 0.03751071380413923 homogeneity_score Max: 0.039026748739820545 Min: 0.02675945746762099 AVG: 0.03153447701152641,normalized_mutual_info_score Max: 0.005657845109119625 Min: 0.0008419897349122209 AVG: 0.003983523770173786 adjusted_rand_score Max: 0.0014548417623801507 Min: -0.0009226038049627591 AVG: 0.0008784748996677864 v_measure_score Max: 0.0056578451091196255 Min: 0.0008419897349122208 AVG: 0.003983523770173786 homogeneity_score Max: 0.003764323476013816 Min: 0.000540793581372391 AVG: 0.00270584748739923,normalized_mutual_info_score Max: 0.10686710826759717 Min: 0.07380760280418258 AVG: 0.07762244073397034 adjusted_rand_score Max: 0.029507935062356945 Min: 0.012355030237467032 AVG: 0.014725025552786824 v_measure_score Max: 0.10686710826759718 Min: 0.07380760280418257 AVG: 0.07762244073397034 homogeneity_score Max: 0.09576802736498978 Min: 0.06544368810808406 AVG: 0.06878505533888869


In [14]:
# Persist both result tables to their Excel workbooks (K-Means first).
for results_frame, output_path in (
    (kmeans_data, kmeans_res_path),
    (hierarchy_data, hierarchy_res_path),
):
    results_frame.to_excel(output_path)