In [1]:
from sklearn.datasets import fetch_20newsgroups

import torch
from gensim import corpora, models
from gensim.utils import simple_preprocess

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score, v_measure_score, homogeneity_score

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [14]:
import numpy as np
import pandas as pd
import random

In [4]:
folder = "./20news_lda_lsi/"
kmeans_res_path = "kmeans_res_20News_"
hierarchy_res_path = "hierarchy_res_20News_"


categories_light = ["comp.graphics", "rec.autos", "sci.med",  "talk.politics.mideast"]
categories_hard = ["talk.politics.guns", "talk.politics.mideast",  "talk.politics.misc"]
categories = {"light": categories_light, "hard": categories_hard}

In [5]:
def preprocess_text(text):
    tokens = word_tokenize(text)
    
    tokens = [word.lower() for word in tokens if word.isalnum()]
    
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words ]
    
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

In [6]:
def filter_text_by_pos(text, pos_to_keep):
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)
    
    filtered_tokens = [token for token, pos in tagged_tokens if pos in pos_to_keep]
    
    return filtered_tokens

In [7]:
def cluster_kmeans(matrix, metrics, true_labels, num_clusters=2, num_iterations=50):
    scores = {}
    for metric in metrics:
        scores.update({metric.__name__: []})

    for i in range(num_iterations):
        clusters = KMeans(n_clusters=num_clusters, random_state=i, n_init=10)

        clusters.fit_predict(matrix)
        
        for metric in metrics:
            score = metric(true_labels, clusters.labels_)
            scores[metric.__name__].append(score)

    kmeans_res = ""
    for metric in scores:
        kmeans_res += f"\n{metric} \nMax: {np.max(scores[metric])} \
                                    \nMin: {np.min(scores[metric])} \
                                    \nAVG: {np.mean(scores[metric])} \n"
        
    print(kmeans_res)
    return kmeans_res

In [8]:
def cluster_hierarchy(matrix, metrics, true_labels, num_clusters=2):
    linkages = ["complete", "average", "single"]

    hierarchy_res = ""

    if matrix.shape[0] != matrix.shape[1]:
        affinity = "euclidean"
        # matrix = matrix.toarray()
        linkages.append("ward")
    else:
        affinity = "precomputed"

    for linkage in linkages:
        hierarchy_res += f"\n{linkage}"

        agg_clustering = AgglomerativeClustering(n_clusters=num_clusters, metric=affinity, linkage=linkage)

        agg_clustering.fit_predict(matrix)

        for metric in metrics:
            score = metric(true_labels, agg_clustering.labels_)
            hierarchy_res += f"\n{metric.__name__}: {score}"
        
        hierarchy_res += "\n"
    
    print(hierarchy_res)
    return hierarchy_res

In [20]:
def main(dataset):
    num_topics = 10
    true_labels = dataset.target
    distances = ["none", "euclidean", "cosine"]
    metrics = [normalized_mutual_info_score, adjusted_rand_score, v_measure_score, homogeneity_score]

    kmeans_data = pd.DataFrame(columns=distances)
    hierarchy_data = pd.DataFrame(columns=distances)

    preprocessed_data = [preprocess_text(text) for text in dataset.data]
    n = len(preprocessed_data)
    print(n)

    noun_data = [filter_text_by_pos(text, pos_to_keep=['NN', 'NNS']) for text in preprocessed_data]
    adj_data = [filter_text_by_pos(text, pos_to_keep=['JJ', 'JJR', 'JJS']) for text in preprocessed_data]
    noun_adj_data = [filter_text_by_pos(text, pos_to_keep=['NN', 'NNS', 'JJ', 'JJR', 'JJS']) for text in preprocessed_data]

    preprocessed_data = [text.split(' ') for text in preprocessed_data]
    list_of_data = {"ALL": preprocessed_data, "NOUNS": noun_data, "ADJ": adj_data, "NOUNS and ADJ": noun_adj_data}

    for name, data in list_of_data.items():
        print("start calculate " + name)
        text_vectors = []

        dictionary = corpora.Dictionary(data)

        bow_corpus = [dictionary.doc2bow(doc) for doc in data]

        model = models.LsiModel(bow_corpus, id2word=dictionary, num_topics=num_topics)

        for doc_bow in bow_corpus:
            document_topics = model[doc_bow]
            document_topic_vector = [float(topic_prob) for _, topic_prob in document_topics]
            if document_topic_vector == [] or len(document_topic_vector) < num_topics:
                document_topic_vector = [random.uniform(-0.00001, 0.00001) for _ in range(num_topics)]
            text_vectors.append(document_topic_vector)

        for distance in distances:
            print(distance)
            if distance == "euclidean":
                distance_matrix = euclidean_distances(text_vectors)
            elif distance == "cosine":
                distance_matrix = cosine_distances(text_vectors)
            elif distance == "none":
                distance_matrix = np.array(text_vectors)

            print(distance_matrix.shape)

            kmeans_data.loc[name, distance] = cluster_kmeans(distance_matrix, metrics, true_labels, num_clusters=len(dataset.target_names))

            hierarchy_data.loc[name, distance] = cluster_hierarchy(distance_matrix, metrics, true_labels, num_clusters=len(dataset.target_names))

    return kmeans_data, hierarchy_data

In [10]:
news_data = {}
for category in categories:
    newsgroups = fetch_20newsgroups(subset="train",
                                remove=("headers", "footers", "quotes"),
                                categories=categories[category])
    news_data.update({category: newsgroups})

In [11]:
name_data = [key for key in news_data][0]
name_data

'light'

In [21]:
for data in news_data:
    print(data)
    kmeans_data, hierarchy_data = main(news_data[data])
    
    print("Saved")
    kmeans_data.to_excel(folder + "lsi" + "/" + kmeans_res_path + data + ".xlsx")
    hierarchy_data.to_excel(folder + "lsi" + "/" + hierarchy_res_path + data + ".xlsx")

light
2336
start calculate ALL
none
(2336, 10)

normalized_mutual_info_score 
Max: 0.007666133223518858                                     
Min: 0.00681646869276016                                     
AVG: 0.007615153351673336 

adjusted_rand_score 
Max: 0.00011699278835083276                                     
Min: 9.21218674679546e-05                                     
AVG: 0.00011550053309786003 

v_measure_score 
Max: 0.007666133223518859                                     
Min: 0.006816468692760161                                     
AVG: 0.007615153351673336 

homogeneity_score 
Max: 0.003912910104408967                                     
Min: 0.0034726490801218887                                     
AVG: 0.0038864944429517424 


complete
normalized_mutual_info_score: 0.007666133223518858
adjusted_rand_score: 0.00011699278835083276
v_measure_score: 0.007666133223518859
homogeneity_score: 0.003912910104408967

average
normalized_mutual_info_score: 0.0033962876235603748
