In [1]:

# import required sklearn libs
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# import other required libs
import pandas as pd
import numpy as np

# string manipulation libs
import re
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

# viz libs
import matplotlib.pyplot as plt
import seaborn as sns

from utils import findOptimalClustersKMeans

def preprocess_text(text: str, remove_stopwords: bool) -> str:
    """Clean a raw document before vectorization.

    Steps, applied in this order:
    - remove links (every run of non-whitespace characters starting at "http")
    - replace every run of non-letter characters (digits, punctuation,
      whitespace) with a single space
    - optionally drop English stopwords (NLTK list, compared case-insensitively)
    - transform to lower case
    - strip leading/trailing whitespace

    Arguments:
        text (str): text to clean
        remove_stopwords (bool): remove stopwords or not
    Returns:
        str: cleaned text
    """
    # remove links
    text = re.sub(r"http\S+", "", text)
    # remove numbers and special chars
    text = re.sub("[^A-Za-z]+", " ", text)
    # remove stopwords
    if remove_stopwords:
        # 1. creates tokens
        tokens = nltk.word_tokenize(text)
        # 2. loads the stopword list ONCE per call and keeps it in a set;
        #    evaluating stopwords.words(...) inside the comprehension condition
        #    would reload and linearly scan the list for every single token
        english_stopwords = set(stopwords.words("english"))
        tokens = [w for w in tokens if w.lower() not in english_stopwords]
        # 3. joins all tokens again
        text = " ".join(tokens)
    # returns cleaned text
    return text.lower().strip()
  
def get_top_keywords(n_terms):
    """Print the highest-scoring TF-IDF terms of every cluster.

    Reads the module-level names ``X`` (the TF-IDF matrix), ``clusters``
    (used as the group key for the rows of ``X`` -- presumably one cluster
    label per document) and ``vectorizer``.  Nothing is returned; for each
    cluster a header line and one comma-separated line of terms are printed.

    Arguments:
        n_terms: number of terms to print per cluster.  Terms are listed in
            ascending order of mean TF-IDF score, so the strongest term
            comes last.
    """
    # Average the TF-IDF vectors of all documents that share a cluster label.
    cluster_means = pd.DataFrame(X.todense()).groupby(clusters).mean()
    # Column position -> term lookup from the fitted vectorizer.
    vocabulary = vectorizer.get_feature_names_out()
    for cluster_id, mean_scores in cluster_means.iterrows():
        # argsort is ascending, so the last n_terms positions are the top scores.
        top_positions = np.argsort(mean_scores)[-n_terms:]
        top_terms = [vocabulary[position] for position in top_positions]
        print('\nCluster {}'.format(cluster_id))
        print(','.join(top_terms))
            

# Newsgroups kept for the experiment (nine topics).
categories = [
    'misc.forsale',
    'rec.motorcycles',
    'rec.sport.baseball',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
]

# Training split only, with headers, footers and quoted replies removed.
dataset = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    remove=('headers', 'footers', 'quotes'),
)

# One row per post: the raw text plus its cleaned, stopword-free version.
df = pd.DataFrame(dataset.data, columns=["corpus"])
df['cleaned'] = df['corpus'].apply(preprocess_text, remove_stopwords=True)

# TF-IDF with sublinear term frequency; min_df / max_df filter out terms
# that are very rare or present in almost every document.
vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, max_df=0.95)
# Fit on the cleaned texts and keep the resulting document-term matrix in X.
X = vectorizer.fit_transform(df['cleaned'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jakne\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jakne\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# %pip (rather than !pip) installs into the environment of the running kernel;
# the version is pinned to the one this notebook was executed with.
%pip install gensim==4.3.2

Collecting gensim
  Downloading gensim-4.3.2-cp311-cp311-win_amd64.whl (24.0 MB)
     ---------------------------------------- 24.0/24.0 MB 1.0 MB/s eta 0:00:00
Installing collected packages: gensim
Successfully installed gensim-4.3.2



[notice] A new release of pip available: 22.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
# Cleaned, stopword-free documents; this is the input tokenized for gensim below.
corpus = df['cleaned']

In [11]:
import gensim
from typing import List

def tokenize_documents(documents: List[str]) -> List[List[str]]:
    """
    Split every document into tokens with gensim's ``simple_preprocess``.

    Args:
    documents (List[str]): The documents to tokenize, one string per document.

    Returns:
    List[List[str]]: One token list per input document, in input order.
    """
    tokenized: List[List[str]] = []
    for document in documents:
        tokenized.append(gensim.utils.simple_preprocess(document))
    return tokenized

# Assuming 'corpus' is a pandas series or a list of documents
tokenized_corpus = tokenize_documents(corpus)

# Transform to gensim dictionary (token <-> integer id mapping), then encode
# every document as a bag-of-words list of (token_id, count) pairs
dic = gensim.corpora.Dictionary(tokenized_corpus)
bow_corpus = [dic.doc2bow(doc) for doc in tokenized_corpus]

import pickle
# Persist the bag-of-words corpus and the dictionary for later reuse.
# The context manager guarantees the file is flushed and closed; passing a
# bare open(...) to pickle.dump would leave the handle open until it is
# garbage collected.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(bow_corpus, corpus_file)
dic.save('dictionary.gensim')

In [12]:
# Train an LDA topic model on the bag-of-words corpus.
# num_topics=9 matches the nine newsgroup categories selected above.
# random_state seeds the model's random initialisation; without it every
# run starts from a different random state and yields different topics.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=9,
    id2word=dic,
    passes=10,
    workers=2,
    random_state=42,
)
# Persist the trained model to disk.
lda_model.save('model4.gensim')

In [13]:
# Show each topic id together with its ten highest-weighted words.
topic_summaries = lda_model.print_topics(num_words=10)
for topic_id, weighted_words in topic_summaries:
    print('Topic: {} \nWords: {}'.format(topic_id, weighted_words))

Topic: 0 
Words: 0.008*"year" + 0.005*"would" + 0.004*"game" + 0.004*"one" + 0.004*"last" + 0.004*"think" + 0.004*"new" + 0.004*"good" + 0.004*"team" + 0.004*"get"
Topic: 1 
Words: 0.007*"would" + 0.004*"space" + 0.004*"earth" + 0.003*"also" + 0.003*"data" + 0.003*"system" + 0.003*"orbit" + 0.003*"center" + 0.002*"launch" + 0.002*"one"
Topic: 2 
Words: 0.014*"key" + 0.009*"would" + 0.008*"encryption" + 0.007*"chip" + 0.006*"clipper" + 0.006*"government" + 0.006*"use" + 0.005*"water" + 0.005*"keys" + 0.004*"one"
Topic: 3 
Words: 0.004*"gun" + 0.004*"one" + 0.003*"firearms" + 0.003*"much" + 0.003*"people" + 0.003*"well" + 0.003*"committee" + 0.003*"darren" + 0.003*"like" + 0.003*"think"
Topic: 4 
Words: 0.011*"file" + 0.010*"gun" + 0.007*"people" + 0.006*"would" + 0.004*"control" + 0.004*"one" + 0.004*"guns" + 0.004*"firearms" + 0.003*"states" + 0.003*"right"
Topic: 5 
Words: 0.019*"space" + 0.008*"nasa" + 0.007*"db" + 0.005*"dos" + 0.004*"edu" + 0.004*"one" + 0.003*"lunar" + 0.003*"info