### Perform topic modeling using LSA, LDA. Finally you should obtain chart of coherence score by the number of topics.

In [15]:
import operator
from string import punctuation
from collections import Counter

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from gensim.matutils import Sparse2Corpus
from gensim import corpora

from pprint import pprint

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

from textblob import TextBlob
import warnings

warnings.filterwarnings("ignore")

#### Dataset from Internet

##### LSA

In [5]:
dataset = fetch_20newsgroups(
    shuffle=True, random_state=42, remove=("headers", "footers", "quotes")
)
documents = dataset.data
df = pd.DataFrame({"document": documents})
df = df.iloc[:5000]
df.head()

Unnamed: 0,document
0,I was wondering if anyone out there could enli...
1,A fair number of brave souls who upgraded thei...
2,"well folks, my mac plus finally gave up the gh..."
3,\nDo you have Weitek's address/phone number? ...
4,"From article <C5owCB.n3p@world.std.com>, by to..."


In [9]:
tokenized_doc = df["document"].str.replace("[^a-zA-Z#]", " ")
lemmatizer = WordNetLemmatizer()


def clean_doc(text):
    # remove small words
    text = " ".join([w.lower() for w in text.split() if len(w) > 3])

    # remove stopwords
    stopwords_set = set(stopwords.words("english"))
    stopwords_and_punctuation = list(stopwords_set) + list(punctuation)
    text = word_tokenize(text)
    text = " ".join([word for word in text if word not in stopwords_and_punctuation])

    # lemmatize
    text = word_tokenize(text)
    lemmatized_text = []
    for word in text:
        lemmatized_text.append(lemmatizer.lemmatize(word))
    text = " ".join(lemmatized_text)
    return text


tokenized_doc = tokenized_doc.apply(clean_doc)
tokenized_doc

0       wondering anyone could enlighten day 2-door sp...
1       fair number brave soul upgraded clock oscillat...
2       well folk plus finally gave ghost weekend star...
3       weitek 's address/phone number like informatio...
4       article c5owcb.n3p world.std.com tombaker worl...
                              ...                        
4995                               claim system objective
4996    phenylketonuria disease body process phenylala...
4997    reply zazen austin.ibm.com welbon many atheist...
4998                                                     
4999    waterski bike turn handlebar left lean right l...
Name: document, Length: 5000, dtype: object

In [10]:
tfidf = TfidfVectorizer(
    stop_words="english", max_features=1000, max_df=0.5, smooth_idf=True
)

matrix = tfidf.fit_transform(tokenized_doc)

X = matrix.toarray()

In [11]:
svd_model = TruncatedSVD(
    n_components=5, algorithm="randomized", n_iter=100, random_state=42
)
svd_model.fit(X)
print("The number of topics chosen are", len(svd_model.components_))

The number of topics chosen are 5


In [12]:
terms = tfidf.get_feature_names_out()
topics = []

for i, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key=lambda x: x[1], reverse=True)[:7]
    topics.append("Topic " + str(i) + ": ")
    for t in sorted_terms:
        topics.append(t[0])

final_topic_list = [topics[i : i + 8] for i in range(0, len(topics), 8)]

for x in final_topic_list:
    print(x)

['Topic 0: ', 'like', 'know', 'people', 'think', 'time', 'good', 'year']
['Topic 1: ', 'window', 'file', 'drive', 'thanks', 'card', 'program', 'problem']
['Topic 2: ', 'game', 'drive', 'team', 'player', 'year', 'play', 'season']
['Topic 3: ', 'drive', 'scsi', 'problem', 'people', 'hard', 'disk', 'christian']
['Topic 4: ', 'file', 'window', 'game', 'problem', 'think', 'team', 'program']


##### LDA

In [19]:
stopwords_set = set(stopwords.words("english"))
punctuation_set = set(punctuation)
lemmatizer = WordNetLemmatizer()


# Function to lemmatize and remove the stopwords
def clean_doc(doc):
    without_stopwords = " ".join(
        [i for i in doc.lower().split() if i not in stopwords_set]
    )
    without_puntuaction = "".join(
        ch for ch in without_stopwords if ch not in punctuation_set
    )
    normalized = " ".join(
        lemmatizer.lemmatize(word) for word in without_puntuaction.split()
    )
    return normalized


tokenized_doc = df["document"].str.replace("[^a-zA-Z#]", " ")
list_of_docs = tokenized_doc.tolist()
doc_clean = [clean_doc(doc).split() for doc in list_of_docs]

In [20]:
dictionary = corpora.Dictionary(doc_clean)

doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

ldamodel = LdaModel(
    corpus=doc_term_matrix, num_topics=5, id2word=dictionary, random_state=42, passes=30
)

pprint(ldamodel.print_topics())

[(0,
  '0.011*"armenian" + 0.006*"turkish" + 0.005*"jew" + 0.005*"israel" + '
  '0.004*"israeli" + 0.003*"turkey" + 0.003*"muslim" + 0.003*"russian" + '
  '0.003*"village" + 0.003*"arab"'),
 (1,
  '0.008*"would" + 0.008*"one" + 0.006*"people" + 0.005*"like" + 0.004*"think" '
  '+ 0.004*"know" + 0.004*"it" + 0.004*"get" + 0.004*"time" + 0.004*"well"'),
 (2,
  '0.027*"maxaxaxaxaxaxaxaxaxaxaxaxaxaxax" + 0.021*"x" + 0.007*"file" + '
  '0.006*"window" + 0.005*"use" + 0.004*"program" + 0.004*"image" + '
  '0.003*"system" + 0.003*"one" + 0.003*"widget"'),
 (3,
  '0.028*"1" + 0.014*"2" + 0.013*"0" + 0.009*"3" + 0.009*"game" + 0.008*"db" + '
  '0.006*"4" + 0.006*"team" + 0.006*"5" + 0.005*"play"'),
 (4,
  '0.006*"file" + 0.006*"system" + 0.006*"key" + 0.006*"space" + '
  '0.005*"information" + 0.004*"internet" + 0.004*"computer" + 0.004*"privacy" '
  '+ 0.004*"email" + 0.004*"anonymous"')]


##### Coherence score

In [21]:
# Compute Perplexity
perplexity_lda = ldamodel.log_perplexity(doc_term_matrix)
print("\nPerplexity: ", perplexity_lda)


# Compute Coherence Score
coherence_model_lda = CoherenceModel(
    model=ldamodel, texts=doc_clean, dictionary=dictionary, coherence="c_v"
)
coherence_lda = coherence_model_lda.get_coherence()
print("\nCoherence Score: ", coherence_lda)


Perplexity:  -8.90309896669351

Coherence Score:  0.6271698531660788


In [22]:
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
plot = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
plot

#### Local dataset
##### LSA

In [35]:
df = pd.read_csv("small_wikidump.csv")
df = df.sample(5000, random_state=42)
df.head()

Unnamed: 0,id,url,title,text
70985,835014,https://simple.wikipedia.org/wiki/Humraaz,Humraaz,Humraaz () is a 2002 Indian Hindi-language mus...
175490,674066,https://simple.wikipedia.org/wiki/Yehoshua%20G...,Yehoshua Glazer,Yehoshua Glazer (29 December 1927 - 29 Decembe...
158572,664314,https://simple.wikipedia.org/wiki/2018%20Leice...,2018 Leicester helicopter crash,"On 27 October 2018, an AgustaWestland AW169 he..."
82591,658224,https://simple.wikipedia.org/wiki/Randolph%20C...,"Randolph County, Illinois",Randolph County is a county in the U.S. state ...
134171,508164,https://simple.wikipedia.org/wiki/The%20Real%2...,The Real Adventures of Jonny Quest,The Real Adventures of Jonny Quest is an Ameri...


In [36]:
tokenized_doc = df["text"].str.replace("[^a-zA-Z#]", " ")
lemmatizer = WordNetLemmatizer()


def clean_doc(text):
    # remove small words
    text = " ".join([w.lower() for w in text.split() if len(w) > 3])

    # remove stopwords
    stopwords_set = set(stopwords.words("english"))
    stopwords_and_punctuation = list(stopwords_set) + list(punctuation)
    text = word_tokenize(text)
    text = " ".join([word for word in text if word not in stopwords_and_punctuation])

    # lemmatize
    text = word_tokenize(text)
    lemmatized_text = []
    for word in text:
        lemmatized_text.append(lemmatizer.lemmatize(word))
    text = " ".join(lemmatized_text)
    return text


tokenized_doc = tokenized_doc.apply(clean_doc)
tokenized_doc

70985     humraaz 2002 indian hindi-language musical rom...
175490    yehoshua glazer december 1927 december 2018 is...
158572    october 2018 agustawestland aw169 helicopter c...
82591     randolph county county u.s. state illinois 202...
134171    real adventure jonny quest american animated a...
                                ...                        
196290    edelweiss well-known europe mountain flower de...
208656    viva zalata 1976 egyptian western comedy movie...
7584      husavik town northern iceland population 2,237...
16636     international astronomical union iau internati...
22733     girl group music group group female singer nor...
Name: text, Length: 5000, dtype: object

In [37]:
tfidf = TfidfVectorizer(
    stop_words="english", max_features=1000, max_df=0.5, smooth_idf=True
)

matrix = tfidf.fit_transform(tokenized_doc)

X = matrix.toarray()

In [38]:
svd_model = TruncatedSVD(
    n_components=5, algorithm="randomized", n_iter=100, random_state=42
)
svd_model.fit(X)
print("The number of topics chosen are", len(svd_model.components_))

The number of topics chosen are 5


In [39]:
terms = tfidf.get_feature_names_out()
topics = []

for i, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key=lambda x: x[1], reverse=True)[:7]
    topics.append("Topic " + str(i) + ": ")
    for t in sorted_terms:
        topics.append(t[0])

final_topic_list = [topics[i : i + 8] for i in range(0, len(topics), 8)]

for x in final_topic_list:
    print(x)

['Topic 0: ', 'commune', 'france', 'department', 'region', 'county', 'city', 'north']
['Topic 1: ', 'movie', 'county', 'city', 'state', 'american', 'actor', 'united']
['Topic 2: ', 'movie', 'actor', 'american', 'directed', 'television', 'comedy', 'drama']
['Topic 3: ', 'municipality', 'district', 'canton', 'switzerland', 'website', 'germany', 'province']
['Topic 4: ', 'movie', 'county', 'directed', 'drama', 'comedy', 'town', 'municipality']


##### LDA

In [41]:
stopwords_set = set(stopwords.words("english"))
punctuation_set = set(punctuation)
lemmatizer = WordNetLemmatizer()


# Function to lemmatize and remove the stopwords
def clean_doc(doc):
    without_stopwords = " ".join(
        [i for i in doc.lower().split() if i not in stopwords_set]
    )
    without_puntuaction = "".join(
        ch for ch in without_stopwords if ch not in punctuation_set
    )
    normalized = " ".join(
        lemmatizer.lemmatize(word) for word in without_puntuaction.split()
    )
    return normalized


tokenized_doc = df["text"].str.replace("[^a-zA-Z#]", " ")
list_of_docs = tokenized_doc.tolist()
doc_clean = [clean_doc(doc).split() for doc in list_of_docs]

In [42]:
dictionary = corpora.Dictionary(doc_clean)

doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

ldamodel = LdaModel(
    corpus=doc_term_matrix, num_topics=5, id2word=dictionary, random_state=42, passes=30
)

pprint(ldamodel.print_topics())

[(0,
  '0.008*"county" + 0.007*"state" + 0.006*"people" + 0.006*"also" + '
  '0.005*"united" + 0.005*"city" + 0.005*"used" + 0.005*"one" + '
  '0.004*"reference" + 0.004*"many"'),
 (1,
  '0.012*"american" + 0.011*"movie" + 0.005*"actor" + 0.005*"death" + '
  '0.005*"reference" + 0.004*"born" + 0.004*"birth" + 0.004*"television" + '
  '0.004*"new" + 0.004*"people"'),
 (2,
  '0.016*"commune" + 0.013*"municipality" + 0.011*"reference" + '
  '0.010*"district" + 0.009*"department" + 0.008*"region" + 0.008*"france" + '
  '0.007*"website" + 0.006*"language" + 0.004*"north"'),
 (3,
  '0.068*"km" + 0.068*"alignright" + 0.042*"linear" + 0.042*"socorro" + '
  '0.036*"bgcolorfefefe" + 0.017*"bgcolore9e9e9" + 0.015*"2001" + 0.015*"1999" '
  '+ 0.015*"2000" + 0.013*"bgcolord6d6d6"'),
 (4,
  '0.009*"player" + 0.008*"reference" + 0.007*"team" + 0.007*"league" + '
  '0.006*"football" + 0.005*"people" + 0.005*"birth" + 0.005*"national" + '
  '0.005*"world" + 0.005*"played"')]


##### Coherence score

In [43]:
# Compute Perplexity
perplexity_lda = ldamodel.log_perplexity(doc_term_matrix)
print("\nPerplexity: ", perplexity_lda)


# Compute Coherence Score
coherence_model_lda = CoherenceModel(
    model=ldamodel, texts=doc_clean, dictionary=dictionary, coherence="c_v"
)
coherence_lda = coherence_model_lda.get_coherence()
print("\nCoherence Score: ", coherence_lda)


Perplexity:  -8.719808438408297

Coherence Score:  0.6213836427516055


In [44]:
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
plot = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
plot