# Topic Modelling

In [15]:
import gensim
import nltk
import gensim.corpora as corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import spacy
import re
import pyLDAvis as lv

# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('punkt')
# nltk.download('stopwords')
# !python -m spacy download en_core_web_sm

# for visualizing
# !pip install pyLDAvis

In [5]:
# defining the corpus
documents = [
    "The company's new marketing strategy led to a significant increase in sales."
    "Advancements in AI technology are revolutionizing the healthcare industry."
    "Remote work has become more common due to the global pandemic."
    "Sustainable energy solutions are critical for reducing carbon emissions."
    "The latest smartphone model features a highly responsive screen and a sleek design."
    "E-commerce platforms have seen a surge in user engagement during the holiday season."
    "Blockchain technology offers enhanced security for financial transactions."
    "Educational institutions are increasingly adopting online learning platforms."
    "Climate change poses a serious threat to global ecosystems and biodiversity."
    "Social media influencers play a significant role in modern digital marketing campaigns."
    ]

In [9]:
nlp = spacy.load('en_core_web_sm')

In [10]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return tokens

processed_docs = [preprocess(doc) for doc in documents]

In [12]:
id2words = corpora.Dictionary(processed_docs)
corpus = [id2words.doc2bow(text) for text in processed_docs]
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 2), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 2), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 2), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 2), (61, 1), (62, 1), (63, 1), (64, 1)]]


In [13]:
# building the model
lda_model = LdaModel(
    corpus = corpus,
    id2word=id2words,
    num_topics=2,
    random_state=42,
    update_every=1,
    chunksize=10,
    passes=10,
    alpha='auto',
    )

topics = lda_model.print_topics()
for topic in topics:
    print(topic)

(0, '0.016*"global" + 0.016*"significant" + 0.016*"marketing" + 0.016*"technology" + 0.015*"responsive" + 0.015*"seen" + 0.015*"due" + 0.015*"ai" + 0.015*"sleek" + 0.015*"medium"')
(1, '0.025*"technology" + 0.025*"marketing" + 0.025*"significant" + 0.025*"global" + 0.015*"critical" + 0.015*"work" + 0.015*"led" + 0.015*"engagement" + 0.015*"financial" + 0.015*"highly"')
