In [1]:
from gensim import corpora
from gensim.models import LdaModel
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
import nltk

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Chamuditha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Chamuditha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Chamuditha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:

# Example document
document = """Sri Lanka President Anura Kumara Dissanayake has ordered the Secretary to the Ministry of Public Security to conduct a transparent and impartial investigation into the Easter Sunday bombing, a statement from his office said.

President Dissanayake appointed former police Deputy Inspector General Ravi Seneviratne, who oversaw Criminal Investigation Department probe in to the Easter Sunday bombing as the Secretary to the Public Security Ministry as one of the first acts after being elected.

There is a widespread belief in society that the Easter Sunday attacks may have been carried out to gain political mileage, President Dissanayake was quoted as saying when he visited St Sebastian’s Church in Katuwapitiya Sunday.

If hundreds of innocent lives were sacrificed for political purposes, it would be a profound tragedy, he said.

If politics in the country has reached such an extreme, the first priority must be to eliminate this dangerous situation.

Head of the then CID, Shani Abeyesekera who was conducting the investigation was removed from his post in 2019 when President Gotabaya Rajapaksa was elected and later arrested on what courts said was fabricated evidenced.

Related

Appeal court slams Sri Lanka police for framing top detective

Sri Lanka top detective Shani Abeysekera discharged from fabricated case

Abeysekera told court that the military intelligence had misled investigators who were going after the bombers, months before the suicide bombings took place.

Meanwhile President Dissanayake had said there were “growing suspicions that the government apparatus at the time may have been involved in the attacks.”

“If such allegations are true, he warned, the country would remain in a dangerously unstable and insecure state.

“Therefore, it is of utmost importance to uncover the truth behind these events, he emphasized.”

The head of the Sri Lanka’s State Intelligence Service Retired Major General Suresh Sallay was replaced this week, bringing the unit back under police control. The SIS had shifted to military control under Gotabaya Rajapaksa’s presidency,

President Dissanayake had met survivors of the blasts and their family members at the Church Sunday. (Colombo/Oct05/2024)'"""


In [4]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Chamuditha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
# Tokenize sentences
sentences = sent_tokenize(document)

In [6]:
# Preprocess sentences
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
processed_sentences = []

for sentence in sentences:
    words = sentence.lower().split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    processed_sentences.append(words)

In [7]:
# Create dictionary and corpus
dictionary = corpora.Dictionary(processed_sentences)
corpus = [dictionary.doc2bow(text) for text in processed_sentences]

In [8]:
# Apply LDA
lda_model = LdaModel(corpus, num_topics=3, id2word=dictionary, passes=15)
topics = lda_model.print_topics()

In [9]:
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.029*"dissanayake" + 0.029*"president" + 0.020*"church" + 0.020*"sunday." + 0.020*"easter" + 0.020*"sunday" + 0.012*"first" + 0.012*"gotabaya" + 0.012*"may" + 0.012*"political"
Topic 1: 0.020*"sri" + 0.020*"court" + 0.014*"police" + 0.014*"country" + 0.014*"intelligence" + 0.014*"said" + 0.014*"head" + 0.014*"detective" + 0.014*"lanka" + 0.014*"shani"
Topic 2: 0.028*"said." + 0.016*"investigation" + 0.016*"security" + 0.016*"ministry" + 0.016*"public" + 0.016*"secretary" + 0.016*"dissanayake" + 0.016*"lanka" + 0.016*"would" + 0.016*"sunday"


In [10]:
#check the performance matrix
#coherence score
from gensim.models import CoherenceModel

In [11]:
#create the coherence model using the LDA model to compute the coherence score
coherence_model_lda = CoherenceModel(model = lda_model, 
                                     texts = processed_sentences,
                                     dictionary = dictionary, 
                                     coherence = 'c_v') #a coherence measure

In [12]:
coherence_score = coherence_model_lda.get_coherence()
print("Coherence score of the LDA model : ", coherence_score)

Coherence score of the LDA model :  0.6579852075620177


In [13]:
import pickle

with open('../static/model/lda_small_DB.pickle', 'wb') as file:
    pickle.dump(lda_model, file)

In [14]:
with open('../static/model/dictionary_small_DB.pickle', 'wb') as file:
    pickle.dump(dictionary, file)