Getting the data from corpus and building the dataframe

In [57]:
# panda and sqlite import
import sqlite3
import pandas as pd

# data base path
db_path = 'C:/Users/8897p/OneDrive/Desktop/NLP/Project/AXR/processed_articles.db'

# open the connection to the data base
conn = sqlite3.connect(db_path)

# create the pandas data frame with full table data
query = "SELECT * FROM articles"
df_corpus = pd.read_sql_query(query, conn)

# Close the connection after loading the data
conn.close()

Custom method to extract the word frequency vectors from highlights text.

Tunning done on how often the word occurs and how least it occurs, eliminating the outliers from the topic modeling consideration.

In [58]:
from sklearn.feature_extraction.text import CountVectorizer

# custom function to process the text and vectorize 
def preprocess_text(text_corpus):
    # Addition words to ignore
    custom_stop_words = ['day', 'say', 'says', 'time', 'got', 'new', 'week','said']  # Replace with the words you want to remove

    # Combining them with default English stop words
    combined_stop_words = set(custom_stop_words).union(set(CountVectorizer(stop_words='english').get_stop_words()))

    # Create the vercorizer
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=list(combined_stop_words)) # parameter tunning of max and minimum frequencies
    dt_matrix = vectorizer.fit_transform(text_corpus)
    return dt_matrix, vectorizer

# generating the document frequncy model and getting the learned vectorizer on vocabulary
dt_matrix, vectorizer = preprocess_text(df_corpus['highlights'])


Using the word vectors to implement the LDA algorithm to find the best words to model the topics

In [59]:
from sklearn.decomposition import LatentDirichletAllocation

# Creating the LDA object
num_topics = 4
lda = LatentDirichletAllocation(n_components=num_topics, random_state=123) # number of topics and random state set for easy tunning, same results on different parameters

# Fit the LDA model to the document-term matrix and retrieve the topic distribution
lda.fit(dt_matrix)
lda_topic_dist = lda.transform(dt_matrix)





Iterating throught the topics and displaying them. Sorted in as low to high value. Top 5 are the last 5 topic word distribution rows.

In [60]:
# Custom function to display topics and their top words
def display_topics(model, feature_names, num_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx + 1}: ", [feature_names[i] for i in topic.argsort()[-num_top_words:]])

# Display topics
display_topics(lda, vectorizer.get_feature_names_out())



Topic 1:  ['security', 'suicide', 'house', 'iraq', 'people', 'told', 'attack', 'syria', 'mama', 'president']
Topic 2:  ['hospital', 'america', 'killed', 'world', 'official', 'died', 'children', 'united', 'police', 'people']
Topic 3:  ['women', 'home', 'inland', 'china', 'long', 'baby', 'just', 'australia', 'people', 'year']
Topic 4:  ['rock', 'world', 'year', 'reported', 'people', 'games', 'shake', 'seen', 'young', 'white']


Assigning topic numbers and custom labels for furthur processing.

In [61]:
# Assigning a topic out of the 4 modeled
df_corpus['assigned_topic'] = lda_topic_dist.argmax(axis=1)

# Custom labels for the topic document probabilities
topic_labels = {
    0: "Middle East Affairs",
    1: "Safty of children",
    2: "Domestic and Social Issues in China and Australia",
    3: "Popular Culture and Key Events",
}

# Map for the numeric to text form of topics
df_corpus['topic_label'] = df_corpus['assigned_topic'].map(topic_labels)

# Sample head display
df_corpus[['highlights', 'assigned_topic', 'topic_label']].head()


Unnamed: 0,highlights,assigned_topic,topic_label
0,japan's chief cabinet secretary Yoshihide suga...,3,Popular Culture and Key Events
1,syria official mama climbed to the top of the ...,0,Middle East Affairs
2,The employee in agency canvas City office is a...,2,Domestic and Social Issues in China and Australia
3,NEW A canada doctor says she was part of a tea...,3,Popular Culture and Key Events
4,NEW 4 groups announce legal challenge in Phoen...,3,Popular Culture and Key Events


In [62]:
import sqlite3

# create the new Database with topics
conn = sqlite3.connect('Modeler_output_topics.db')

# Data frame to data base convertion
df_corpus.to_sql('topic_data', conn, if_exists='replace', index=False)

# Validation of the data inserted
query = "SELECT highlights, assigned_topic, topic_label FROM topic_data LIMIT 5;"
df_verification = pd.read_sql(query, conn)
print(df_verification)

# Close connection
conn.close()


                                          highlights  assigned_topic  \
0  japan's chief cabinet secretary Yoshihide suga...               3   
1  syria official mama climbed to the top of the ...               0   
2  The employee in agency canvas City office is a...               2   
3  NEW A canada doctor says she was part of a tea...               3   
4  NEW 4 groups announce legal challenge in Phoen...               3   

                                         topic_label  
0                     Popular Culture and Key Events  
1                                Middle East Affairs  
2  Domestic and Social Issues in China and Australia  
3                     Popular Culture and Key Events  
4                     Popular Culture and Key Events  


Perplexity and Coherence scores

In [63]:
import numpy as np

# Calculate log likelihood
log_likelihood = lda.score(dt_matrix)

# Calculate perplexity
perplexity = np.exp(-log_likelihood / dt_matrix.sum())
print(f'Perplexity: {perplexity}')

Perplexity: 1544.3355650432766


Below is the code to open the db, preprocess the highlights column. Creating an new column with preprocessed data for gensim's LDA model.

In [143]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer
from nltk import download

# Downloading the wordnet dictionary and stopwords corpus from nltk
download('stopwords')
download('wordnet')

# open the connection to the data base
conn = sqlite3.connect(db_path)

# Loading the data base of preprocessed text
query = "SELECT * FROM articles"
df_corpus = pd.read_sql_query(query, conn)

# Close the connection after loading the data
conn.close()

# create instances for stop_words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Custom method to process the text for gensim LDA model
def preprocess(text):
    
    tokens = simple_preprocess(text) # tokenzie
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words] # lemmatize and form a list of tokens
    return tokens

# Using the custom function on the data frame with clean text
df_corpus['processed_text'] = df_corpus['highlights'].apply(preprocess)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\8897p\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\8897p\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Creating the Dictonary as processed text field is passed to it. Corpus is the creation of the bag of words that will help the model to process the topics from the text provided.

In [144]:
# Creating the dictionary for gensim and the bag of words data corpus
dictionary_gensim = Dictionary(df_corpus['processed_text'])
corpus = [dictionary_gensim.doc2bow(text) for text in df_corpus['processed_text']]


# Training the lda model on the corpus and the created word_id dictionary from bag of words
lda_model_gensim = LdaModel(corpus=corpus, id2word=dictionary_gensim, num_topics=11, random_state=0, passes=12, alpha=0.5, eta=0.01)# hyper tunning done on number of topics, number of passes, alpha and eta values


Evaluation on the Gensim's LDA model. Calculating the negative perplexity and the coherence scores

In [145]:
# import for the coherence evaluation
from gensim.models import CoherenceModel

# Calculation of the coherence scores on hte gensim_lda model
coherence_model_lda = CoherenceModel(model=lda_model_gensim, texts=df_corpus['processed_text'], dictionary=dictionary_gensim, coherence='c_v')
coherence_score = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_score}')

# Calculating the Perplexity of the lda model gensim
perplexity = lda_model_gensim.log_perplexity(corpus)
print(f'Perplexity: {perplexity}')



Coherence Score: 0.4369186770548062
Perplexity: -9.58770659710414


Printing the Gensim, model topics

In [147]:
# Print topics in a cleaner format
for idex_topic, topic in lda_model_gensim.show_topics(formatted=False, num_words=10):
    print(f"Topic {idex_topic}: ", ", ".join([word for word, prob in topic]))


Topic 7:  say, mama, u, new, president, syria, iraq, si, drug, attack
Topic 10:  john, inland, world, plan, month, hit, australia, cup, first, player
Topic 0:  say, police, christian, monk, history, man, family, said, turkey, situation
Topic 2:  say, told, woman, could, young, got, school, group, hospital, presidential
Topic 9:  say, one, president, year, two, doctor, mama, video, city, last
Topic 4:  claim, say, also, one, since, case, could, police, people, life
Topic 3:  new, people, say, attack, killed, sanction, go, white, bank, u
Topic 8:  say, like, new, u, first, xu, china, people, bone, euro
Topic 5:  say, child, united, new, night, time, bone, service, goal, draw
Topic 1:  new, year, america, shake, besiktas, say, law, first, iran, gunner


In [148]:
import pandas as pd
import sqlite3

# Retrieve the topic distribution for each document
doc_topics_gensim = [lda_model_gensim.get_document_topics(bow) for bow in corpus]

# Extract the max probable topics and assign them to the documents
df_corpus['assigned_topic'] = [max(doc, key=lambda x: x[1])[0] for doc in doc_topics_gensim]

# These are the custom labels for the topics - will be assigned across the 200 documents
topic_labels = {
    0: "Religion and Cultural Affairs",
    1: "International Relations and Politics",
    2: "Social Issues and Health",
    3: "Conflict and Security",
    4: "Crime and Legal Proceedings",
    5: "Sports and Events",
    6: "Political Leadership and Governance",
    7: "Law Enforcement and Public Safety",
    8: "China and Global Affairs",
    9: "Politics and Society",
    10: "Sports and International Competitions"
}

# Numeric label map to the custom labels created and assigned above
df_corpus['topic_label'] = df_corpus['assigned_topic'].map(topic_labels)

# Show few topics as validation for the assignment
print(df_corpus[['highlights', 'assigned_topic', 'topic_label']].head())

# Removign the process_text columns from the data frame
df_corpus_filtered = df_corpus.drop(columns=['processed_text'], errors='ignore')

# Creating the data base and the table to store the processed data frame
conn = sqlite3.connect('Modeler_output_topics_gensim.db')
df_corpus_filtered.to_sql('topic_data_gensim', conn, if_exists='replace', index=False)

# Data base verification on the data insertion
query = "SELECT highlights, assigned_topic, topic_label FROM topic_data_gensim LIMIT 5;"
df_verification = pd.read_sql(query, conn)
print(df_verification)

# Close the database connection
conn.close()


                                          highlights  assigned_topic  \
0  japan's chief cabinet secretary Yoshihide suga...               7   
1  syria official mama climbed to the top of the ...               7   
2  The employee in agency canvas City office is a...               8   
3  NEW A canada doctor says she was part of a tea...               5   
4  NEW 4 groups announce legal challenge in Phoen...               1   

                            topic_label  
0     Law Enforcement and Public Safety  
1     Law Enforcement and Public Safety  
2              China and Global Affairs  
3                     Sports and Events  
4  International Relations and Politics  
                                          highlights  assigned_topic  \
0  japan's chief cabinet secretary Yoshihide suga...               7   
1  syria official mama climbed to the top of the ...               7   
2  The employee in agency canvas City office is a...               8   
3  NEW A canada doctor says

In [150]:
import sqlite3
import pandas as pd

# Connect to the SQLite database
conn = sqlite3.connect('Modeler_output_topics_gensim.db')

# Query to get the unique topics in the 'assigned_topic' column
query = "SELECT DISTINCT assigned_topic FROM topic_data_gensim;"
unique_topics = pd.read_sql(query, conn)

# Display the unique topics to verify all 11 are present
print("Unique topics in the database:")
print(unique_topics)

# Optionally, you can also check topic labels
query_labels = "SELECT DISTINCT topic_label FROM topic_data_gensim;"
unique_labels = pd.read_sql(query_labels, conn)

print("\nUnique topic labels in the database:")
print(unique_labels)

# Close the database connection
conn.close()


Unique topics in the database:
    assigned_topic
0                7
1                8
2                5
3                1
4                3
5               10
6                6
7                4
8                2
9                0
10               9

Unique topic labels in the database:
                              topic_label
0       Law Enforcement and Public Safety
1                China and Global Affairs
2                       Sports and Events
3    International Relations and Politics
4                   Conflict and Security
5   Sports and International Competitions
6     Political Leadership and Governance
7             Crime and Legal Proceedings
8                Social Issues and Health
9           Religion and Cultural Affairs
10                   Politics and Society
