Loading the data from the corpus database and building the DataFrame

In [2]:
# panda and sqlite import
import sqlite3
import pandas as pd

# Path to the SQLite database holding the processed articles.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to a configurable data directory so the notebook runs elsewhere.
db_path = 'C:/Users/8897p/OneDrive/Desktop/NLP/Project/AXR/processed_articles.db'

# Open the connection to the database
conn = sqlite3.connect(db_path)
try:
    # Load the full `articles` table into a pandas DataFrame
    query = "SELECT * FROM articles"
    df_corpus = pd.read_sql_query(query, conn)
finally:
    # Always release the connection, even if the query fails
    conn.close()

Custom method to extract the word frequency vectors from highlights text.

Tuning is done on the maximum and minimum document frequency of a word (`max_df` and `min_df`), eliminating the outliers (words that occur too often or too rarely) from the topic modeling consideration.

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Custom function to vectorize the text into word counts
def preprocess_text(text_corpus, max_df=0.91, min_df=2, extra_stop_words=None):
    """Build a document-term count matrix from raw text.

    Parameters
    ----------
    text_corpus : iterable of str
        The documents to vectorize.
    max_df : float or int, default 0.91
        Passed to ``CountVectorizer``; drops words that occur in too many
        documents.
    min_df : float or int, default 2
        Passed to ``CountVectorizer``; drops words that occur in too few
        documents.
    extra_stop_words : iterable of str, optional
        Additional words to ignore on top of scikit-learn's built-in English
        stop words. When omitted, the notebook's default custom list is used.

    Returns
    -------
    tuple
        ``(dt_matrix, vectorizer)``: the matrix returned by
        ``vectorizer.fit_transform(text_corpus)`` and the fitted
        ``CountVectorizer`` holding the learned vocabulary.
    """
    if extra_stop_words is None:
        # Default additional words to ignore
        extra_stop_words = ['day', 'say', 'says', 'time', 'got', 'new', 'week', 'said']

    # Combine the extra words with the default English stop words
    english_stop_words = CountVectorizer(stop_words='english').get_stop_words()
    combined_stop_words = set(extra_stop_words).union(english_stop_words)

    # Create the vectorizer; sorting gives the stop-word parameter a stable order
    vectorizer = CountVectorizer(
        max_df=max_df,
        min_df=min_df,
        stop_words=sorted(combined_stop_words),
    )
    dt_matrix = vectorizer.fit_transform(text_corpus)
    return dt_matrix, vectorizer

# Build the document-term matrix from the 'highlights' column and keep the
# fitted vectorizer, which holds the learned vocabulary
dt_matrix, vectorizer = preprocess_text(df_corpus['highlights'])


Using the word vectors to fit the LDA algorithm and find the best words to model the topics

In [4]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of topics to extract; the fixed random state keeps results
# reproducible while tuning other parameters
num_topics = 5
lda = LatentDirichletAllocation(
    n_components=num_topics,
    random_state=123,
)

# Fit the model on the document-term matrix and retrieve the per-document
# topic distribution in a single step
lda_topic_dist = lda.fit_transform(dt_matrix)





Iterating through the topics and displaying them. Within each topic the words are sorted from lowest to highest weight, so a topic's top 5 words are the last 5 entries of its printed list.

In [5]:
# Custom function to display topics and their top words
def display_topics(model, feature_names, num_top_words=10):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_``, one row of word weights
        per topic.
    feature_names : sequence of str
        Vocabulary, indexed by the column positions of ``components_``.
    num_top_words : int, default 10
        How many words to print per topic. Values of 0 or less print an empty
        list; values larger than the vocabulary print every word.

    Notes
    -----
    Words are printed in ascending order of weight, so the strongest word of
    each topic is the last entry of its list.
    """
    for topic_idx, topic in enumerate(model.components_):
        ranked = topic.argsort()
        # Use an explicit start index: a plain `[-num_top_words:]` slice would
        # return the whole vocabulary when num_top_words is 0.
        start = max(len(ranked) - max(num_top_words, 0), 0)
        top_words = [feature_names[i] for i in ranked[start:]]
        print(f"Topic {topic_idx + 1}: ", top_words)

# Print the top words of every fitted LDA topic, using the vectorizer's vocabulary
display_topics(lda, vectorizer.get_feature_names_out())



Topic 1:  ['include', 'told', 'police', 'suicide', 'house', 'iraq', 'people', 'syria', 'president', 'mama']
Topic 2:  ['hits', 'history', 'world', 'official', 'america', 'united', 'children', 'died', 'police', 'people']
Topic 3:  ['christian', 'world', 'women', 'names', 'just', 'inland', 'baby', 'long', 'australia', 'year']
Topic 4:  ['president', 'fans', 'couple', 'rock', 'games', 'reported', 'shake', 'seen', 'white', 'young']
Topic 5:  ['players', 'today', 'mama', 'china', 'law', 'left', 'president', 'security', 'america', 'people']


Assigning topic numbers and custom labels for further processing.

In [6]:
# Assign each document the topic with the highest value in its topic distribution
df_corpus['assigned_topic'] = lda_topic_dist.argmax(axis=1)

# Custom labels keyed by 0-based topic index
# (the printed "Topic N" corresponds to key N - 1)
topic_labels = {
    0: "Middle East and Politics",
    1: "Historical Events and Police Reports",
    2: "Christianity and Culture",
    3: "Entertainment and Youth",
    4: "Sports and Geopolitics",
}

# Map the numeric topic index to its text label
df_corpus['topic_label'] = df_corpus['assigned_topic'].map(topic_labels)

# Series.map yields NaN for indices missing from topic_labels; fail loudly
# rather than silently storing unlabeled rows (e.g. after changing the number of topics)
assert df_corpus['topic_label'].notna().all(), "topic_labels does not cover every assigned topic"

# Sample head display
df_corpus[['highlights', 'assigned_topic', 'topic_label']].head()


Unnamed: 0,highlights,assigned_topic,topic_label
0,japan's chief cabinet secretary Yoshihide suga...,4,Sports and Geopolitics
1,syria official mama climbed to the top of the ...,0,Middle East and Politics
2,The employee in agency canvas City office is a...,0,Middle East and Politics
3,NEW A canada doctor says she was part of a tea...,4,Sports and Geopolitics
4,NEW 4 groups announce legal challenge in Phoen...,4,Sports and Geopolitics


In [9]:
import sqlite3

# Create (or open) the output database that stores the topic assignments
conn = sqlite3.connect('Modeler_output_topics.db')
try:
    # Write the DataFrame to the `topic_data` table, replacing any existing table
    df_corpus.to_sql('topic_data', conn, if_exists='replace', index=False)

    # Read a few rows back to validate the inserted data
    query = "SELECT highlights, assigned_topic, topic_label FROM topic_data LIMIT 5;"
    df_verification = pd.read_sql(query, conn)
    print(df_verification)
finally:
    # Always release the connection, even if the write or the read-back fails
    conn.close()


                                          highlights  assigned_topic  \
0  japan's chief cabinet secretary Yoshihide suga...               4   
1  syria official mama climbed to the top of the ...               0   
2  The employee in agency canvas City office is a...               0   
3  NEW A canada doctor says she was part of a tea...               4   
4  NEW 4 groups announce legal challenge in Phoen...               4   

                topic_label  
0    Sports and Geopolitics  
1  Middle East and Politics  
2  Middle East and Politics  
3    Sports and Geopolitics  
4    Sports and Geopolitics  


In [8]:
# Model evaluation and possible application of a hybrid approach