## Import Packages

In [1]:
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 

## Reddit Data Loading

In [2]:
# read collected reddit dataset
df = pd.read_csv("raw_reddit_news_posts.csv")

## Data Preprocessing

In [3]:
# use reddit post title
df = df["title"]

In [4]:
# exclude stopwords
stop = set(stopwords.words("english"))

# exclude punctuation marks
punctuation = set(string.punctuation)

# perform lemmatization
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # cast text to string type and lowercase
    text = " ".join([i for i in str(text).lower().split() if i not in stop])
    
    text = "".join(c for c in text if c not in punctuation)
    
    text = " ".join(lemmatizer.lemmatize(w) for w in text.split())
    
    return text

# clean text
text = [clean_text(d).split() for d in df]

## Feature Engineering: TF-IDF

In [5]:
tf_idf_vec = TfidfVectorizer(tokenizer=lambda d : d, lowercase=False)

# count_vec = CountVectorizer(tokenizer=lambda d : d, lowercase=False)

In [6]:
tf_idf_out = tf_idf_vec.fit_transform(text)

# count_vec_out = count_vec.fit_transform(text)

In [7]:
# generate vocabulary
vocab = tf_idf_vec.get_feature_names_out()

## Topic Modeling: LDA

In [8]:
model = LatentDirichletAllocation(n_components=6, max_iter=10, random_state=20)

topics = model.fit_transform(tf_idf_out)

# get topic distribution
topic_dist = model.components_

## Topic Words

In [9]:
# number of topics
n = 5

for i, td in enumerate(topic_dist):
    sorted_topic_indices = np.argsort(td)
    
    topic_words = np.array(vocab)[sorted_topic_indices]
    
    topic_words = topic_words[:-n:-1]
    
    print(f"Topic {i+1}: {topic_words}")

Topic 1: ['u' 'china' 'say' 'russia']
Topic 2: ['coronavirus' 'case' 'covid19' 'new']
Topic 3: ['coronavirus' 'trump' 'test' 'china']
Topic 4: ['ukraine' 'u' 'russian' 'russia']
Topic 5: ['vaccine' 'covid19' 'covid' 'coronavirus']
Topic 6: ['china' 'u' 'climate' 'say']


## Document Topics

In [10]:
d_topic = model.transform(tf_idf_out)

for i in range(d_topic[:10, :].shape[0]):
    topic_d = d_topic[i].argmax()
    
    print(f"Document {i+1}: Topic: {topic_d}")

Document 1: Topic: 1
Document 2: Topic: 2
Document 3: Topic: 5
Document 4: Topic: 5
Document 5: Topic: 2
Document 6: Topic: 4
Document 7: Topic: 3
Document 8: Topic: 3
Document 9: Topic: 3
Document 10: Topic: 1
