## Import Packages

In [1]:
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 

## Reddit Data Loading

In [2]:
# Read Kaggle Dataset
df = pd.read_csv("reddit_database.csv")

## Data Preprocessing

In [3]:
# Use reddit post title and content 
df = df["title"].append(df["post"])

In [4]:
# exclude stopwords
stop = set(stopwords.words("english"))

# exclude punctuation marks
punctuation = set(string.punctuation)

# perform lemmatization
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # cast text to string type and lowercase
    text = " ".join([i for i in str(text).lower().split() if i not in stop])
    
    text = "".join(c for c in text if c not in punctuation)
    
    text = " ".join(lemmatizer.lemmatize(w) for w in text.split())
    
    return text

# clean text
text = [clean_text(d).split() for d in df]

## Feature Engineering: TF-IDF

In [5]:
tf_idf_vec = TfidfVectorizer(tokenizer=lambda d : d, lowercase=False)

# count_vec = CountVectorizer(tokenizer=lambda d : d, lowercase=False)

In [6]:
tf_idf_out = tf_idf_vec.fit_transform(text)

# count_vec_out = count_vec.fit_transform(text)

In [7]:
# Generate Vocabulary
vocab = tf_idf_vec.get_feature_names_out()

## Topic Modeling: LDA

In [8]:
model = LatentDirichletAllocation(n_components=6, max_iter=10, random_state=20)

topics = model.fit_transform(tf_idf_out)

# Get Topic Distribution
topic_dist = model.components_

## Topic Words

In [9]:
# Number of Topics
n = 5

for i, td in enumerate(topic_dist):
    sorted_topic_indices = np.argsort(td)
    
    topic_words = np.array(vocab)[sorted_topic_indices]
    
    topic_words = topic_words[:-n:-1]
    
    print(f"Topic {i+1}: {topic_words}")

Topic 1: ['nan' 'tutorhelpdeskcom' 'thecodingpie' 'beautifulsoup4']
Topic 2: ['learning' 'machine' 'deep' 'd']
Topic 3: ['data' 'im' 'would' 'model']
Topic 4: ['artificial' 'intelligence' 'ai' 'detection']
Topic 5: ['regression' 'test' 'variable' 'question']
Topic 6: ['data' 'science' 'job' 'learning']


## Document Topics

In [16]:
d_topic = model.transform(tf_idf_out)

for i in range(d_topic[:10, :].shape[0]):
    topic_d = d_topic[i].argmax()
    
    print(f"Document {i+1}: Topic: {topic_d}")

Document 1: Topic: 2
Document 2: Topic: 2
Document 3: Topic: 5
Document 4: Topic: 2
Document 5: Topic: 2
Document 6: Topic: 2
Document 7: Topic: 2
Document 8: Topic: 5
Document 9: Topic: 2
Document 10: Topic: 2
