Unsupervised learning

Topic Modeling dengan LDA (Latent Dirichlet Allocation)

In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Read the dataset and keep only the rows that actually have a statement
df = pd.read_csv("Combined Data.csv").dropna(subset=["statement"])

# Simple preprocessing
# The English stop-word collection is fetched once here; the previous version
# constructed a new CountVectorizer for every single token, which is
# needlessly slow. frozenset() keeps membership tests cheap regardless of the
# container type get_stop_words() returns.
_STOP_WORDS = frozenset(CountVectorizer(stop_words='english').get_stop_words())


def preprocess(text):
    """Normalize one raw statement into a cleaned, space-joined string.

    Steps:
      1. Replace every character that is not an ASCII letter with a space.
      2. Lowercase and split on whitespace.
      3. Drop English stop words and tokens of two characters or fewer.

    Args:
        text: The raw statement (must be a ``str``; ``re.sub`` raises
            ``TypeError`` otherwise).

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        nothing survives).
    """
    tokens = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()
    kept = [word for word in tokens if word not in _STOP_WORDS and len(word) > 2]
    return ' '.join(kept)

# Store the cleaned text next to the raw statement
df['clean_text'] = df['statement'].apply(preprocess)

# Vectorization: bag-of-words counts, with max_df / min_df pruning of the
# vocabulary and scikit-learn's built-in English stop-word list
vectorizer = CountVectorizer(max_df=0.9, min_df=10, stop_words='english')
doc_term_matrix = vectorizer.fit_transform(df['clean_text'])

# LDA with 8 topics; random_state is fixed so the run is reproducible
lda = LatentDirichletAllocation(n_components=8, random_state=42)
lda.fit(doc_term_matrix)

# Show the topics.
# The vocabulary array is loop-invariant, so fetch it once instead of
# rebuilding it for every printed word.
feature_names = vectorizer.get_feature_names_out()
for idx, topic in enumerate(lda.components_):
    print(f"Topik #{idx+1}:")
    # argsort() is ascending, so the last 10 indices are the 10 highest-weighted
    # terms, listed from weakest to strongest.
    print([feature_names[i] for i in topic.argsort()[-10:]])
    print()


Topik #1:
['know', 'pain', 'really', 'started', 'time', 'don', 'feel', 'like', 'just', 'anxiety']

Topik #2:
['working', 'college', 'year', 'money', 'don', 'just', 'school', 'time', 'work', 'job']

Topik #3:
['depression', 'body', 'eat', 'want', 'sleep', 'amp', 'just', 'feel', 'day', 'like']

Topik #4:
['diagnosed', 'know', 'people', 'health', 'therapy', 'mental', 'bipolar', 'anxiety', 'help', 'depression']

Topik #5:
['know', 'just', 'got', 'parents', 'did', 'time', 'year', 'family', 'years', 'life']

Topik #6:
['know', 'like', 'time', 'really', 'got', 'told', 'friend', 'said', 'did', 'just']

Topik #7:
['going', 'die', 'fucking', 'feel', 'like', 'know', 'anymore', 'life', 'want', 'just']

Topik #8:
['want', 'life', 'think', 'really', 'don', 'know', 'people', 'just', 'feel', 'like']



In [5]:
!pip install gensim




In [None]:
import pandas as pd
import re
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel
from nltk.corpus import stopwords
import nltk

# Download the NLTK stop-word corpus if it is not available yet
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

# 1. Load the data, discarding rows without a statement
df = pd.read_csv("Combined Data.csv").dropna(subset=["statement"])

# 2. Preprocessing
def preprocess(text):
    """Turn one raw statement into a list of cleaned tokens.

    Non-letter characters (symbols and digits) become spaces, the text is
    lowercased and split on whitespace, and any token that is a stop word
    (per the module-level ``stop_words`` set) or has two characters or fewer
    is discarded.

    Args:
        text: The raw statement string.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
    kept = []
    for token in letters_only.lower().split():
        if len(token) <= 2 or token in stop_words:
            continue
        kept.append(token)
    return kept

# 3. Tokenization: one token list per statement
tokenized_docs = [preprocess(statement) for statement in df['statement']]

# 4. Build the dictionary and the bag-of-words corpus
id2word = corpora.Dictionary(tokenized_docs)
corpus = list(map(id2word.doc2bow, tokenized_docs))

# 5. Train the LDA model
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=8,
    random_state=42,
    passes=20,
)

# 6. Show the topics (top 10 words each)
for topic_id, word_weights in lda_model.show_topics(formatted=False, num_words=10):
    top_words = [word for word, _ in word_weights]
    print(f"Topik #{topic_id+1}: {top_words}")
print()

# 7. Compute the coherence score (c_v)
coherence_model = CoherenceModel(
    model=lda_model,
    texts=tokenized_docs,
    dictionary=id2word,
    coherence='c_v',
)
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score:.4f}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Topik #1: ['want', 'life', 'day', 'nothing', 'hate', 'tired', 'anything', 'going', 'live', 'everything']
Topik #2: ['amp', 'experience', 'https', 'either', 'physical', 'com', 'gonna', 'losing', 'wanna', 'exhausted']
Topik #3: ['work', 'years', 'time', 'year', 'job', 'get', 'life', 'ago', 'school', 'back']
Topik #4: ['mom', 'put', 'got', 'said', 'dad', 'kill', 'house', 'energy', 'reading', 'death']
Topik #5: ['bipolar', 'take', 'fucking', 'manic', 'effects', 'inside', 'realized', 'lamictal', 'lithium', 'vent']
Topik #6: ['anxiety', 'day', 'get', 'also', 'got', 'feeling', 'stress', 'time', 'started', 'back']
Topik #7: ['help', 'depression', 'might', 'please', 'need', 'deal', 'guess', 'depressed', 'self', 'due']
Topik #8: ['like', 'feel', 'know', 'people', 'even', 'really', 'think', 'things', 'would', 'get']

Coherence Score: 0.3459


In [None]:
import pandas as pd
import re
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel
from gensim.models.phrases import Phrases, Phraser
from gensim.parsing.preprocessing import STOPWORDS

# Read the dataset, skipping rows whose statement is missing
df = pd.read_csv("Combined Data.csv").dropna(subset=["statement"])

# Preprocessing function
def preprocess(text):
    """Tokenize one raw statement for phrase detection and LDA.

    Every non-letter character is replaced by a space, the result is
    lowercased and split on whitespace, and tokens that appear in gensim's
    ``STOPWORDS`` or have two characters or fewer are removed.

    Args:
        text: The raw statement string.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    words = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()
    return list(filter(lambda w: len(w) > 2 and w not in STOPWORDS, words))

# Tokenize every statement
tokenized_docs = [preprocess(statement) for statement in df['statement']]

# Fit the bigram detector, then the trigram detector on the bigram-merged docs
bigram = Phrases(tokenized_docs, min_count=5, threshold=80)
trigram = Phrases(bigram[tokenized_docs], threshold=80)
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

# Apply the bigram model and then the trigram model to each document
data_words_trigrams = []
for doc in tokenized_docs:
    with_bigrams = bigram_mod[doc]
    data_words_trigrams.append(trigram_mod[with_bigrams])

# Build the dictionary and the bag-of-words corpus
id2word = corpora.Dictionary(data_words_trigrams)
corpus = list(map(id2word.doc2bow, data_words_trigrams))

# Train the LDA model
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=8,
    random_state=42,
    passes=20,
)

# Compute the coherence score (c_v)
coherence_model = CoherenceModel(
    model=lda_model,
    texts=data_words_trigrams,
    dictionary=id2word,
    coherence='c_v',
)
coherence_score = coherence_model.get_coherence()

# Show the topics (top 10 words each), followed by the coherence score
for topic_id, word_weights in lda_model.show_topics(formatted=False, num_words=10):
    top_words = [word for word, _ in word_weights]
    print(f"Topik #{topic_id+1}: {top_words}")
print(f"\nCoherence Score: {coherence_score:.4f}")


Topik #1: ['amp', 'saying', 'idk', 'health', 'energy', 'appointment', 'study', 'write', 'completely', 'suffering']
Topik #2: ['talk', 'school', 'friend', 'depressed', 'pretty', 'mom', 'shit', 'help', 'depression', 'parents']
Topik #3: ['therapy', 'wish', 'avpd', 'trying', 'deep', 'look', 'wanna', 'handle', 'cut', 'kid']
Topik #4: ['stress', 'sleep', 'recently', 'constantly', 'literally', 'night', 'high', 'super', 'completely', 'second']
Topik #5: ['anxiety', 'like', 'feeling', 'depression', 'feel', 'anxious', 'help', 'having', 'symptoms', 'bad']
Topik #6: ['like', 'feel', 'know', 'people', 'life', 'think', 'time', 'things', 'want', 'way']
Topik #7: ['work', 'got', 'time', 'job', 'year', 'ago', 'went', 'day', 'going', 'told']
Topik #8: ['want', 'anymore', 'life', 'live', 'tired', 'feel', 'like', 'pain', 'die', 'living']

Coherence Score: 0.3803
