# Topic Modeling with Latent Semantic Analysis (LDA)

## Import Library 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from warnings import filterwarnings
filterwarnings('ignore')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

sw_indo = stopwords.words('indonesian')

## Import Data

In [2]:
df = pd.read_csv("data/kompas.csv")
df.head()

Unnamed: 0,teks
0,Ginandjar Tetap Ditahan. Jaksa Agung Dilaporka...
1,Jakarta Dikangkangi Para Preman\nKALAU tak pun...
2,Penyimpangan di Setpres Seolah Terjadi Sekaran...
3,"Dibayarkan, Rapel Kenaikan Gaji Pegawai Pos\nK..."
4,"Stop Kekerasan, Elite agar Duduk Bersama\nSeju..."


## Cleaning

In [3]:
def cleansing(text):
    word_list = word_tokenize(text)
    word_list = [word for word in word_list if len(word) > 1 and word.isalpha()]
    word_list = [word for word in word_list if word not in punctuation]
    word_list = [word for word in word_list if word not in sw_indo]
    text = ' '.join(word_list)
    return text

In [4]:
df['teks'] = df.teks.apply(cleansing)

In [5]:
df.head()

Unnamed: 0,teks
0,Ginandjar Tetap Ditahan Jaksa Agung Dilaporkan...
1,Jakarta Dikangkangi Para Preman KALAU nyali ua...
2,Penyimpangan Setpres Seolah Terjadi Sekarang P...
3,Dibayarkan Rapel Kenaikan Gaji Pegawai Pos Ken...
4,Stop Kekerasan Elite Duduk Bersama Sejumlah in...


## Extract Features

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
bow = CountVectorizer(ngram_range=(1, 2), min_df=5)

In [8]:
bow_matrix = bow.fit_transform(df.teks)

# Topic Modeling

In [9]:
vocab = bow.get_feature_names()

## Latent Semantic Analysis (LSA)

In [10]:
from sklearn.decomposition import TruncatedSVD

In [11]:
lsa = TruncatedSVD(n_components=10, n_iter=20, random_state=42)

In [12]:
lsa_matrix = lsa.fit_transform(bow_matrix)

In [13]:
print(bow_matrix.shape)  # fitur 
print(lsa_matrix.shape)  # hasil reduksi 
print(lsa.components_.shape)  # topic 

(2008, 23113)
(2008, 10)
(10, 23113)


In [14]:
def get_topic(model):
    return [[vocab[idx] for idx in reversed(comp.argsort()[-6:]) if vocab[idx].isalnum()] for comp in model.components_]

In [15]:
get_topic(lsa)

[['presiden', 'dpr', 'indonesia', 'pemerintah', 'ketua', 'rp'],
 ['presiden', 'dpr', 'ketua', 'partai', 'mpr', 'tandjung'],
 ['air', 'banjir', 'warga', 'jakarta', 'jalan', 'rumah'],
 ['tandjung', 'rp', 'dana', 'bulog', 'hukum', 'akbar'],
 ['harga', 'presiden', 'beras', 'rp', 'bbm', 'kenaikan'],
 ['air', 'banjir', 'dpr', 'jakarta', 'bank', 'bppn'],
 ['bppn', 'pemerintah', 'dpr', 'anggota', 'pkps', 'ukm'],
 ['mpr', 'indonesia', 'konstitusi', 'uud', 'perubahan', 'komisi'],
 ['israel', 'palestina', 'dpr', 'arafat', 'pansus', 'as'],
 ['rupiah', 'massa', 'bunga', 'suku', 'partai']]