# LDA Topic Modelling

LDA (Latent Dirichlet Allocation) adalah model generatif yang digunakan dalam pemrosesan bahasa alami untuk menemukan topik-topik tersembunyi dalam kumpulan dokumen. Model ini mengasumsikan bahwa setiap dokumen adalah campuran dari beberapa topik, dan setiap kata dalam dokumen dihasilkan dari distribusi kata milik salah satu topik tersebut. LDA umumnya digunakan untuk analisis topik dalam koleksi dokumen besar.

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd

Membaca data term frequency

In [None]:
# Load the term-frequency CSV from the mounted Drive path and display it.
df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/ppw/TermFrequensi.csv')
df

Unnamed: 0,Dokumen,aalysis,aam,ab,abad,abadi,ability,abjad,absensi,absolut,...,zat,zcz,zf,zona,zone,zoning,zoom,zucara,zungu,Label
0,sistem informasi akademik siakad sistem inform...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,RPL
1,berjalannya koneksi jaringan komputer lancar g...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,RPL
2,web server perangkat lunak server berfungsi me...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,RPL
3,penjadwalan kuliah perguruan kompleks permasal...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,KK
4,seiring perkembangan teknologi didunia muncul ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,KK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
823,kurangnya pemahaman gejala penyakit saluran pe...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,KK
824,data set hilang utama studi bersifat substansi...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,KK
825,proses seleksi penerimaan tenaga kerja faktor ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,KK
826,sapi salah hewan ternak komoditi utama bahan p...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,KK


In [None]:
import numpy as np
kelas_dataset = df['Label']

# Encode the label column as integers: 'RPL' -> 0, every other label -> 1.
# The numeric check keeps this cell idempotent: without it, re-running the
# cell after 'Label' already holds 0/1 would compare integers against 'RPL'
# and silently turn every row into 1.
if pd.api.types.is_numeric_dtype(kelas_dataset):
    kelas_dataset_binary = kelas_dataset.tolist()
else:
    kelas_dataset_binary = [0 if kelas == 'RPL' else 1 for kelas in kelas_dataset]

# Write the encoded labels back into the DataFrame.
df['Label'] = kelas_dataset_binary


In [None]:
# Target vector: the 'Label' column of the DataFrame (shown as cell output).
y = df.loc[:, 'Label']
y

0      0
1      0
2      0
3      1
4      1
      ..
823    1
824    1
825    1
826    1
827    1
Name: Label, Length: 828, dtype: int64

Vektorisasi kolom Dokumen menjadi matriks term frequency

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the term-frequency matrix straight from the raw text in 'Dokumen'.
# The former `X = df.drop('Dokumen', axis=1)` assignment was dead code (it
# was overwritten two lines later) and has been removed.
# NOTE(review): stop_words='english' only filters English stop words, while
# the documents appear to be Indonesian - confirm this is intended.
vectorizer = CountVectorizer(stop_words='english')

# astype('U') coerces every entry to a unicode string before tokenisation.
X = vectorizer.fit_transform(df['Dokumen'].values.astype('U'))
X

<828x8818 sparse matrix of type '<class 'numpy.int64'>'
	with 49245 stored elements in Compressed Sparse Row format>

Split data sebelum melakukan LDA

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the vectorized documents as the test set (80% training).
# The fixed seed makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)



<166x8818 sparse matrix of type '<class 'numpy.int64'>'
	with 10051 stored elements in Compressed Sparse Row format>

Inisialisasi nilai K, Alpha, dan Beta, lalu melatih model LDA

In [None]:
k = 3        # number of topics
alpha = 0.1  # document-topic prior
beta = 0.2   # topic-word prior

# random_state pins the model's random initialisation so the learned topics
# (and everything derived from them below) are reproducible between runs,
# consistent with the seeded split and clustering steps in this notebook.
lda = LatentDirichletAllocation(
    n_components=k,
    doc_topic_prior=alpha,
    topic_word_prior=beta,
    random_state=42,
)

# Fit LDA on the vectorized training set and get its document-topic proportions.
proporsi_topik_dokumen_train = lda.fit_transform(X_train)

# Project the test documents into the topic space learned from the training set.
proporsi_topik_dokumen_test = lda.transform(X_test)


Hasil LDA

In [None]:
dokumen = df['Dokumen']
label= df['Label']

# The rows of proporsi_topik_dokumen_test follow the shuffled test split, not
# the first rows of df. Reuse the index carried by y_test so every row of
# topic proportions is paired with its own document text and label; building
# the frame with a default 0..n-1 index would attach the text and labels of
# the first n rows of df instead.
test_index = y_test.index

# Derive the column names from the array width instead of hard-coding three.
kolom_topik = [f'Topik {i + 1}' for i in range(proporsi_topik_dokumen_test.shape[1])]

output_proporsi_TD = pd.DataFrame(
    proporsi_topik_dokumen_test,
    columns=kolom_topik,
    index=test_index,
)
output_proporsi_TD.insert(0, 'Dokumen', dokumen.loc[test_index])
output_proporsi_TD.insert(len(output_proporsi_TD.columns), 'Label', y_test)
output_proporsi_TD

Unnamed: 0,Dokumen,Topik 1,Topik 2,Topik 3,Label
0,sistem informasi akademik siakad sistem inform...,0.076923,0.076923,0.846154,0
1,berjalannya koneksi jaringan komputer lancar g...,0.618778,0.000950,0.380272,0
2,web server perangkat lunak server berfungsi me...,0.090664,0.001346,0.907990,0
3,penjadwalan kuliah perguruan kompleks permasal...,0.306898,0.557601,0.135501,1
4,seiring perkembangan teknologi didunia muncul ...,0.258253,0.210719,0.531028,1
...,...,...,...,...,...
161,perkembangan zaman era globalisasi melepaskan ...,0.919521,0.001311,0.079168,1
162,visualisasi animasi meberikan informasi intera...,0.363820,0.025578,0.610602,0
163,tes prosedur penilaian tespilihan ganda tersed...,0.066254,0.000924,0.932823,1
164,ujian esai evaluasi pembelajaran bentuk esai b...,0.000978,0.019100,0.979923,1


In [None]:
# Number of columns in df, shown as a one-element tuple.
(len(df.columns),)

(9050,)

Output distribusi kata pada topik

In [None]:
# Topic-word matrix of the fitted LDA model: one row per topic, one column
# per vocabulary term. The values are unnormalised pseudo-counts, not
# probabilities; divide each row by its sum to get a word distribution.
distribusi_kata_topik = pd.DataFrame(data=lda.components_)
distribusi_kata_topik

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8808,8809,8810,8811,8812,8813,8814,8815,8816,8817
0,0.2,0.2,1.190242,0.2,0.2,0.2,3.2,1.199999,0.2,0.2,...,4.38156,1.2,1.2,0.2,0.2,3.2,4.2,3.191975,0.2,0.2
1,0.2,0.2,0.209758,1.2,0.2,0.2,0.2,4.199812,1.197985,2.19915,...,0.281929,0.2,0.2,0.2,0.206454,0.2,0.2,0.2,0.2,0.2
2,0.2,0.2,0.2,0.2,2.2,1.2,0.2,0.200189,0.202015,0.20085,...,2.936512,0.2,0.2,14.2,8.193546,0.2,0.2,0.208025,1.2,1.2


Cluster

In [None]:
from sklearn.cluster import KMeans

# Cluster the test documents in topic space with K-Means.
X_clustering = proporsi_topik_dokumen_test
n_clusters = 2

# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it together with random_state keeps the cluster
# assignment identical across library versions.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
clusters = kmeans.fit_predict(X_clustering)

# Attach the cluster id of each document to the LDA result table.
output_proporsi_TD['Cluster'] = clusters

# Show the final table.
print(output_proporsi_TD)

                                               Dokumen   Topik 1   Topik 2  \
0    sistem informasi akademik siakad sistem inform...  0.076923  0.076923   
1    berjalannya koneksi jaringan komputer lancar g...  0.618778  0.000950   
2    web server perangkat lunak server berfungsi me...  0.090664  0.001346   
3    penjadwalan kuliah perguruan kompleks permasal...  0.306898  0.557601   
4    seiring perkembangan teknologi didunia muncul ...  0.258253  0.210719   
..                                                 ...       ...       ...   
161  perkembangan zaman era globalisasi melepaskan ...  0.919521  0.001311   
162  visualisasi animasi meberikan informasi intera...  0.363820  0.025578   
163  tes prosedur penilaian tespilihan ganda tersed...  0.066254  0.000924   
164  ujian esai evaluasi pembelajaran bentuk esai b...  0.000978  0.019100   
165  kemampuan menulis alphanumerik bekal utama ana...  0.081908  0.186440   

      Topik 3  Label  Cluster  
0    0.846154      0        0  



In [None]:
# Menggabungkan DataFrame hasil LDA dan DataFrame hasil clustering
output_final_df = pd.concat([output_proporsi_TD], axis=1)

output_final_df

Unnamed: 0,Dokumen,Topik 1,Topik 2,Topik 3,Label,Cluster
0,sistem informasi akademik siakad sistem inform...,0.076923,0.076923,0.846154,0,0
1,berjalannya koneksi jaringan komputer lancar g...,0.618778,0.000950,0.380272,0,1
2,web server perangkat lunak server berfungsi me...,0.090664,0.001346,0.907990,0,0
3,penjadwalan kuliah perguruan kompleks permasal...,0.306898,0.557601,0.135501,1,1
4,seiring perkembangan teknologi didunia muncul ...,0.258253,0.210719,0.531028,1,0
...,...,...,...,...,...,...
161,perkembangan zaman era globalisasi melepaskan ...,0.919521,0.001311,0.079168,1,1
162,visualisasi animasi meberikan informasi intera...,0.363820,0.025578,0.610602,0,0
163,tes prosedur penilaian tespilihan ganda tersed...,0.066254,0.000924,0.932823,1,0
164,ujian esai evaluasi pembelajaran bentuk esai b...,0.000978,0.019100,0.979923,1,0


Modelling klasifikasi

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# For every model two scores are computed: the test-set accuracy (reported
# below) and the training-set accuracy (kept in acc* for comparison).
# NOTE(review): the classifiers are fitted on X_train / X_test (the term
# counts), not on the LDA topic proportions computed earlier - confirm that
# this is the intended feature set.

# Naive Bayes
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)
predictions = naive_bayes.predict(X_test)
accuracy = round(accuracy_score(y_test, predictions)*100,2)
accnb = round(naive_bayes.score(X_train,y_train)*100,2)

print("Akurasi Naive Bayes:", accuracy)

# KNN
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train,y_train)
predict = knn.predict(X_test)
accuracyknn = round(accuracy_score(y_test,predict)*100,2)
accknn = round(knn.score(X_train,y_train)*100,2)

# Report the test accuracy, like the other two models; the original printed
# accknn, which is the accuracy on the training data.
print("Akurasi KNN :", accuracyknn)

# Decision Tree
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)
predictions_dt = decision_tree.predict(X_test)
accuracy_dt = round(accuracy_score(y_test, predictions_dt) * 100, 2)
acc_dt = round(decision_tree.score(X_train, y_train) * 100, 2)

print("Akurasi Decision Tree:", accuracy_dt)

Akurasi Naive Bayes: 83.73
Akurasi KNN : 75.83
Akurasi Decision Tree: 70.48
