In [None]:
%%html
<!-- Loads an ES6 polyfill and MathJax 3 from public CDNs.
     The polyfill is taken from Cloudflare's cdnjs mirror: the polyfill.io domain
     changed ownership and was reported serving malicious code in 2024. -->
<script src="https://cdnjs.cloudflare.com/polyfill/v3/polyfill.min.js?features=es6"></script>
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>


# **Reduksi Dimensi (LDA)**

## **Apa itu LDA?**  
LDA merupakan sebuah model probabilitas generatif dari koleksi data diskrit seperti korpus teks.  

> $$P(z_d = k, w_d = i) = \frac{\gamma_k \cdot p(w_i | z_k)}{\sum_{k'=1}^K \gamma_{k'} \cdot p(w_i | z_{k'})} $$
  
- $P(z_d = k, w_d = i)$ adalah probabilitas bahwa dokumen $d$ memiliki label topik $k$ dan kata ke-$i$ muncul di dokumen tersebut
- $z_d$ adalah label topik dari dokumen $d$  
- $w_d$ adalah kata-kata yang muncul dalam dokumen $d$  
- $i$ adalah indeks dari kata $w_i$
- $K$ adalah jumlah topik yang diasumsikan
- $\gamma_k$ adalah probabilitas bahwa dokumen dialokasikan ke topik $k$
- $p(w_i \mid z_k)$ adalah probabilitas bahwa kata $w_i$ muncul dalam topik $k$

## **Tahapan Implementasi LDA**

### Import library yang diperlukan

In [None]:
import numpy as np
import sklearn
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import os

### Download/siapkan data

In [None]:
# Fetch the termFreq_Data file from Google Drive and store it under /content.
# Shareable link of the same file:
# https://drive.google.com/file/d/1NjjX22Qcrg4YPjvTm__NaZhDq4liNm6l/view?usp=drive_link
import gdown

nama_data = '/content/termFreq_Data'
gdown.download(
    'https://drive.google.com/uc?id=1NjjX22Qcrg4YPjvTm__NaZhDq4liNm6l',
    nama_data,
    quiet=False,
)

In [None]:
# Fetch the DataAsli file (read later as the original, labelled dataset) from Google Drive
# and store it under /content.
# Shareable link of this file (the previous comment pointed at the termFreq file id):
# https://drive.google.com/file/d/1-IdWAIZ16LN3AqRyoKhrv3Iojggt7hDf/view
import gdown

nama_data = '/content/DataAsli'
gdown.download('https://drive.google.com/uc?id=1-IdWAIZ16LN3AqRyoKhrv3Iojggt7hDf', nama_data, quiet=False)

### Tampilkan data

In [None]:
# Load the downloaded termFreq_Data file as CSV and display it.
# NOTE(review): every column of this frame is later passed to LDA as a feature. If the
# first column is a document index rather than a term count it should be dropped first
# (see the disabled cell below) — confirm against the file.
termFreq = pd.read_csv("/content/termFreq_Data")
termFreq

In [None]:
# Load and display the original dataset; its 'Label' column is used as the target below.
dataset = pd.read_csv("/content/DataAsli")
dataset

In [None]:
# (disabled) keep every column except the first, to retain only the words from the abstracts
# termFreq_proses = termFreq.iloc[:, 1:]
# termFreq_proses

In [None]:
# Take the 'Label' column of the original dataset as the classification target.
datalabel = dataset['Label']
datalabel

### **Proses Latent Dirichlet Allocation (LDA)**

In [None]:
# LDA hyper-parameters used by the next cell.
k = 3        # passed as n_components (number of topics)
alpha = 0.1  # passed as doc_topic_prior
beta = 0.2   # passed as topic_word_prior

In [None]:
# Build the LDA estimator from the hyper-parameters above. The fit is stochastic, so it is
# seeded (same seed as the train/test split) to make a Restart & Run All reproducible.
lda_model = LatentDirichletAllocation(n_components=k, doc_topic_prior=alpha, topic_word_prior=beta, random_state=0)

In [None]:
# Fit LDA on the term-frequency frame and get the topic proportions per document
# (fit_transform returns one row per input row and one column per topic).
proporsi_topik_dokumen = lda_model.fit_transform(termFreq)

In [None]:
# Wrap the document-topic matrix in a DataFrame with one 'Topik N' column per topic.
# The names are derived from the matrix width so they cannot fall out of sync with k.
proporsi_topik_dokumen_df = pd.DataFrame(
    proporsi_topik_dokumen,
    columns=[f'Topik {i}' for i in range(1, proporsi_topik_dokumen.shape[1] + 1)],
)
proporsi_topik_dokumen_df

In [None]:
# Put the labels next to the topic proportions. concat(axis=1) aligns rows on the index,
# so an index present in only one input produces NaN in the other input's columns.
# NOTE(review): assumes row i of termFreq and row i of dataset describe the same document — confirm.
df_final = pd.concat([proporsi_topik_dokumen_df,datalabel],axis=1)
df_final

In [None]:
# Drop rows that have no topic proportions (e.g. rows created by the index-aligned concat
# when the inputs differ in length), then display the cleaned frame.
# Rebinding instead of inplace=True keeps the cell idempotent on re-run.
df_final = df_final.dropna(subset=['Topik 1', 'Topik 2', 'Topik 3'])
df_final

In [None]:
# Word proportions per topic.
fitur = termFreq.columns.tolist()
# components_ holds unnormalised pseudo-counts; dividing each row by its sum turns every
# topic row into a probability distribution over the words.
bobot_topik = lda_model.components_
ProporsiKataTopik = bobot_topik / bobot_topik.sum(axis=1, keepdims=True)
ProporsiKataTopik_df = pd.DataFrame(ProporsiKataTopik, columns=fitur)
# One label per fitted topic, so the cell keeps working if the number of topics changes.
ProporsiKataTopik_df.insert(0, 'Topik', [f'Topik {i}' for i in range(1, len(ProporsiKataTopik) + 1)])
ProporsiKataTopik_df

In [None]:
#Metrics
from sklearn.metrics import make_scorer, accuracy_score,precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score ,precision_score,recall_score,f1_score
from sklearn.model_selection import KFold,train_test_split,cross_val_score

from sklearn.naive_bayes import GaussianNB

In [None]:
# Train/test split: the first three columns (the topic proportions) are the features and
# 'Label' is the target; 30% of the rows are held out, with a fixed seed.
X = df_final.iloc[:,:3]
y = df_final['Label']
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.3,random_state=0)
X

In [None]:
# Gaussian Naive Bayes on the 3-topic features.
nbT3 = GaussianNB()
nbT3.fit(X_train, y_train)
Y_prediction = nbT3.predict(X_test)
# Held-out accuracy in percent; computed here but not displayed by this cell.
accuracy_rf=round(accuracy_score(y_test,Y_prediction)* 100, 2)
# Accuracy on the training split in percent; this is the value the cell displays.
acc_nb3 = round(nbT3.score(X_train, y_train) * 100, 2)
acc_nb3


4 topik

In [None]:
# 4 topics: LDA on the term-frequency frame, then Gaussian Naive Bayes on the topic proportions.

# LDA hyper-parameters: number of topics and the two Dirichlet priors
k = 4
alpha = 0.1
beta = 0.2

# Seeded so that re-running the cell reproduces the same topics and scores.
lda_model = LatentDirichletAllocation(n_components=k, doc_topic_prior=alpha, topic_word_prior=beta, random_state=0)
# Topic proportions per document: one row per document, one column per topic
proporsi_topik_dokumen = lda_model.fit_transform(termFreq)

kolom_topik = [f'Topik {i}' for i in range(1, k + 1)]
proporsi_topik_dokumen_df = pd.DataFrame(proporsi_topik_dokumen, columns=kolom_topik)

# Labels are attached by index; rows left without topic proportions are dropped.
df_final = pd.concat([proporsi_topik_dokumen_df, datalabel], axis=1)
df_final = df_final.dropna(subset=kolom_topik)

# Train/test split (70/30, seeded)
X = df_final.iloc[:, :k]
y = df_final['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

nbT4 = GaussianNB()
nbT4.fit(X_train, y_train)
Y_prediction = nbT4.predict(X_test)
accuracy_rf = round(accuracy_score(y_test, Y_prediction) * 100, 2)  # held-out accuracy (%)
acc_nb4 = round(nbT4.score(X_train, y_train) * 100, 2)  # accuracy on the training split (%)
print(acc_nb4 )
accuracy_rf


In [None]:
# 6 topics: LDA on the term-frequency frame, then Gaussian Naive Bayes on the topic proportions.

# LDA hyper-parameters: number of topics and the two Dirichlet priors
k = 6
alpha = 0.1
beta = 0.2

# Seeded so that re-running the cell reproduces the same topics and scores.
lda_model = LatentDirichletAllocation(n_components=k, doc_topic_prior=alpha, topic_word_prior=beta, random_state=0)
# Topic proportions per document: one row per document, one column per topic
proporsi_topik_dokumen = lda_model.fit_transform(termFreq)

kolom_topik = [f'Topik {i}' for i in range(1, k + 1)]
proporsi_topik_dokumen_df = pd.DataFrame(proporsi_topik_dokumen, columns=kolom_topik)

# Labels are attached by index; rows left without topic proportions are dropped.
df_final = pd.concat([proporsi_topik_dokumen_df, datalabel], axis=1)
df_final = df_final.dropna(subset=kolom_topik)

# Train/test split (70/30, seeded)
X = df_final.iloc[:, :k]
y = df_final['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(X)

nbT6 = GaussianNB()
nbT6.fit(X_train, y_train)
Y_prediction = nbT6.predict(X_test)
accuracy_rf = round(accuracy_score(y_test, Y_prediction) * 100, 2)  # held-out accuracy (%)
acc_nb = round(nbT6.score(X_train, y_train) * 100, 2)  # accuracy on the training split (%)
acc_nb

In [None]:
# Fits another Gaussian Naive Bayes on the X_train/X_test split left by the previous cell.
# NOTE(review): no 7-topic LDA is run before this cell, so despite the name nbT7 this
# presumably repeats the 6-topic experiment — confirm intent.
nbT7 = GaussianNB()
nbT7.fit(X_train, y_train)
Y_prediction = nbT7.predict(X_test)
# Held-out accuracy in percent; computed but not displayed.
accuracy_rf=round(accuracy_score(y_test,Y_prediction)* 100, 2)
# Accuracy on the training split in percent; overwrites acc_nb from the previous cell.
acc_nb = round(nbT7.score(X_train, y_train) * 100, 2)
acc_nb

In [None]:
# TF-IDF: fetch the tfidf_Data file from Google Drive and store it under /content.
# Shareable link of the same file:
# https://drive.google.com/file/d/1BdWsXHxyJ7hP054ZKM0Cv3if3BE222MT/view?usp=sharing
import gdown

nama_data = '/content/tfidf_Data'
gdown.download(
    'https://drive.google.com/uc?id=1BdWsXHxyJ7hP054ZKM0Cv3if3BE222MT',
    nama_data,
    quiet=False,
)

In [None]:
# Load the downloaded tfidf_Data file as CSV and display it.
# NOTE(review): every column is passed to LDA below; confirm the file holds only term weights.
tfidf = pd.read_csv("/content/tfidf_Data")
tfidf

In [None]:
# ===================== tfidf, 3 topics =================
# LDA on the TF-IDF frame, then Gaussian Naive Bayes and KNN on the topic proportions.
from sklearn.neighbors import KNeighborsClassifier

# LDA hyper-parameters: number of topics and the two Dirichlet priors
k = 3
alpha = 0.1
beta = 0.2

# Seeded so that re-running the cell reproduces the same topics and scores.
lda_model = LatentDirichletAllocation(n_components=k, doc_topic_prior=alpha, topic_word_prior=beta, random_state=0)
# Topic proportions per document: one row per document, one column per topic
proporsi_topik_dokumen = lda_model.fit_transform(tfidf)

kolom_topik = [f'Topik {i}' for i in range(1, k + 1)]
proporsi_topik_dokumen_df = pd.DataFrame(proporsi_topik_dokumen, columns=kolom_topik)

# Labels are attached by index; rows left without topic proportions are dropped.
# NOTE(review): assumes row i of tfidf and row i of the original dataset are the same document — confirm.
df_final = pd.concat([proporsi_topik_dokumen_df, datalabel], axis=1)
df_final = df_final.dropna(subset=kolom_topik)

# Train/test split (70/30), seeded like the term-frequency experiments
X = df_final.iloc[:, :k]
y = df_final['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(X)

# Gaussian Naive Bayes.
# The name tfidf_nbT4 is kept for compatibility although this is the 3-topic run;
# the 4-topic cell rebinds it.
tfidf_nbT4 = GaussianNB()
tfidf_nbT4.fit(X_train, y_train)
Y_prediction = tfidf_nbT4.predict(X_test)
accuracy_rf = round(accuracy_score(y_test, Y_prediction) * 100, 2)  # held-out accuracy (%), not reported
acc_nb3 = round(tfidf_nbT4.score(X_train, y_train) * 100, 2)  # accuracy on the training split (%)

# KNN with 3 neighbours
knn3 = KNeighborsClassifier(n_neighbors=3)
knn3.fit(X_train, y_train)
Y_pred = knn3.predict(X_test)
accuracy_knn = round(accuracy_score(y_test, Y_pred) * 100, 2)  # held-out accuracy (%), not reported
acc_knn3 = round(knn3.score(X_train, y_train) * 100, 2)  # accuracy on the training split (%)

print(f'''
tfidf menggunakan 3 topik dan Nb akurasi = {acc_nb3}
tfidf menggunakan 3 topik dan KNN akurasi = {acc_knn3}
''')

In [None]:
# Count the remaining missing values per column of df_final (as left by the previous cell).
df_final.isna().sum()

In [None]:
# ===================== tfidf, 4 topics =================
# LDA on the TF-IDF frame, then Gaussian Naive Bayes and KNN on the topic proportions.
from sklearn.neighbors import KNeighborsClassifier

# LDA hyper-parameters: number of topics and the two Dirichlet priors
k = 4
alpha = 0.1
beta = 0.2

# Seeded so that re-running the cell reproduces the same topics and scores.
lda_model = LatentDirichletAllocation(n_components=k, doc_topic_prior=alpha, topic_word_prior=beta, random_state=0)
# Topic proportions per document: one row per document, one column per topic
proporsi_topik_dokumen = lda_model.fit_transform(tfidf)

kolom_topik = [f'Topik {i}' for i in range(1, k + 1)]
proporsi_topik_dokumen_df = pd.DataFrame(proporsi_topik_dokumen, columns=kolom_topik)

# Labels are attached by index; rows left without topic proportions are dropped.
# NOTE(review): assumes row i of tfidf and row i of the original dataset are the same document — confirm.
df_final = pd.concat([proporsi_topik_dokumen_df, datalabel], axis=1)
df_final = df_final.dropna(subset=kolom_topik)

# Train/test split (70/30), seeded like the term-frequency experiments
X = df_final.iloc[:, :k]
y = df_final['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(X)

# Gaussian Naive Bayes
tfidf_nbT4 = GaussianNB()
tfidf_nbT4.fit(X_train, y_train)
Y_prediction = tfidf_nbT4.predict(X_test)
accuracy_rf = round(accuracy_score(y_test, Y_prediction) * 100, 2)  # held-out accuracy (%), not reported
acc_nb4 = round(tfidf_nbT4.score(X_train, y_train) * 100, 2)  # accuracy on the training split (%)

# KNN with 3 neighbours
knn4 = KNeighborsClassifier(n_neighbors=3)
knn4.fit(X_train, y_train)
Y_pred = knn4.predict(X_test)
accuracy_knn4 = round(accuracy_score(y_test, Y_pred) * 100, 2)  # held-out accuracy (%), not reported
acc_knn4 = round(knn4.score(X_train, y_train) * 100, 2)  # accuracy on the training split (%)

print(f'''
TFIDF menggunakan 4 topik dan Nb akurasi = {acc_nb4}
TFIDF menggunakan 4 topik dan KNN akurasi = {acc_knn4}
''')


In [None]:
# ===================== tfidf, 5 topics =================
# LDA on the TF-IDF frame, then Gaussian Naive Bayes and KNN on the topic proportions.
from sklearn.neighbors import KNeighborsClassifier

# LDA hyper-parameters: number of topics and the two Dirichlet priors
k = 5
alpha = 0.1
beta = 0.2

# Seeded so that re-running the cell reproduces the same topics and scores.
lda_model = LatentDirichletAllocation(n_components=k, doc_topic_prior=alpha, topic_word_prior=beta, random_state=0)
# Topic proportions per document: one row per document, one column per topic
proporsi_topik_dokumen = lda_model.fit_transform(tfidf)

kolom_topik = [f'Topik {i}' for i in range(1, k + 1)]
proporsi_topik_dokumen_df = pd.DataFrame(proporsi_topik_dokumen, columns=kolom_topik)

# Labels are attached by index; rows left without topic proportions are dropped.
# NOTE(review): assumes row i of tfidf and row i of the original dataset are the same document — confirm.
df_final = pd.concat([proporsi_topik_dokumen_df, datalabel], axis=1)
df_final = df_final.dropna(subset=kolom_topik)

# Train/test split (70/30), seeded like the term-frequency experiments
X = df_final.iloc[:, :k]
y = df_final['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(X)

# Gaussian Naive Bayes
tfidf_nbT5 = GaussianNB()
tfidf_nbT5.fit(X_train, y_train)
Y_prediction = tfidf_nbT5.predict(X_test)
accuracy_rf = round(accuracy_score(y_test, Y_prediction) * 100, 2)  # held-out accuracy (%), not reported
acc_nb5 = round(tfidf_nbT5.score(X_train, y_train) * 100, 2)  # accuracy on the training split (%)

# KNN with 3 neighbours
knn5 = KNeighborsClassifier(n_neighbors=3)
knn5.fit(X_train, y_train)
Y_pred = knn5.predict(X_test)
accuracy_knn5 = round(accuracy_score(y_test, Y_pred) * 100, 2)  # held-out accuracy (%), not reported
acc_knn5 = round(knn5.score(X_train, y_train) * 100, 2)  # accuracy on the training split (%)

# Report this run's Naive Bayes score (acc_nb5); the message previously printed acc_nb4.
print(f'''
TFIDF menggunakan 5 topik dan Nb akurasi = {acc_nb5}
TFIDF menggunakan 5 topik dan KNN akurasi = {acc_knn5}
''')


In [None]:
# ===================== tfidf, 6 topics =================
# LDA on the TF-IDF frame, then Gaussian Naive Bayes and KNN on the topic proportions.
from sklearn.neighbors import KNeighborsClassifier

# LDA hyper-parameters: number of topics and the two Dirichlet priors
k = 6
alpha = 0.1
beta = 0.2

# Seeded so that re-running the cell reproduces the same topics and scores.
lda_model = LatentDirichletAllocation(n_components=k, doc_topic_prior=alpha, topic_word_prior=beta, random_state=0)
# Topic proportions per document: one row per document, one column per topic
proporsi_topik_dokumen = lda_model.fit_transform(tfidf)

kolom_topik = [f'Topik {i}' for i in range(1, k + 1)]
proporsi_topik_dokumen_df = pd.DataFrame(proporsi_topik_dokumen, columns=kolom_topik)

# Labels are attached by index; rows left without topic proportions are dropped.
# NOTE(review): assumes row i of tfidf and row i of the original dataset are the same document — confirm.
df_final = pd.concat([proporsi_topik_dokumen_df, datalabel], axis=1)
df_final = df_final.dropna(subset=kolom_topik)

# Train/test split (70/30), seeded like the term-frequency experiments
X = df_final.iloc[:, :k]
y = df_final['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(X)

# Gaussian Naive Bayes
tfidf_nbT6 = GaussianNB()
tfidf_nbT6.fit(X_train, y_train)
Y_prediction = tfidf_nbT6.predict(X_test)
accuracy_rf = round(accuracy_score(y_test, Y_prediction) * 100, 2)  # held-out accuracy (%), not reported
acc_nb6 = round(tfidf_nbT6.score(X_train, y_train) * 100, 2)  # accuracy on the training split (%)

# KNN with 3 neighbours
knn6 = KNeighborsClassifier(n_neighbors=3)
knn6.fit(X_train, y_train)
Y_pred = knn6.predict(X_test)
accuracy_knn6 = round(accuracy_score(y_test, Y_pred) * 100, 2)  # held-out accuracy (%), not reported
acc_knn6 = round(knn6.score(X_train, y_train) * 100, 2)  # accuracy on the training split (%)

print(f'''
TFIDF menggunakan 6 topik dan Nb akurasi = {acc_nb6}
TFIDF menggunakan 6 topik dan KNN akurasi = {acc_knn6}
''')


In [None]:
# ===================== tfidf, 7 topics =================
# LDA on the TF-IDF frame, then Gaussian Naive Bayes and KNN on the topic proportions.
from sklearn.neighbors import KNeighborsClassifier

# LDA hyper-parameters: number of topics and the two Dirichlet priors
k = 7
alpha = 0.1
beta = 0.2

# Seeded so that re-running the cell reproduces the same topics and scores.
lda_model = LatentDirichletAllocation(n_components=k, doc_topic_prior=alpha, topic_word_prior=beta, random_state=0)
# Topic proportions per document: one row per document, one column per topic
proporsi_topik_dokumen = lda_model.fit_transform(tfidf)

kolom_topik = [f'Topik {i}' for i in range(1, k + 1)]
proporsi_topik_dokumen_df = pd.DataFrame(proporsi_topik_dokumen, columns=kolom_topik)

# Labels are attached by index; rows left without topic proportions are dropped.
# NOTE(review): assumes row i of tfidf and row i of the original dataset are the same document — confirm.
df_final = pd.concat([proporsi_topik_dokumen_df, datalabel], axis=1)
df_final = df_final.dropna(subset=kolom_topik)

# Train/test split (70/30), seeded like the term-frequency experiments
X = df_final.iloc[:, :k]
y = df_final['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(X)

# Gaussian Naive Bayes
tfidf_nbT7 = GaussianNB()
tfidf_nbT7.fit(X_train, y_train)
Y_prediction = tfidf_nbT7.predict(X_test)
accuracy_rf = round(accuracy_score(y_test, Y_prediction) * 100, 2)  # held-out accuracy (%), not reported
acc_nb7 = round(tfidf_nbT7.score(X_train, y_train) * 100, 2)  # accuracy on the training split (%)

# KNN with 3 neighbours
knn7 = KNeighborsClassifier(n_neighbors=3)
knn7.fit(X_train, y_train)
Y_pred = knn7.predict(X_test)
accuracy_knn7 = round(accuracy_score(y_test, Y_pred) * 100, 2)  # held-out accuracy (%), not reported
acc_knn7 = round(knn7.score(X_train, y_train) * 100, 2)  # accuracy on the training split (%)

print(f'''
TFIDF menggunakan 7 topik dan Nb akurasi = {acc_nb7}
TFIDF menggunakan 7 topik dan KNN akurasi = {acc_knn7}
''')


In [None]:
# ===================== tfidf, 8 topics =================
# LDA on the TF-IDF frame, then Gaussian Naive Bayes and KNN on the topic proportions.
from sklearn.neighbors import KNeighborsClassifier

# LDA hyper-parameters: number of topics and the two Dirichlet priors
k = 8
alpha = 0.1
beta = 0.2

# Seeded so that re-running the cell reproduces the same topics and scores.
lda_model = LatentDirichletAllocation(n_components=k, doc_topic_prior=alpha, topic_word_prior=beta, random_state=0)
# Topic proportions per document: one row per document, one column per topic
proporsi_topik_dokumen = lda_model.fit_transform(tfidf)

kolom_topik = [f'Topik {i}' for i in range(1, k + 1)]
proporsi_topik_dokumen_df = pd.DataFrame(proporsi_topik_dokumen, columns=kolom_topik)

# Labels are attached by index; rows left without topic proportions are dropped.
# NOTE(review): assumes row i of tfidf and row i of the original dataset are the same document — confirm.
df_final = pd.concat([proporsi_topik_dokumen_df, datalabel], axis=1)
df_final = df_final.dropna(subset=kolom_topik)

# Train/test split (70/30), seeded like the term-frequency experiments
X = df_final.iloc[:, :k]
y = df_final['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(X)

# Gaussian Naive Bayes
tfidf_nbT8 = GaussianNB()
tfidf_nbT8.fit(X_train, y_train)
Y_prediction = tfidf_nbT8.predict(X_test)
accuracy_rf = round(accuracy_score(y_test, Y_prediction) * 100, 2)  # held-out accuracy (%), not reported
acc_nb8 = round(tfidf_nbT8.score(X_train, y_train) * 100, 2)  # accuracy on the training split (%)

# KNN with 3 neighbours
knn8 = KNeighborsClassifier(n_neighbors=3)
knn8.fit(X_train, y_train)
Y_pred = knn8.predict(X_test)
accuracy_knn8 = round(accuracy_score(y_test, Y_pred) * 100, 2)  # held-out accuracy (%), not reported
acc_knn8 = round(knn8.score(X_train, y_train) * 100, 2)  # accuracy on the training split (%)

print(f'''
TFIDF menggunakan 8 topik dan Nb akurasi = {acc_nb8}
TFIDF menggunakan 8 topik dan KNN akurasi = {acc_knn8}
''')


In [None]:
# ===================== tfidf, 8 topics (Naive Bayes only) =================
# Repeats the 8-topic LDA + Gaussian Naive Bayes run and rebinds acc_nb8.

# LDA hyper-parameters: number of topics and the two Dirichlet priors
k = 8
alpha = 0.1
beta = 0.2

# Seeded: an unseeded re-run would overwrite acc_nb8 with a score from a different random
# fit and split than the one behind acc_knn8.
lda_model = LatentDirichletAllocation(n_components=k, doc_topic_prior=alpha, topic_word_prior=beta, random_state=0)
# Topic proportions per document: one row per document, one column per topic
proporsi_topik_dokumen = lda_model.fit_transform(tfidf)

kolom_topik = [f'Topik {i}' for i in range(1, k + 1)]
proporsi_topik_dokumen_df = pd.DataFrame(proporsi_topik_dokumen, columns=kolom_topik)

# Labels are attached by index; rows left without topic proportions are dropped.
# NOTE(review): assumes row i of tfidf and row i of the original dataset are the same document — confirm.
df_final = pd.concat([proporsi_topik_dokumen_df, datalabel], axis=1)
df_final = df_final.dropna(subset=kolom_topik)

# Train/test split (70/30), seeded like the term-frequency experiments
X = df_final.iloc[:, :k]
y = df_final['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(X)

# Gaussian Naive Bayes
tfidf_nbT8 = GaussianNB()
tfidf_nbT8.fit(X_train, y_train)
Y_prediction = tfidf_nbT8.predict(X_test)
accuracy_rf = round(accuracy_score(y_test, Y_prediction) * 100, 2)  # held-out accuracy (%), not reported
acc_nb8 = round(tfidf_nbT8.score(X_train, y_train) * 100, 2)  # accuracy on the training split (%)
# The message now names the topic count actually used (k = 8); it previously said 6.
print(f'''
TFIDF menggunakan 8 topik dan Nb akurasi = {acc_nb8}
''')


In [None]:
import matplotlib.pyplot as plt

# Naive Bayes scores of the TF-IDF runs, ordered by topic count (3 to 8).
hasil_akurasi = [acc_nb3, acc_nb4, acc_nb5, acc_nb6, acc_nb7, acc_nb8]

# One x-axis label per score above.
nomor_iterasi = ['acc_nb3', 'acc_nb4', 'acc_nb5', 'acc_nb6', 'acc_nb7', 'acc_nb8']

# Draw on the current Axes through the object-oriented API.
ax = plt.gca()
ax.plot(nomor_iterasi, hasil_akurasi, marker='o', linestyle='-')

# Axis labels, title and grid
ax.set_xlabel('Hasil model')
ax.set_ylabel('Akurasi')
ax.set_title('Grafik Akurasi Hasil Data')
ax.grid(True)

plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Scores of both classifiers for the TF-IDF runs, ordered by topic count.
akurasi_nb = [acc_nb3, acc_nb4, acc_nb5, acc_nb6, acc_nb7, acc_nb8]
akurasi_knn = [acc_knn3, acc_knn4, acc_knn5, acc_knn6, acc_knn7, acc_knn8]

# Tick labels: the topic counts 3..8
nomor_iterasi = range(3,9)

# Bar width and the x positions of the two bar groups (KNN is shifted right by one bar width)
lebar_batang = 0.2
posisi_nb = np.arange(len(nomor_iterasi))
posisi_knn = [pos + lebar_batang for pos in posisi_nb]

# 10x6 inch figure, drawn through the Axes API
fig = plt.figure(figsize=(10, 6))
ax = fig.gca()

ax.bar(posisi_nb, akurasi_nb, lebar_batang, label='Naive Bayes', color='blue')
ax.bar(posisi_knn, akurasi_knn, lebar_batang, label='K-Nearest Neighbors', color='green')

ax.set_xlabel('Iterasi')
ax.set_ylabel('Akurasi')
ax.set_title('Perbandingan Akurasi Model Naive Bayes dan K-Nearest Neighbors')

# Centre each tick between the two bars of its group
ax.set_xticks([pos + lebar_batang / 2 for pos in posisi_nb])
ax.set_xticklabels(nomor_iterasi)

ax.legend()

plt.show()


In [None]:
# Pick the classifier whose best score across all topic counts is highest.
max_akurasi_nb = max(akurasi_nb)
max_akurasi_knn = max(akurasi_knn)

# Naive Bayes wins only on a strictly higher score; a tie goes to KNN.
nb_menang = max_akurasi_nb > max_akurasi_knn
model_tertinggi = "Naive Bayes" if nb_menang else "K-Nearest Neighbors"
akurasi_tertinggi = max_akurasi_nb if nb_menang else max_akurasi_knn

# Report the winning model and its score.
print(f"Model dengan akurasi tertinggi adalah {model_tertinggi} dengan akurasi {akurasi_tertinggi:.2f}")