# In [None]:
# --- 1. Import Library ---
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
nltk.download('stopwords')
from nltk.corpus import stopwords

# --- 2. Load Dataset ---
# Adjust this path to match your own Colab environment.
DATA_PATH = "/content/drive/MyDrive/PPW/artikel_medium_multitopik_rss.csv"
df = pd.read_csv(DATA_PATH)
print("Jumlah data:", df.shape[0])  # number of rows loaded
print(df.head())

# --- 3. Preprocessing ---
# English stop words from NLTK, kept in a set for fast membership tests.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

def clean_text(text, stopword_set=None):
    """Normalize raw text into a space-separated string of filtered tokens.

    Steps: lowercase, replace every character that is not ``a-z`` or
    whitespace with a space, split on whitespace, then drop stop words and
    tokens of two characters or fewer.

    Args:
        text: The raw text. Non-string values are converted with ``str``;
            missing values (``None``, NaN, ``pd.NA``) yield an empty string.
        stopword_set: Optional collection of words to remove. Defaults to
            the module-level ``stop_words`` set.

    Returns:
        The cleaned tokens joined by single spaces ("" if none remain).
    """
    if stopword_set is None:
        stopword_set = stop_words

    # Missing cells would otherwise be stringified into the bogus tokens
    # "nan" / "none" and end up in the vocabulary.
    if text is None:
        return ""
    if not isinstance(text, str) and pd.api.types.is_scalar(text) and pd.isna(text):
        return ""

    lowered = str(text).lower()
    # Substitute a space (not the empty string) so that words joined only by
    # punctuation, e.g. "health/care", are not fused into a single token.
    letters_only = re.sub(r'[^a-z\s]', ' ', lowered)
    return " ".join(
        token
        for token in letters_only.split()
        if len(token) > 2 and token not in stopword_set
    )

# Run the cleaning step over every article body ("isi" column).
df["clean_text"] = df["isi"].apply(clean_text)

# --- 4. Split Data ---
# 80/20 split, stratified on the category label, with a fixed seed.
labels = df["kategori"]
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"],
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# --- 5./6. Count representation + Latent Dirichlet Allocation (LDA) ---
n_topics = 10  # number of latent topics

# Build both estimators up front; nothing is fitted until below.
vectorizer = CountVectorizer(max_features=3000)
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)

# Term counts: vocabulary is fitted on the training split only and then
# reused for the test split.
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Topic features: LDA is fitted on the training counts, then applied to
# the test counts.
X_train_lda = lda.fit_transform(X_train_counts)
X_test_lda = lda.transform(X_test_counts)

print(f"Dimensi hasil LDA: {X_train_lda.shape}")

# --- 7. Classification (Naive Bayes) ---
# Fit on the LDA features of the training split; fit() returns the estimator.
model = MultinomialNB().fit(X_train_lda, y_train)
y_pred = model.predict(X_test_lda)

# --- 8. Evaluation ---
print("\n=== HASIL EVALUASI ===")
akurasi = accuracy_score(y_test, y_pred)
print("Akurasi:", akurasi)
laporan = classification_report(y_test, y_pred)
print("\nClassification Report:\n", laporan)

# --- 9. Example prediction on new text ---
contoh = ["AI helps doctors detect diseases early"]
# New text must go through the same clean_text preprocessing as the training
# data before vectorizing; otherwise tokens are formed differently at
# inference time (e.g. "covid19" would never match the training token "covid").
contoh_clean = [clean_text(teks) for teks in contoh]
contoh_counts = vectorizer.transform(contoh_clean)
contoh_lda = lda.transform(contoh_counts)
prediksi = model.predict(contoh_lda)
print("\nContoh Prediksi:", prediksi[0])
