In [None]:
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
import seaborn as sns

from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore

from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources used below: the stopword lists and the Punkt
# sentence-tokenizer data that word_tokenize depends on.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases (3.9+) load the tokenizer data from 'punkt_tab' instead
# of the pickled 'punkt' package, and word_tokenize raises LookupError without
# it. nltk.download() reports failures by returning False rather than raising,
# so requesting both keeps old and new NLTK versions working.
nltk.download('punkt_tab')

In [None]:
# ---------------------------
# 1. Load data dan lexicon
# ---------------------------

# Your main input file
df = pd.read_csv("hasil_sentimen_pesantren.csv")

# The lexicon files are read without a header row; the column labelled 0
# (the first column) is used as the word list.
positive_lexicon = pd.read_csv("positive.csv", header=None)
positive_words = positive_lexicon[0].tolist()

negative_lexicon = pd.read_csv("negative.csv", header=None)
negative_words = negative_lexicon[0].tolist()

# NLTK's Indonesian stopword list, stored as a set
indonesian_stopwords = stopwords.words('indonesian')
stop_words = set(indonesian_stopwords)

In [None]:
# ---------------------------
# 2. Perbaikan kalimat (dummy)
# ---------------------------

def simple_refine(text):
    """Lightly normalise a sentence (placeholder for real sentence repair).

    Collapses every run of whitespace into a single space, trims both ends and
    upper-cases the first character. The rest of the sentence keeps its
    original casing.

    Args:
        text: The raw sentence. Missing values (None/NaN) are accepted, and
            other non-string scalars are converted with ``str()``.

    Returns:
        The cleaned sentence, or "" when ``text`` is missing.
    """
    if pd.isna(text):
        return ""
    # split() without arguments already ignores leading/trailing whitespace
    # and collapses internal runs, so a separate strip() is not needed.
    cleaned = " ".join(str(text).split())
    # str.capitalize() would also lower-case everything after the first
    # character, mangling names and acronyms; only touch the first character.
    return cleaned[:1].upper() + cleaned[1:]

# Run the refinement on every row of 'text_combined' and store the result
# as a new 'text_refined' column.
refined_sentences = df['text_combined'].apply(simple_refine)
df['text_refined'] = refined_sentences

In [None]:
# ---------------------------
# 3. Preprocessing untuk LDA
# ---------------------------

def preprocess(text):
    """Turn a sentence into a list of lower-case alphabetic tokens for LDA.

    The text is lower-cased, punctuation is replaced by spaces, the result is
    tokenised with ``word_tokenize`` and tokens that are stopwords (per the
    module-level ``stop_words``) or not purely alphabetic are dropped.

    Args:
        text: The sentence to tokenise; missing values (None/NaN) are accepted.

    Returns:
        A list of tokens, empty when ``text`` is missing.
    """
    if pd.isna(text):
        return []
    text = text.lower()
    # Map each punctuation character to a space instead of deleting it, so
    # words separated only by punctuation ("bagus,tapi", "guru-guru") remain
    # separate tokens rather than being glued into one unknown word.
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(punctuation_to_space)
    tokens = word_tokenize(text)
    return [w for w in tokens if w not in stop_words and w.isalpha()]

# Tokenise every refined sentence
token_lists = df['text_refined'].apply(preprocess)
df['tokens'] = token_lists

# Build the gensim dictionary and the bag-of-words corpus from the token lists
dictionary = corpora.Dictionary(df['tokens'])
corpus = list(map(dictionary.doc2bow, df['tokens']))


In [None]:
# ---------------------------
# 4. LDA Topic Modeling untuk aspek
# ---------------------------

num_topics = 10  # matches your number of aspects

# Train the LDA model on the bag-of-words corpus with a fixed seed.
lda_model = LdaMulticore(
    corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=10,
    workers=2,
    random_state=42,
)

# Assign topik utama tiap kalimat
def get_main_topic(lda_model, bow):
    """Return the id of the most probable topic for a single document.

    Args:
        lda_model: Object exposing ``get_document_topics(bow)`` that returns
            ``(topic_id, probability)`` pairs.
        bow: Bag-of-words representation of one document.

    Returns:
        The topic id with the highest probability (the first one on ties), or
        -1 when the document is empty or the model reports no topic.
    """
    # A document without any tokens gives the model nothing to infer from. If
    # the model answers with a flat distribution, max() would pick the first
    # topic and hide the fact that nothing was known, so report -1 instead.
    if bow is None or len(bow) == 0:
        return -1
    topics = lda_model.get_document_topics(bow)
    if len(topics) == 0:
        return -1
    # Pick the pair with the highest probability and return its topic id
    best_pair = max(topics, key=lambda pair: pair[1])
    return best_pair[0]

# Dominant topic id for every document, in corpus order
df['predicted_aspek'] = [get_main_topic(lda_model, doc_bow) for doc_bow in corpus]

# Map topic ids to aspect names (as supplied by the user); the position in
# the list is the topic id.
# NOTE(review): LDA topic ids are assigned arbitrarily during training, so this
# id -> name pairing presumably needs checking against the learned topics
# (e.g. lda_model.print_topics()) — TODO confirm.
aspek_map = dict(enumerate([
    'Kualitas Guru',
    'Fasilitas',
    'Lingkungan',
    'Kegiatan Pondok',
    'Pembinaan Karakter',
    'Prestasi',
    'Akademik',
    'Motivasi/Spiritual',
    'Sosial',
    'Umum',
]))

# Ids that are not in the mapping (such as -1) become 'Unknown'
aspect_names = df['predicted_aspek'].map(aspek_map)
df['predicted_aspek_label'] = aspect_names.fillna('Unknown')


In [None]:
# ---------------------------
# 5. Sentimen Lexicon-Based
# ---------------------------

def lexicon_sentiment(text, positive=None, negative=None):
    """Classify a sentence by counting positive and negative lexicon words.

    The text is lower-cased, punctuation is replaced by spaces and the result
    is split on whitespace. Every word found in the positive lexicon counts
    once towards the positive score, and likewise for the negative lexicon.

    Args:
        text: The sentence to score; missing values (None/NaN) are accepted.
        positive: Optional iterable of positive words. Defaults to the
            module-level ``positive_words``.
        negative: Optional iterable of negative words. Defaults to the
            module-level ``negative_words``.

    Returns:
        "positif" when positive words outnumber negative ones, "negatif" for
        the opposite, and "netral" on a tie or when ``text`` is missing.
    """
    if pd.isna(text):
        return "netral"

    # Sets give constant-time membership tests instead of scanning the whole
    # lexicon list once per word.
    pos_lexicon = set(positive_words if positive is None else positive)
    neg_lexicon = set(negative_words if negative is None else negative)

    # Replace punctuation with spaces rather than deleting it; deletion glues
    # neighbouring words together ("bagus,tapi" -> "bagustapi") so that
    # neither word can match the lexicon.
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    words = text.lower().translate(punctuation_to_space).split()

    pos_count = sum(1 for w in words if w in pos_lexicon)
    neg_count = sum(1 for w in words if w in neg_lexicon)

    if pos_count > neg_count:
        return "positif"
    if neg_count > pos_count:
        return "negatif"
    return "netral"

# Score every refined sentence with the lexicon and keep the predicted label
sentiment_predictions = df['text_refined'].apply(lexicon_sentiment)
df['predicted_sentimen_lexicon'] = sentiment_predictions


In [None]:
# ---------------------------
# 6. Evaluasi Akurasi & Confusion Matrix
# ---------------------------

def _evaluate_labels(y_true, y_pred, labels, name, cmap, xticks_rotation='horizontal'):
    """Print the accuracy and plot the confusion matrix for one label set.

    Args:
        y_true: pandas Series of reference labels.
        y_pred: pandas Series of predicted labels, sharing ``y_true``'s index.
        labels: Label order used for the confusion-matrix rows and columns.
        name: Name inserted into the printed accuracy line and the plot title.
        cmap: Matplotlib colormap name for the confusion-matrix plot.
        xticks_rotation: Rotation passed to ``ConfusionMatrixDisplay.plot``.

    Returns:
        A tuple ``(accuracy, confusion_matrix, display)``.
    """
    # Rows without a reference label cannot be scored, and a mix of strings
    # and NaN makes the scikit-learn metrics fail, so leave those rows out.
    has_label = y_true.notna()
    y_true = y_true[has_label]
    y_pred = y_pred[has_label]

    acc = accuracy_score(y_true, y_pred)
    print(f"Akurasi {name}: {acc:.2f}")

    cm = confusion_matrix(y_true, y_pred, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap=cmap, xticks_rotation=xticks_rotation)
    plt.title(f"Confusion Matrix {name}")
    plt.tight_layout()
    plt.show()
    return acc, cm, disp


# Assumes the manual labels already use the same format as the predictions
# Aspect evaluation
if 'aspek_manual' in df.columns:
    y_true_aspek = df['aspek_manual']
    y_pred_aspek = df['predicted_aspek_label']

    acc_aspek, cm_aspek, disp_aspek = _evaluate_labels(
        y_true_aspek,
        y_pred_aspek,
        labels=list(aspek_map.values()),
        name="Aspek",
        cmap='Blues',
        xticks_rotation=45,
    )
else:
    print("Kolom aspek_manual tidak ditemukan, evaluasi aspek dilewati.")

# Sentiment evaluation
if 'sentimen_lexicon' in df.columns:
    # Reference labels are lower-cased to match the predicted label spelling
    y_true_sentimen = df['sentimen_lexicon'].str.lower()
    y_pred_sentimen = df['predicted_sentimen_lexicon']

    labels_sentimen = ['positif','netral','negatif']
    acc_sentimen, cm_sentimen, disp_sentimen = _evaluate_labels(
        y_true_sentimen,
        y_pred_sentimen,
        labels=labels_sentimen,
        name="Sentimen",
        cmap='Greens',
    )
else:
    print("Kolom sentimen_lexicon tidak ditemukan, evaluasi sentimen dilewati.")


In [None]:
# ---------------------------
# 7. Visualisasi Aspek dan Sentimen per Ponpes
# ---------------------------

# One count plot per predicted column, both grouped by 'ponpes'
for hue_column, plot_title, legend_title in (
    ('predicted_aspek_label', "Distribusi Aspek per Ponpes", 'Aspek'),
    ('predicted_sentimen_lexicon', "Distribusi Sentimen per Ponpes", 'Sentimen'),
):
    plt.figure(figsize=(12,6))
    sns.countplot(data=df, x='ponpes', hue=hue_column)
    plt.title(plot_title)
    plt.xticks(rotation=45)
    # Anchor the legend just outside the right edge of the axes
    plt.legend(title=legend_title, bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

# Heatmap of the number of sentences per aspect and sentiment (combined)
pivot = df.pivot_table(
    index='predicted_aspek_label',
    columns='predicted_sentimen_lexicon',
    aggfunc='size',
    fill_value=0,
)
plt.figure(figsize=(10,6))
sns.heatmap(pivot, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Jumlah Kalimat per Aspek dan Sentimen")
plt.ylabel("Aspek")
plt.xlabel("Sentimen")
plt.tight_layout()
plt.show()


In [None]:
# ---------------------------
# 8. Simpan hasil akhir
# ---------------------------

# Write the final table to disk. The path is kept in one variable so the
# confirmation message always names the file that was actually written
# (the old message printed 'hasil_analisis model.csv', with a space).
output_path = "hasil_analisis_model.csv"
df.to_csv(output_path, index=False)
print(f"Selesai, hasil disimpan ke '{output_path}'")