In [None]:
import pandas as pd
import numpy as np
import string
import re
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Libraries untuk NLP dan Text Processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Libraries untuk Topic Modeling dan Machine Learning
from gensim import corpora, models
from gensim.utils import simple_preprocess
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

# Libraries untuk Transformers (mT5)
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Download NLTK data yang diperlukan
# Download the NLTK resources used by this notebook.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK
# releases load the tokenizer tables from that package.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    try:
        # nltk.download() signals most failures by returning False
        if not nltk.download(_nltk_resource):
            print(f"Peringatan: gagal mengunduh resource NLTK '{_nltk_resource}'")
    except Exception as e:
        # Best effort: the resource may already be installed locally, so a
        # failed download is reported but does not stop the notebook.
        print(f"Peringatan: gagal mengunduh resource NLTK '{_nltk_resource}': {e}")

print("=== ANALISIS SENTIMEN DAN ASPEK ULASAN PESANTREN ===\n")

In [None]:
# =======================
# 1. LOADING DATA
# =======================
print("1. Loading Data...")

try:
    # Main dataset; later cells read its 'text_combined' column
    df = pd.read_csv('hasil_sentimen_pesantren.csv')
    print(f"Dataset utama berhasil dimuat: {df.shape}")
    print("============text combined===========")
    print(df['text_combined'])

    # Positive and negative lexicons, read from the 'word' column of each file
    positive_words = pd.read_csv('positive.csv')['word'].tolist()
    negative_words = pd.read_csv('negative.csv')['word'].tolist()
    print(f"Kata positif: {len(positive_words)}, Kata negatif: {len(negative_words)}")

    # Dataset overview
    print("\nInfo Dataset:")
    # info() prints its report itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line).
    df.info()
    print("\nSample data:")
    print(df.head())

except Exception as e:
    print(f"Error loading data: {e}")
    print("Pastikan file CSV tersedia dan nama kolom sesuai")
    # Every later cell needs df and the lexicons; stop here with the real
    # cause instead of failing later with an unrelated NameError.
    raise


In [None]:
# =======================
# 2. TEXT PREPROCESSING
# =======================
print("\n2. Text Preprocessing...")

# Sastrawi helpers for Indonesian text: a stop-word remover and a stemmer
factory = StopWordRemoverFactory()
stopword_remover = factory.create_stop_word_remover()

stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

# Stop-word vocabulary: NLTK's Indonesian list extended with a custom set
indonesian_stopwords = set(stopwords.words('indonesian'))
custom_stopwords = {
    'yang', 'ini', 'itu', 'dengan', 'untuk', 'pada', 'dalam', 'dari', 'ke',
    'di', 'dan', 'atau', 'adalah', 'akan', 'sudah', 'telah', 'dapat', 'bisa',
    'harus', 'juga', 'saja', 'hanya', 'sangat', 'lebih', 'paling', 'agak',
    'cukup',
}
all_stopwords = indonesian_stopwords | custom_stopwords

def clean_text(text):
    """Normalize raw text into lowercase words separated by single spaces.

    Steps, in order: lowercase, drop URLs, drop @mentions and #hashtags,
    replace punctuation (including underscores) with spaces, delete digits,
    collapse runs of whitespace.

    Args:
        text: Value to clean; non-strings are converted with ``str()``.

    Returns:
        The cleaned string, or "" when ``text`` is missing (``pd.isna``).
    """
    if pd.isna(text):
        return ""

    # Convert to lowercase
    text = str(text).lower()

    # Remove URLs, mentions, hashtags
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+|#\w+', '', text)

    # Remove punctuation and numbers. "_" is matched explicitly because \w
    # counts it as a word character, so [^\w\s] alone would leave it in.
    text = re.sub(r'[^\w\s]|_', ' ', text)
    text = re.sub(r'\d+', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def preprocess_text(text, remove_stopwords=True, do_stemming=False):
    """Clean and tokenize ``text``, then optionally filter and stem the tokens.

    Args:
        text: Raw text; missing values and "" yield "".
        remove_stopwords: When true, drop tokens found in ``all_stopwords``
            and tokens of two characters or fewer.
        do_stemming: When true, replace each token with ``stemmer.stem(token)``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    is_blank = pd.isna(text) or text == ""
    if is_blank:
        return ""

    # Normalize first, then split into word tokens
    words = word_tokenize(clean_text(text))

    if remove_stopwords:
        words = [w for w in words if not (w in all_stopwords or len(w) <= 2)]

    if do_stemming:
        words = list(map(stemmer.stem, words))

    return ' '.join(words)

# Derive two columns from the raw text:
#   text_cleaned   - preprocessed with stop words kept
#   text_processed - preprocessed with stop words removed
print("Applying text preprocessing...")
df['text_cleaned'] = df['text_combined'].apply(preprocess_text, remove_stopwords=False)
df['text_processed'] = df['text_combined'].apply(preprocess_text, remove_stopwords=True)

print("Text preprocessing selesai!")
print("====================")
print(df["text_processed"])


In [None]:
# 3. TEXT REFINEMENT dengan mT5
# =======================
print("\n3. Text Refinement menggunakan mT5...")

class TextRefiner:
    """Rewrite text with a pretrained T5 checkpoint.

    If the checkpoint cannot be loaded, or generation fails for a text, the
    class falls back to a rule-based clean-up (``simple_text_improvement``).
    """

    def __init__(self):
        """Try to load tokenizer and model; on failure set ``self.model`` to None."""
        self.model_name = "cahya/t5-base-indonesian-summarization-cased"
        try:
            print("Loading mT5 model...")
            self.tokenizer = T5Tokenizer.from_pretrained(self.model_name)
            self.model = T5ForConditionalGeneration.from_pretrained(self.model_name)
            # Prefer the GPU when one is available
            device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
            self.device = torch.device(device_name)
            self.model.to(self.device)
            print(f"Model loaded successfully on {self.device}")
        except Exception as e:
            print(f"Error loading model: {e}")
            print("Menggunakan text cleaning sederhana sebagai alternatif")
            # None marks "no model": refine_text then uses the fallback only
            self.model = None

    def refine_text(self, text, max_length=512):
        """Return a model-generated rewrite of ``text``.

        Args:
            text: Input text; missing or empty input yields "".
            max_length: Token limit applied both to the encoded input
                (with truncation) and to the generated output.

        Returns:
            The decoded generation, the original ``text`` when the generation
            decodes to "", or the rule-based fallback when no model is loaded
            or an exception occurs.
        """
        if self.model is None:
            # No model available: rule-based improvement only
            return self.simple_text_improvement(text)

        if pd.isna(text) or text == "":
            return ""

        try:
            prompt = f"paraphrase: {text}"

            input_ids = self.tokenizer.encode(
                prompt,
                return_tensors="pt",
                max_length=max_length,
                truncation=True
            )
            input_ids = input_ids.to(self.device)

            generation_options = dict(
                max_length=max_length,
                num_beams=4,
                early_stopping=True,
                temperature=0.7,
                do_sample=True
            )
            # Inference only, so gradient tracking is disabled
            with torch.no_grad():
                generated = self.model.generate(input_ids, **generation_options)

            candidate = self.tokenizer.decode(generated[0], skip_special_tokens=True)
            if candidate == "":
                return text
            return candidate

        except Exception as e:
            print(f"Error refining text: {e}")
            return self.simple_text_improvement(text)

    def simple_text_improvement(self, text):
        """Rule-based fallback: trim, capitalize, terminate and tidy spacing.

        Returns "" for missing or empty input. Otherwise the first character
        is upper-cased, a "." is appended unless the text already ends in
        ".", "!" or "?", whitespace runs are collapsed, and whitespace
        before ".", "!" or "?" is removed.
        """
        if pd.isna(text) or text == "":
            return ""

        cleaned = str(text).strip()

        if cleaned:
            cleaned = cleaned[0].upper() + cleaned[1:]
            if cleaned[-1] not in '.!?':
                cleaned += '.'

        cleaned = re.sub(r'\s+', ' ', cleaned)
        return re.sub(r'\s+([.!?])', r'\1', cleaned)

# Build the refiner (loads the model, or falls back to rule-based cleaning)
refiner = TextRefiner()

# Refinement can be slow, so large datasets are cut down to the first 100 rows
print("Melakukan text refinement...")
is_large_dataset = len(df) > 100
if is_large_dataset:
    print("Dataset besar terdeteksi. Memproses sample 100 data pertama untuk demo...")
sample_df = (df.head(100) if is_large_dataset else df).copy()

# Walk the sample in fixed-size slices so progress can be reported per slice
batch_size = 10
total_batches = (len(sample_df) - 1) // batch_size + 1
refined_texts = []

for batch_number, start in enumerate(range(0, len(sample_df), batch_size), start=1):
    print(f"Processing batch {batch_number}/{total_batches}")
    batch = sample_df.iloc[start:start + batch_size]
    refined_texts.extend(refiner.refine_text(text) for text in batch['text_cleaned'])

sample_df['text_refined'] = refined_texts
print("Text refinement selesai!")
print("===========================")
print(sample_df["text_refined"])



3. Text Refinement menggunakan T5 Bahasa Indonesia...
Loading T5 model...
Error loading model: 
T5Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.

Menggunakan text cleaning sederhana sebagai alternatif
Melakukan text refinement...
Dataset besar terdeteksi. Memproses sample 100 data pertama untuk demo...
Processing batch 1/10
Processing batch 2/10
Processing batch 3/10
Processing batch 4/10
Processing batch 5/10
Processing batch 6/10
Processing batch 7/10
Processing batch 8/10
Processing batch 9/10
Processing batch 10/10
Text refinement selesai!
                                         text_cleaned  \
0   masyaallah santri ppm al ikhlash serah bantu k...   
1   pimpin pondok keren pimpin al ikhlash terima k...  

In [None]:
# =======================
# 4. TOPIC MODELING dengan LDA
# =======================
print("\n4. Topic Modeling dengan LDA...")

# The ten aspect names, keyed by their position (0-9)
aspect_labels = dict(enumerate([
    "Kualitas Guru",
    "Fasilitas",
    "Lingkungan",
    "Kegiatan Pondok",
    "Pembinaan Karakter",
    "Prestasi",
    "Akademik",
    "Motivasi/Spiritual",
    "Sosial",
    "Umum",
]))

def prepare_lda_data(texts):
    """Turn raw texts into token lists suitable for building an LDA corpus.

    Missing or empty texts are skipped. Each remaining text is tokenized with
    ``simple_preprocess`` (accents stripped, minimum token length 3) and
    filtered against ``all_stopwords``; only documents with more than three
    tokens left are kept.

    Returns:
        A list of token lists, one per retained document.
    """
    documents = []
    for raw in texts:
        if pd.isna(raw) or raw == "":
            continue

        words = [
            word
            for word in simple_preprocess(raw, deacc=True, min_len=3)
            if word not in all_stopwords
        ]
        # Very short documents are dropped
        if len(words) > 3:
            documents.append(words)

    return documents

# Prepare data for LDA
print("Preparing data for LDA...")
lda_texts = prepare_lda_data(sample_df['text_refined'])

# One LDA topic per predefined aspect, so topic ids index aspect_labels.
# NOTE(review): LDA topics are unsupervised; nothing here ties topic i to the
# meaning of aspect_labels[i] - the pairing is positional only.
num_topics = len(aspect_labels)
# Topic id assigned when a text cannot be scored ("Umum")
default_topic = 9

# Build the dictionary first so an empty vocabulary (no documents, or all
# terms removed by filter_extremes) takes the fallback path instead of
# failing inside LdaModel.
dictionary = corpora.Dictionary(lda_texts)
if len(lda_texts) > 0:
    dictionary.filter_extremes(no_below=2, no_above=0.8)

if len(dictionary) > 0:
    corpus = [dictionary.doc2bow(text) for text in lda_texts]

    print(f"Dictionary size: {len(dictionary)}")
    print(f"Corpus size: {len(corpus)}")

    # Train LDA model
    print("Training LDA model...")
    lda_model = models.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
        passes=10,
        alpha='auto',
        per_word_topics=True,
        minimum_probability=0.01
    )

    print("LDA model training selesai!")

    def get_dominant_topic(text):
        """Return the id of the most probable LDA topic for ``text``.

        Falls back to ``default_topic`` when the text is missing or empty,
        has no tokens after stop-word removal, has no token in the
        dictionary, or receives no topic from the model.
        """
        if pd.isna(text) or text == "":
            return default_topic

        tokens = simple_preprocess(text, deacc=True, min_len=3)
        tokens = [token for token in tokens if token not in all_stopwords]

        if len(tokens) == 0:
            return default_topic

        bow = dictionary.doc2bow(tokens)
        if len(bow) == 0:
            return default_topic

        topic_probs = lda_model.get_document_topics(bow)
        if len(topic_probs) == 0:
            return default_topic

        # Entries are (topic_id, probability); pick the highest probability
        dominant_topic = max(topic_probs, key=lambda x: x[1])[0]
        return dominant_topic

    # Apply topic prediction
    print("Predicting aspects...")
    sample_df['aspek_predicted'] = sample_df['text_refined'].apply(get_dominant_topic)

    # Display topic keywords
    print("\nTop keywords per topic:")
    for idx, topic in lda_model.print_topics(num_words=10):
        print(f"Topic {idx} ({aspect_labels[idx]}): {topic}")

else:
    print("Tidak cukup data untuk training LDA. Menggunakan random assignment.")
    # Seeded generator so the fallback assignment is reproducible across runs
    sample_df['aspek_predicted'] = np.random.default_rng(42).integers(
        0, num_topics, len(sample_df)
    )



In [None]:
# =======================
# 5. SENTIMENT ANALYSIS (Lexicon-based)
# =======================
print("\n5. Sentiment Analysis (Lexicon-based)...")

def analyze_sentiment_lexicon(text, positive_words, negative_words):
    """Classify ``text`` by counting lexicon hits among its tokens.

    Args:
        text: Text to classify; missing or empty input yields 'netral'.
            Non-string values are converted with ``str()``.
        positive_words: Iterable of positive terms (a set avoids the
            per-call conversion below).
        negative_words: Iterable of negative terms.

    Returns:
        'positif' when more tokens match the positive lexicon, 'negatif' when
        more match the negative lexicon, otherwise 'netral'.
    """
    if pd.isna(text) or text == "":
        return 'netral'

    # Membership tests on a list scan the whole lexicon for every token;
    # a set makes each lookup constant time.
    if not isinstance(positive_words, (set, frozenset)):
        positive_words = set(positive_words)
    if not isinstance(negative_words, (set, frozenset)):
        negative_words = set(negative_words)

    # Tokenize text
    tokens = word_tokenize(str(text).lower())

    # Count positive and negative words
    pos_count = sum(1 for word in tokens if word in positive_words)
    neg_count = sum(1 for word in tokens if word in negative_words)

    # Determine sentiment
    if pos_count > neg_count:
        return 'positif'
    elif neg_count > pos_count:
        return 'negatif'
    else:
        return 'netral'

# Label every refined text with the lexicon-based classifier
print("Analyzing sentiment...")
sample_df['sentimen_predicted'] = sample_df['text_refined'].apply(
    analyze_sentiment_lexicon, args=(positive_words, negative_words)
)

print("Sentiment analysis selesai!")


In [None]:
# =======================
# 6. EVALUASI MODEL
# =======================
print("\n6. Evaluasi Model...")

# Defaults, so later cells can reference these names even when a
# ground-truth column is missing and the matching evaluation is skipped.
aspect_accuracy = 0
sentiment_accuracy = 0
aspect_cm = None
sentiment_cm = None

# Each evaluation depends only on its own ground-truth column. Later cells
# check the two columns independently, so they are evaluated independently.
if 'aspek_manual' in sample_df.columns:

    # Aspect evaluation
    print("=== EVALUASI ASPEK ===")
    aspect_ids = sorted(aspect_labels)

    # Convert manual aspect labels to numeric ids when they are strings
    if sample_df['aspek_manual'].dtype == 'object':
        name_to_id = {label: idx for idx, label in aspect_labels.items()}
        manual_names = sample_df['aspek_manual'].astype(str).str.strip()
        if manual_names.isin(list(name_to_id)).all():
            # All labels are known aspect names: use the ids of aspect_labels
            # so they line up with aspek_predicted. LabelEncoder would number
            # the names alphabetically instead.
            sample_df['aspek_manual_encoded'] = manual_names.map(name_to_id)
        else:
            print("Peringatan: label aspek_manual tidak cocok dengan aspect_labels; memakai LabelEncoder.")
            le_aspect = LabelEncoder()
            sample_df['aspek_manual_encoded'] = le_aspect.fit_transform(sample_df['aspek_manual'].astype(str))
    else:
        sample_df['aspek_manual_encoded'] = sample_df['aspek_manual']

    # Aspect accuracy
    aspect_accuracy = accuracy_score(sample_df['aspek_manual_encoded'], sample_df['aspek_predicted'])
    print(f"Akurasi Prediksi Aspek: {aspect_accuracy:.4f}")

    # Aspect confusion matrix
    aspect_cm = confusion_matrix(sample_df['aspek_manual_encoded'], sample_df['aspek_predicted'])

    # Aspect classification report. `labels` is passed explicitly: without
    # it, target_names must match the number of classes actually present,
    # and the call raises ValueError when some aspect never occurs.
    aspect_report = classification_report(
        sample_df['aspek_manual_encoded'],
        sample_df['aspek_predicted'],
        labels=aspect_ids,
        target_names=[aspect_labels[i] for i in aspect_ids],
        zero_division=0
    )
    print("Classification Report - Aspek:")
    print(aspect_report)

else:
    print("Kolom aspek_manual tidak ditemukan. Melewati evaluasi aspek.")

if 'sentimen_lexicon' in sample_df.columns:

    # Sentiment evaluation
    print("\n=== EVALUASI SENTIMEN ===")

    # Sentiment accuracy
    sentiment_accuracy = accuracy_score(sample_df['sentimen_lexicon'], sample_df['sentimen_predicted'])
    print(f"Akurasi Prediksi Sentimen: {sentiment_accuracy:.4f}")

    # Sentiment confusion matrix
    sentiment_cm = confusion_matrix(sample_df['sentimen_lexicon'], sample_df['sentimen_predicted'])

    # Sentiment classification report
    sentiment_report = classification_report(
        sample_df['sentimen_lexicon'],
        sample_df['sentimen_predicted'],
        zero_division=0
    )
    print("Classification Report - Sentimen:")
    print(sentiment_report)

else:
    print("Kolom sentimen_lexicon tidak ditemukan. Melewati evaluasi sentimen.")

In [None]:
# =======================
# 7. VISUALISASI
# =======================
print("\n7. Membuat Visualisasi...")

# A confusion matrix can only be drawn when the evaluation cell actually
# computed it. Column presence alone is not enough: the evaluation may have
# been skipped, leaving the matrix undefined (or None).
aspect_cm_ready = (
    'aspek_manual_encoded' in sample_df.columns
    and globals().get('aspect_cm') is not None
)
sentiment_cm_ready = (
    'sentimen_lexicon' in sample_df.columns
    and globals().get('sentiment_cm') is not None
)

# Plot style
plt.style.use('default')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10

# One figure holding a 3x3 grid of panels
fig = plt.figure(figsize=(20, 15))

# 1. Aspect distribution
plt.subplot(3, 3, 1)
aspect_counts = sample_df['aspek_predicted'].value_counts().sort_index()
aspect_names = [aspect_labels[i] for i in aspect_counts.index]
plt.bar(range(len(aspect_counts)), aspect_counts.values, color='skyblue')
plt.title('Distribusi Aspek (Prediksi)', fontsize=12, fontweight='bold')
plt.xlabel('Aspek')
plt.ylabel('Jumlah')
plt.xticks(range(len(aspect_counts)), aspect_names, rotation=45, ha='right')
# Layout is adjusted once, after all panels exist (see the end of this cell)

# 2. Sentiment distribution
plt.subplot(3, 3, 2)
sentiment_counts = sample_df['sentimen_predicted'].value_counts()
colors = {'positif': 'green', 'negatif': 'red', 'netral': 'orange'}
plt.pie(sentiment_counts.values, labels=sentiment_counts.index, autopct='%1.1f%%',
        colors=[colors.get(x, 'gray') for x in sentiment_counts.index])
plt.title('Distribusi Sentimen (Prediksi)', fontsize=12, fontweight='bold')

# 3. Review counts per pesantren (ten most frequent)
plt.subplot(3, 3, 3)
ponpes_counts = sample_df['ponpes'].value_counts().head(10)
plt.barh(range(len(ponpes_counts)), ponpes_counts.values, color='lightcoral')
plt.title('Top 10 Pondok Pesantren (Jumlah Ulasan)', fontsize=12, fontweight='bold')
plt.xlabel('Jumlah Ulasan')
plt.yticks(range(len(ponpes_counts)), ponpes_counts.index)
plt.gca().invert_yaxis()

# 4. Heatmap: aspect vs sentiment
plt.subplot(3, 3, 4)
aspect_sentiment_crosstab = pd.crosstab(sample_df['aspek_predicted'], sample_df['sentimen_predicted'])
sns.heatmap(aspect_sentiment_crosstab, annot=True, fmt='d', cmap='YlOrRd')
plt.title('Heatmap: Aspek vs Sentimen', fontsize=12, fontweight='bold')
plt.xlabel('Sentimen')
plt.ylabel('Aspek')

# 5. Confusion matrix - aspect (only when it was computed)
if aspect_cm_ready:
    plt.subplot(3, 3, 5)
    sns.heatmap(aspect_cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - Aspek\n(Accuracy: {aspect_accuracy:.3f})', fontsize=12, fontweight='bold')
    plt.xlabel('Prediksi')
    plt.ylabel('Aktual')

# 6. Confusion matrix - sentiment (only when it was computed)
if sentiment_cm_ready:
    plt.subplot(3, 3, 6)
    sns.heatmap(sentiment_cm, annot=True, fmt='d', cmap='Greens')
    plt.title(f'Confusion Matrix - Sentimen\n(Accuracy: {sentiment_accuracy:.3f})', fontsize=12, fontweight='bold')
    plt.xlabel('Prediksi')
    plt.ylabel('Aktual')

# 7. Distribution of refined-text length (in characters)
plt.subplot(3, 3, 7)
sample_df['text_length'] = sample_df['text_refined'].astype(str).apply(len)
plt.hist(sample_df['text_length'], bins=30, color='mediumpurple', alpha=0.7)
plt.title('Distribusi Panjang Teks (Refined)', fontsize=12, fontweight='bold')
plt.xlabel('Panjang Karakter')
plt.ylabel('Frekuensi')

# 8. Most frequent words in texts predicted positive (bar chart, not a word cloud)
plt.subplot(3, 3, 8)
positive_texts = sample_df[sample_df['sentimen_predicted'] == 'positif']['text_refined']
all_positive_words = ' '.join(positive_texts.astype(str)).lower().split()
positive_word_freq = Counter([word for word in all_positive_words if len(word) > 3])
top_positive_words = dict(positive_word_freq.most_common(10))

if top_positive_words:
    plt.bar(range(len(top_positive_words)), list(top_positive_words.values()), color='lightgreen')
    plt.title('Kata Populer - Sentimen Positif', fontsize=12, fontweight='bold')
    plt.xticks(range(len(top_positive_words)), list(top_positive_words.keys()), rotation=45, ha='right')
    plt.ylabel('Frekuensi')

# 9. Summary statistics
plt.subplot(3, 3, 9)
# Summary table data (kept as a variable; the panel itself is drawn as text)
summary_data = {
    'Metrik': ['Total Data', 'Aspek Unik', 'Sentimen Positif', 'Sentimen Negatif', 'Sentimen Netral'],
    'Nilai': [
        len(sample_df),
        sample_df['aspek_predicted'].nunique(),
        len(sample_df[sample_df['sentimen_predicted'] == 'positif']),
        len(sample_df[sample_df['sentimen_predicted'] == 'negatif']),
        len(sample_df[sample_df['sentimen_predicted'] == 'netral'])
    ]
}

# Text-based summary panel
plt.text(0.1, 0.7, 'RINGKASAN ANALISIS', fontsize=14, fontweight='bold')
plt.text(0.1, 0.6, f'Total Data: {len(sample_df)}', fontsize=11)
plt.text(0.1, 0.5, f'Aspek Unik: {sample_df["aspek_predicted"].nunique()}', fontsize=11)
plt.text(0.1, 0.4, f'Sentimen Positif: {len(sample_df[sample_df["sentimen_predicted"] == "positif"])}', fontsize=11)
plt.text(0.1, 0.3, f'Sentimen Negatif: {len(sample_df[sample_df["sentimen_predicted"] == "negatif"])}', fontsize=11)
plt.text(0.1, 0.2, f'Sentimen Netral: {len(sample_df[sample_df["sentimen_predicted"] == "netral"])}', fontsize=11)

# Accuracies are shown only for evaluations that actually ran
if aspect_cm_ready:
    plt.text(0.1, 0.1, f'Akurasi Aspek: {aspect_accuracy:.3f}', fontsize=11)
if sentiment_cm_ready:
    plt.text(0.1, 0.05, f'Akurasi Sentimen: {sentiment_accuracy:.3f}', fontsize=11)

plt.axis('off')

plt.tight_layout()
plt.savefig('hasil_analisis_visualisasi.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# =======================
# 8. MENYIMPAN HASIL
# =======================
print("\n8. Menyimpan Hasil...")

# Human-readable aspect name for every predicted topic id
sample_df['aspek_label'] = sample_df['aspek_predicted'].map(aspect_labels)

# Columns exported to the result file
hasil_final = sample_df[[
    'no', 'ponpes', 'text_combined', 'text_refined',
    'aspek_predicted', 'aspek_label', 'sentimen_predicted'
]].copy()

# Add the reference labels when the dataset provides them
if 'aspek_manual' in sample_df.columns:
    hasil_final['aspek_manual'] = sample_df['aspek_manual']
if 'sentimen_lexicon' in sample_df.columns:
    hasil_final['sentimen_manual'] = sample_df['sentimen_lexicon']

# Save results
hasil_final.to_csv('hasil_analisis.csv', index=False)
print("Hasil analisis berhasil disimpan ke 'hasil_analisis.csv'")

# Evaluation metrics; an accuracy is reported as 'N/A' when its
# ground-truth column is absent
evaluation_results = {
    'total_data': len(sample_df),
    'aspek_accuracy': aspect_accuracy if 'aspek_manual_encoded' in sample_df.columns else 'N/A',
    'sentiment_accuracy': sentiment_accuracy if 'sentimen_lexicon' in sample_df.columns else 'N/A',
    'aspek_distribution': sample_df['aspek_predicted'].value_counts().to_dict(),
    'sentiment_distribution': sample_df['sentimen_predicted'].value_counts().to_dict()
}

# Save evaluation as JSON for further analysis
import json
# ensure_ascii=False writes non-ASCII characters verbatim, so the file
# encoding is fixed to UTF-8 instead of relying on the platform default.
with open('evaluation_metrics.json', 'w', encoding='utf-8') as f:
    json.dump(evaluation_results, f, indent=2, ensure_ascii=False)

print("Metrics evaluasi disimpan ke 'evaluation_metrics.json'")


In [None]:
# =======================
# 9. RINGKASAN AKHIR
# =======================
# Console recap: row count, aspect and sentiment distributions, accuracies
# (when reference columns exist) and the list of output files.
banner = "=" * 50
print("", banner, sep="\n")
print("RINGKASAN HASIL ANALISIS")
print(banner)
print(f"Total data diproses: {len(sample_df)}")

print("Distribusi Aspek:")
aspect_distribution = sample_df['aspek_predicted'].value_counts().sort_index()
for aspek_id, count in aspect_distribution.items():
    print(f"  {aspect_labels[aspek_id]}: {count}")

print("\nDistribusi Sentimen:")
sentiment_distribution = sample_df['sentimen_predicted'].value_counts()
for sentiment, count in sentiment_distribution.items():
    print(f"  {sentiment.capitalize()}: {count}")

if 'aspek_manual_encoded' in sample_df.columns:
    print(f"\nAkurasi Prediksi Aspek: {aspect_accuracy:.4f}")
if 'sentimen_lexicon' in sample_df.columns:
    print(f"Akurasi Prediksi Sentimen: {sentiment_accuracy:.4f}")

print("\nFile output:")
for output_line in (
    "- hasil_analisis.csv: Hasil lengkap analisis",
    "- evaluation_metrics.json: Metrics evaluasi",
    "- hasil_analisis_visualisasi.png: Visualisasi hasil",
):
    print(output_line)

print("\n=== ANALISIS SELESAI ===")


In [None]:
# =======================
# 10. SARAN OPTIMASI
# =======================
# Print the heading, then the list of optimisation and development ideas
print("", "=" * 50, sep="\n")
print("SARAN OPTIMASI DAN PENGEMBANGAN")
print("=" * 50)

saran_optimasi = """
1. TEXT REFINEMENT:
   - Gunakan GPU untuk mempercepat inferensi mT5
   - Pertimbangkan fine-tuning model mT5 pada domain pesantren
   - Batch processing untuk efisiensi memori

2. TOPIC MODELING:
   - Eksperimen dengan jumlah topik berbeda (5-15)
   - Gunakan coherence score untuk optimasi parameter
   - Pertimbangkan BERTopic untuk hasil yang lebih baik

3. SENTIMENT ANALYSIS:
   - Perluas lexicon dengan kata-kata domain pesantren
   - Gunakan word embedding untuk deteksi sinonim
   - Pertimbangkan ensemble dengan model deep learning

4. EVALUASI:
   - Gunakan cross-validation untuk evaluasi yang robust
   - Implementasikan metrics tambahan (F1-score, precision, recall)
   - Analisis error untuk improvement

5. VISUALISASI:
   - Tambahkan interactive plots dengan Plotly
   - Word cloud untuk setiap aspek
   - Time series analysis jika ada data temporal

6. SKALABILITAS:
   - Implementasi parallel processing
   - Database integration untuk dataset besar
   - API endpoint untuk real-time prediction
"""

print(saran_optimasi)

In [None]:
# OPTIMASI TAMBAHAN DAN FITUR LANJUTAN
# Untuk meningkatkan performa dan akurasi analisis

import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import f1_score, precision_score, recall_score
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import time
from tqdm import tqdm
import multiprocessing as mp
from functools import partial

In [None]:
# =======================
# 1. OPTIMASI LDA DENGAN COHERENCE SCORE
# =======================

def optimize_lda_topics(texts, dictionary, corpus, topic_range=range(5, 16)):
    """Train one LDA model per candidate topic count and pick the most coherent.

    Args:
        texts: Tokenized documents, passed to ``CoherenceModel``.
        dictionary: Dictionary used as ``id2word`` and for coherence.
        corpus: Bag-of-words corpus used for training and for the bound.
        topic_range: Iterable of candidate topic counts (any iterable,
            including a generator).

    Returns:
        Tuple ``(optimal_topics, coherence_scores, perplexity_scores)``.
        ``optimal_topics`` is the candidate with the highest c_v coherence;
        ``perplexity_scores`` holds the raw values returned by
        ``LdaModel.log_perplexity`` (a per-word likelihood bound, not the
        perplexity itself), in candidate order.

    Raises:
        ValueError: If ``topic_range`` contains no candidates.

    Side effects:
        Prints progress, shows a two-panel plot and saves it to
        'lda_optimization.png'.
    """
    # Materialize once: the candidates are iterated, indexed and plotted,
    # which a one-shot iterable could not support.
    topic_counts = list(topic_range)
    if not topic_counts:
        raise ValueError("topic_range must contain at least one candidate number of topics")

    print("Optimizing LDA topic numbers...")

    coherence_scores = []
    perplexity_scores = []

    for num_topics in tqdm(topic_counts, desc="Testing topic numbers"):
        # Train LDA model
        lda_model = models.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=42,
            passes=10,
            alpha='auto'
        )

        # Calculate coherence
        coherence_model_lda = CoherenceModel(
            model=lda_model,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v'
        )
        coherence_score = coherence_model_lda.get_coherence()
        coherence_scores.append(coherence_score)

        # log_perplexity() returns the per-word likelihood bound
        perplexity = lda_model.log_perplexity(corpus)
        perplexity_scores.append(perplexity)

        print(f"Topics: {num_topics}, Coherence: {coherence_score:.4f}, Log perplexity: {perplexity:.4f}")

    # Candidate with the highest coherence
    optimal_topics = topic_counts[int(np.argmax(coherence_scores))]

    # Plot results
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    ax1.plot(topic_counts, coherence_scores, 'bo-')
    ax1.set_xlabel('Number of Topics')
    ax1.set_ylabel('Coherence Score')
    ax1.set_title('Coherence Score vs Number of Topics')
    ax1.axvline(x=optimal_topics, color='red', linestyle='--', label=f'Optimal: {optimal_topics}')
    ax1.legend()

    # Labelled as the log bound, because that is what is plotted
    ax2.plot(topic_counts, perplexity_scores, 'ro-')
    ax2.set_xlabel('Number of Topics')
    ax2.set_ylabel('Log Perplexity (per-word bound)')
    ax2.set_title('Log Perplexity vs Number of Topics')

    plt.tight_layout()
    plt.savefig('lda_optimization.png', dpi=300, bbox_inches='tight')
    plt.show()

    return optimal_topics, coherence_scores, perplexity_scores


In [None]:
# =======================
# 2. ENHANCED LEXICON SENTIMENT ANALYSIS
# =======================

class EnhancedSentimentAnalyzer:
    """Lexicon-based sentiment scorer with domain terms, weights and phrases.

    The supplied lexicons are extended with pesantren-specific terms. Some
    terms carry weight 2 instead of 1. Lexicon entries that span several
    words (e.g. 'luar biasa', 'tidak baik') are matched as phrases, longest
    match first, so 'tidak baik' counts as one negative hit rather than a
    positive hit on 'baik'.
    """

    def __init__(self, positive_words, negative_words):
        """Build the combined lexicons and the weight tables.

        Args:
            positive_words: Iterable of positive terms.
            negative_words: Iterable of negative terms.
        """
        self.positive_words = set(positive_words)
        self.negative_words = set(negative_words)

        # Additional positive terms for the pesantren domain
        self.pesantren_positive = {
            'baik', 'bagus', 'hebat', 'luar biasa', 'excellent', 'terbaik',
            'berkualitas', 'memuaskan', 'sempurna', 'istimewa', 'mantap',
            'berkah', 'barokah', 'mulia', 'terpuji', 'amanah', 'sholeh',
            'islami', 'religius', 'spiritual', 'mendidik', 'bermanfaat'
        }

        # Additional negative terms for the pesantren domain
        self.pesantren_negative = {
            'buruk', 'jelek', 'tidak baik', 'mengecewakan', 'kurang',
            'minim', 'terbatas', 'kotor', 'rusak', 'tidak terawat',
            'tidak disiplin', 'tidak tertib', 'chaos', 'berantakan'
        }

        # Merge with the main lexicons
        self.positive_words.update(self.pesantren_positive)
        self.negative_words.update(self.pesantren_negative)

        # Per-term weights; any term not listed weighs 1
        self.positive_weights = defaultdict(lambda: 1)
        self.negative_weights = defaultdict(lambda: 1)

        # Terms with a higher weight
        high_weight_positive = ['excellent', 'luar biasa', 'terbaik', 'sempurna', 'istimewa']
        high_weight_negative = ['buruk', 'mengecewakan', 'tidak baik', 'jelek']

        for word in high_weight_positive:
            self.positive_weights[word] = 2
        for word in high_weight_negative:
            self.negative_weights[word] = 2

        # Length, in words, of the longest lexicon entry; bounds the phrase
        # window in _score_tokens. Non-string entries (e.g. NaN read from a
        # CSV) are ignored here.
        self._max_phrase_len = max(
            1,
            max(
                (len(entry.split())
                 for entry in self.positive_words | self.negative_words
                 if isinstance(entry, str)),
                default=1,
            ),
        )

    def _score_tokens(self, tokens):
        """Return ``(positive_score, negative_score)`` for a token list.

        At each position the longest window (up to the longest lexicon
        entry) whose space-joined text is a lexicon entry is scored and
        consumed; an entry present in both lexicons adds to both scores.
        Positions with no match advance by one token.
        """
        pos_score = 0
        neg_score = 0
        max_len = getattr(self, '_max_phrase_len', 1)
        n_tokens = len(tokens)
        i = 0
        while i < n_tokens:
            consumed = 1
            for size in range(min(max_len, n_tokens - i), 0, -1):
                candidate = ' '.join(tokens[i:i + size])
                is_positive = candidate in self.positive_words
                is_negative = candidate in self.negative_words
                if is_positive or is_negative:
                    if is_positive:
                        pos_score += self.positive_weights[candidate]
                    if is_negative:
                        neg_score += self.negative_weights[candidate]
                    consumed = size
                    break
            i += consumed
        return pos_score, neg_score

    def analyze_sentiment_enhanced(self, text):
        """Classify ``text`` and report a confidence value.

        Args:
            text: Text to classify; missing or empty input yields
                ``('netral', 0.0)``. Non-strings are converted with ``str()``.

        Returns:
            Tuple ``(label, confidence)`` where label is 'positif', 'negatif'
            or 'netral'. Confidence is the winning score's share of the
            total score, and 0.0 for 'netral'.
        """
        if pd.isna(text) or text == "":
            return 'netral', 0.0

        tokens = word_tokenize(str(text).lower())

        # Weighted scores, with multi-word entries matched as phrases
        pos_score, neg_score = self._score_tokens(tokens)

        # Normalize by the number of tokens
        text_length = len(tokens)
        if text_length > 0:
            pos_score = pos_score / text_length
            neg_score = neg_score / text_length

        # Decide the label and its confidence
        if pos_score > neg_score:
            confidence = pos_score / (pos_score + neg_score) if (pos_score + neg_score) > 0 else 0
            return 'positif', confidence
        elif neg_score > pos_score:
            confidence = neg_score / (pos_score + neg_score) if (pos_score + neg_score) > 0 else 0
            return 'negatif', confidence
        else:
            return 'netral', 0.0


In [None]:
# =======================
# 3. PARALLEL PROCESSING UNTUK TEXT REFINEMENT
# =======================

def process_text_batch(texts_batch, refiner):
    """Refine each text of one batch with ``refiner.refine_text``.

    The texts are handled one after another, in order; this function is the
    unit of work handed to each pool worker.

    Returns:
        List of refined texts, in the same order as ``texts_batch``.
    """
    return [refiner.refine_text(item) for item in texts_batch]

def parallel_text_refinement(texts, refiner, n_processes=None):
    """Refine ``texts`` by distributing batches over worker processes.

    Args:
        texts: Iterable of texts (list, Series, ...); copied into a list.
        refiner: Object exposing ``refine_text(text)``. It is sent to the
            workers, so it must be picklable.
        n_processes: Number of workers; defaults to ``min(cpu_count, 4)``.
            It is clamped to the range 1..len(texts).

    Returns:
        List of refined texts in input order; [] for empty input.

    NOTE(review): sending a refiner that holds a loaded (possibly CUDA)
    model to pool workers, and running pools from a notebook on platforms
    that spawn workers, may not work - confirm in the target environment.
    """
    texts = list(texts)
    if not texts:
        # Nothing to do; avoid starting a pool for an empty workload
        return []

    if n_processes is None:
        n_processes = min(mp.cpu_count(), 4)  # Limit to 4 processes
    n_processes = max(1, min(n_processes, len(texts)))

    # Ceiling division yields at most n_processes batches (floor division
    # would leave an extra remainder batch).
    batch_size = -(-len(texts) // n_processes)
    text_batches = [texts[i:i+batch_size] for i in range(0, len(texts), batch_size)]

    print(f"Processing {len(texts)} texts with {n_processes} processes...")

    if n_processes == 1:
        # A single worker gains nothing from a pool; run in this process
        return process_text_batch(texts, refiner)

    # Process in parallel; pool.map preserves batch order
    with mp.Pool(n_processes) as pool:
        batch_results = pool.map(partial(process_text_batch, refiner=refiner), text_batches)

    # Flatten results
    return [refined for batch_result in batch_results for refined in batch_result]


In [None]:
# =======================
# 4. INTERACTIVE VISUALIZATIONS dengan PLOTLY
# =======================

def create_interactive_visualizations(df):
    """Build four interactive Plotly charts and save each as an HTML file.

    Reads the columns ``aspek_predicted``, ``sentimen_predicted``, ``ponpes``
    and ``text_combined`` from ``df``; ``text_length`` is used as the marker
    size of the scatter plot only when that column exists.

    Files written to the current working directory:
        - distribusi_aspek_interactive.html
        - distribusi_sentimen_interactive.html
        - heatmap_aspek_sentimen_interactive.html
        - analisis_multidimensi_interactive.html

    Args:
        df: DataFrame holding the prediction columns listed above.

    Returns:
        None
    """

    # 1. Interactive Aspect Distribution
    # Count once and reuse for x, y and the colour scale.
    aspect_counts = df['aspek_predicted'].value_counts()
    fig1 = px.bar(
        x=aspect_counts.index,
        y=aspect_counts.values,
        title="Distribusi Aspek (Interactive)",
        labels={'x': 'Aspek', 'y': 'Jumlah'},
        color=aspect_counts.values,
        color_continuous_scale='viridis'
    )
    # The Figure methods are update_xaxes/update_yaxes (plural);
    # update_xaxis/update_yaxis do not exist and raise AttributeError.
    fig1.update_xaxes(title_text="Aspek")
    fig1.update_yaxes(title_text="Jumlah")
    fig1.write_html("distribusi_aspek_interactive.html")

    # 2. Interactive Sentiment Pie Chart
    sentiment_counts = df['sentimen_predicted'].value_counts()
    fig2 = px.pie(
        values=sentiment_counts.values,
        names=sentiment_counts.index,
        # color_discrete_map is keyed on the values of `color`; without it
        # the custom palette below is not applied.
        color=sentiment_counts.index,
        title="Distribusi Sentimen (Interactive)",
        color_discrete_map={
            'positif': '#2E8B57',
            'negatif': '#DC143C',
            'netral': '#FF8C00'
        }
    )
    fig2.write_html("distribusi_sentimen_interactive.html")

    # 3. Interactive Heatmap
    aspect_sentiment_crosstab = pd.crosstab(df['aspek_predicted'], df['sentimen_predicted'])
    fig3 = px.imshow(
        aspect_sentiment_crosstab.values,
        labels=dict(x="Sentimen", y="Aspek", color="Jumlah"),
        x=aspect_sentiment_crosstab.columns,
        y=aspect_sentiment_crosstab.index,
        title="Heatmap: Aspek vs Sentimen (Interactive)",
        color_continuous_scale='RdYlBu'
    )
    fig3.write_html("heatmap_aspek_sentimen_interactive.html")

    # 4. Multi-dimensional Analysis
    # Marker size is optional: only use it when the column is available.
    size_column = 'text_length' if 'text_length' in df.columns else None
    fig4 = px.scatter(
        df,
        x='aspek_predicted',
        y='sentimen_predicted',
        size=size_column,
        color='ponpes',
        title="Analisis Multi-dimensi: Aspek vs Sentimen per Pesantren",
        hover_data=['ponpes', 'text_combined']
    )
    fig4.write_html("analisis_multidimensi_interactive.html")

    print("Interactive visualizations saved:")
    print("- distribusi_aspek_interactive.html")
    print("- distribusi_sentimen_interactive.html")
    print("- heatmap_aspek_sentimen_interactive.html")
    print("- analisis_multidimensi_interactive.html")


In [None]:
# =======================
# 5. ADVANCED EVALUATION METRICS
# =======================

def comprehensive_evaluation(y_true, y_pred, labels=None):
    """Compute overall and (optionally) per-class classification metrics.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        labels: Optional sequence of class labels. When given, per-class
            F1, precision and recall are reported for exactly these labels,
            in this order.

    Returns:
        dict: Always contains ``accuracy``, ``f1_macro``, ``f1_weighted``,
        ``precision_macro`` and ``recall_macro``. When ``labels`` is given it
        additionally contains ``f1_<label>``, ``precision_<label>`` and
        ``recall_<label>`` for every label; a label with no samples scores
        0 (``zero_division=0``).
    """
    # Overall metrics
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1_macro': f1_score(y_true, y_pred, average='macro', zero_division=0),
        'f1_weighted': f1_score(y_true, y_pred, average='weighted', zero_division=0),
        'precision_macro': precision_score(y_true, y_pred, average='macro', zero_division=0),
        'recall_macro': recall_score(y_true, y_pred, average='macro', zero_division=0),
    }

    # Per-class metrics
    if labels is not None:
        labels = list(labels)
        # Pass `labels` explicitly: with average=None the scores are otherwise
        # returned in *sorted* label order, which does not match the caller's
        # order (e.g. ['positif', 'negatif', 'netral']) and would attach each
        # score to the wrong class name.
        f1_per_class = f1_score(y_true, y_pred, labels=labels, average=None, zero_division=0)
        precision_per_class = precision_score(y_true, y_pred, labels=labels, average=None, zero_division=0)
        recall_per_class = recall_score(y_true, y_pred, labels=labels, average=None, zero_division=0)

        for i, label in enumerate(labels):
            # float() turns numpy scalars into plain Python floats.
            metrics[f'f1_{label}'] = float(f1_per_class[i])
            metrics[f'precision_{label}'] = float(precision_per_class[i])
            metrics[f'recall_{label}'] = float(recall_per_class[i])

    return metrics

def cross_validation_evaluation(X, y, model, cv=5):
    """Score ``model`` by accuracy using stratified k-fold cross-validation.

    Folds are shuffled with a fixed ``random_state`` of 42.

    Args:
        X: Feature data passed to ``cross_val_score``.
        y: Target labels used for stratification and scoring.
        model: Estimator to evaluate.
        cv: Number of folds.

    Returns:
        dict: ``cv_mean`` and ``cv_std`` of the fold accuracies, plus the
        individual fold accuracies as a list under ``cv_scores``.
    """
    splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    fold_accuracies = cross_val_score(model, X, y, cv=splitter, scoring='accuracy')

    summary = dict(
        cv_mean=fold_accuracies.mean(),
        cv_std=fold_accuracies.std(),
        cv_scores=fold_accuracies.tolist(),
    )
    return summary


In [None]:
# =======================
# 6. AUTOMATED REPORT GENERATION
# =======================

def generate_analysis_report(df, metrics_dict, output_file='analisis_report.html'):
    """Write a standalone HTML report summarising the analysis results.

    The report contains a dataset summary, the sentiment and aspect
    distributions, the supplied evaluation metrics, the ten pesantren with
    the most reviews and a fixed list of recommendations.

    Reads the columns ``ponpes``, ``text_refined``, ``sentimen_predicted``
    and ``aspek_predicted`` from ``df``.

    Args:
        df: DataFrame with the columns listed above.
        metrics_dict: Mapping of metric name to value; floats are rendered
            with four decimals. ``None`` is treated as an empty mapping.
        output_file: Path of the HTML file to write (UTF-8).

    Returns:
        None
    """
    # Local import: only needed here, to escape data-derived text before it
    # is embedded in the HTML.
    from html import escape

    # The pipeline emits Indonesian labels ('positif'/'negatif'/'netral')
    # while the stylesheet classes are English; map both spellings so rows
    # are actually colour-coded. Anything else falls back to 'neutral'.
    sentiment_css = {
        'positif': 'positive', 'positive': 'positive',
        'negatif': 'negative', 'negative': 'negative',
    }

    html_content = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <title>Laporan Analisis Sentimen dan Aspek Pesantren</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 40px; }}
            .header {{ background-color: #2E8B57; color: white; padding: 20px; text-align: center; }}
            .section {{ margin: 20px 0; padding: 15px; border-left: 4px solid #2E8B57; }}
            .metric {{ background-color: #f8f9fa; padding: 10px; margin: 5px 0; border-radius: 5px; }}
            table {{ border-collapse: collapse; width: 100%; }}
            th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
            th {{ background-color: #f2f2f2; }}
            .positive {{ color: #2E8B57; font-weight: bold; }}
            .negative {{ color: #DC143C; font-weight: bold; }}
            .neutral {{ color: #FF8C00; font-weight: bold; }}
        </style>
    </head>
    <body>
        <div class="header">
            <h1>Laporan Analisis Sentimen dan Aspek Ulasan Pesantren</h1>
            <p>Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}</p>
        </div>
        
        <div class="section">
            <h2>Ringkasan Dataset</h2>
            <div class="metric">Total Data: {len(df)}</div>
            <div class="metric">Jumlah Pesantren: {df['ponpes'].nunique()}</div>
            <div class="metric">Rata-rata Panjang Teks: {df['text_refined'].astype(str).apply(len).mean():.1f} karakter</div>
        </div>
        
        <div class="section">
            <h2>Distribusi Sentimen</h2>
            <table>
                <tr><th>Sentimen</th><th>Jumlah</th><th>Persentase</th></tr>
    """

    # Add sentiment distribution
    sentiment_counts = df['sentimen_predicted'].value_counts()
    for sentiment, count in sentiment_counts.items():
        percentage = (count / len(df)) * 100
        sentiment_text = str(sentiment)
        css_class = sentiment_css.get(sentiment_text.lower(), 'neutral')
        html_content += f'<tr><td class="{css_class}">{escape(sentiment_text.capitalize())}</td><td>{count}</td><td>{percentage:.1f}%</td></tr>'

    html_content += """
            </table>
        </div>
        
        <div class="section">
            <h2>Distribusi Aspek</h2>
            <table>
                <tr><th>ID</th><th>Aspek</th><th>Jumlah</th><th>Persentase</th></tr>
    """

    # Add aspect distribution
    aspect_labels = {
        0: "Kualitas Guru", 1: "Fasilitas", 2: "Lingkungan", 3: "Kegiatan Pondok",
        4: "Pembinaan Karakter", 5: "Prestasi", 6: "Akademik", 7: "Motivasi/Spiritual",
        8: "Sosial", 9: "Umum"
    }

    aspect_counts = df['aspek_predicted'].value_counts().sort_index()
    for aspect_id, count in aspect_counts.items():
        percentage = (count / len(df)) * 100
        # Unknown IDs get a generic "Aspek <id>" name.
        aspect_name = aspect_labels.get(aspect_id, f"Aspek {aspect_id}")
        html_content += f'<tr><td>{escape(str(aspect_id))}</td><td>{escape(aspect_name)}</td><td>{count}</td><td>{percentage:.1f}%</td></tr>'

    html_content += """
            </table>
        </div>
        
        <div class="section">
            <h2>Metrics Evaluasi</h2>
    """

    # Add evaluation metrics if available
    for metric_name, metric_value in (metrics_dict or {}).items():
        safe_name = escape(str(metric_name))
        if isinstance(metric_value, float):
            html_content += f'<div class="metric">{safe_name}: {metric_value:.4f}</div>'
        else:
            html_content += f'<div class="metric">{safe_name}: {escape(str(metric_value))}</div>'

    html_content += """
        </div>
        
        <div class="section">
            <h2>Top Pesantren (Berdasarkan Jumlah Ulasan)</h2>
            <table>
                <tr><th>Nama Pesantren</th><th>Jumlah Ulasan</th></tr>
    """

    # Add top pesantren
    top_pesantren = df['ponpes'].value_counts().head(10)
    for pesantren, count in top_pesantren.items():
        # Names come straight from the dataset, so escape them.
        html_content += f'<tr><td>{escape(str(pesantren))}</td><td>{count}</td></tr>'

    html_content += """
            </table>
        </div>
        
        <div class="section">
            <h2>Rekomendasi</h2>
            <ul>
                <li>Lakukan monitoring berkala untuk tren sentimen</li>
                <li>Fokus pada aspek dengan sentimen negatif tinggi</li>
                <li>Tingkatkan kualitas pada aspek yang sering dikomentari</li>
                <li>Manfaatkan feedback positif untuk promosi</li>
            </ul>
        </div>
    </body>
    </html>
    """

    # Save report
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html_content)

    print(f"Comprehensive report saved to: {output_file}")


In [None]:
# =======================
# 7. MAIN EXECUTION FUNCTION
# =======================

def run_enhanced_analysis(df, positive_words, negative_words):
    """Run the enhanced sentiment pipeline end to end and persist the results.

    Steps: lexicon-based enhanced sentiment scoring, interactive charts,
    evaluation against reference columns (when present), HTML report, and
    CSV/JSON export.

    ``df`` is modified in place: the columns ``sentimen_enhanced`` and
    ``sentiment_confidence`` are added. It must contain ``text_refined`` as
    well as the columns read by ``create_interactive_visualizations`` and
    ``generate_analysis_report``. Evaluation uses ``sentimen_lexicon`` and
    ``aspek_manual`` as references only if those columns exist.

    Files written to the current working directory:
        - hasil_analisis_enhanced.csv
        - enhanced_metrics.json
        - analisis_report.html (plus the interactive chart HTML files)

    Args:
        df: Review DataFrame (see required columns above).
        positive_words: Positive lexicon passed to ``EnhancedSentimentAnalyzer``.
        negative_words: Negative lexicon passed to ``EnhancedSentimentAnalyzer``.

    Returns:
        tuple: ``(df, evaluation_metrics)`` - the same DataFrame object with
        the added columns, and a dict of metrics (empty when no reference
        columns are present).
    """
    import json

    print("=== ENHANCED ANALYSIS STARTED ===\n")

    # 1. Enhanced Sentiment Analysis
    print("1. Running Enhanced Sentiment Analysis...")
    enhanced_analyzer = EnhancedSentimentAnalyzer(positive_words, negative_words)

    # Each result is a (label, confidence) pair.
    sentiment_results = df['text_refined'].apply(enhanced_analyzer.analyze_sentiment_enhanced)
    df['sentimen_enhanced'] = [result[0] for result in sentiment_results]
    df['sentiment_confidence'] = [result[1] for result in sentiment_results]

    # 2. Create Interactive Visualizations
    print("2. Creating Interactive Visualizations...")
    create_interactive_visualizations(df)

    # 3. Comprehensive Evaluation
    print("3. Running Comprehensive Evaluation...")
    evaluation_metrics = {}

    if 'sentimen_lexicon' in df.columns:
        sentiment_metrics = comprehensive_evaluation(
            df['sentimen_lexicon'],
            df['sentimen_enhanced'],
            labels=['positif', 'negatif', 'netral']
        )
        evaluation_metrics.update({f'sentiment_{k}': v for k, v in sentiment_metrics.items()})

    if 'aspek_manual' in df.columns:
        aspek_manual = df['aspek_manual']
        # Convert to numeric if needed
        if aspek_manual.dtype == 'object':
            # Prefer a by-value conversion: LabelEncoder ranks strings
            # lexicographically ("10" sorts before "2"), so numeric IDs stored
            # as text would be mapped to the wrong aspect codes.
            numeric_ids = pd.to_numeric(aspek_manual, errors='coerce')
            if numeric_ids.notna().all() and (numeric_ids % 1 == 0).all():
                aspek_manual_numeric = numeric_ids.astype(int)
            else:
                # NOTE(review): for non-numeric labels the codes follow
                # alphabetical order and are not guaranteed to match the
                # aspect IDs in `aspek_predicted` - confirm the label format.
                le = LabelEncoder()
                aspek_manual_numeric = le.fit_transform(aspek_manual.astype(str))
        else:
            aspek_manual_numeric = aspek_manual

        aspect_metrics = comprehensive_evaluation(
            aspek_manual_numeric,
            df['aspek_predicted'],
            labels=list(range(10))
        )
        evaluation_metrics.update({f'aspect_{k}': v for k, v in aspect_metrics.items()})

    # 4. Generate Comprehensive Report
    print("4. Generating Comprehensive Report...")
    generate_analysis_report(df, evaluation_metrics)

    # 5. Save Enhanced Results
    print("5. Saving Enhanced Results...")
    df.to_csv('hasil_analisis_enhanced.csv', index=False)

    # Save metrics
    with open('enhanced_metrics.json', 'w', encoding='utf-8') as f:
        json.dump(evaluation_metrics, f, indent=2, ensure_ascii=False)

    print("\n=== ENHANCED ANALYSIS COMPLETED ===")
    print("Output files:")
    print("- hasil_analisis_enhanced.csv")
    print("- enhanced_metrics.json")
    print("- analisis_report.html")
    print("- Interactive HTML visualizations")

    return df, evaluation_metrics

# Example usage (uncomment to run):
# if __name__ == "__main__":
#     # Load your data
#     df = pd.read_csv('hasil_sentimen_pesantren.csv')
#     positive_words = pd.read_csv('positive.csv')['word'].tolist()
#     negative_words = pd.read_csv('negative.csv')['word'].tolist()
#     
#     # Run enhanced analysis
#     enhanced_df, metrics = run_enhanced_analysis(df, positive_words, negative_words)