In [1]:
# Import Library
# !pip install pandas numpy gensim tensorflow scikit-learn Sastrawi matplotlib

import pandas as pd
import numpy as np
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

print("berhasil mengimport library")

berhasil mengimport library


In [4]:
# Load dataset
data = pd.read_excel("2000_Kalimat_Kinerja_Pesantren_Unik.xlsx")
print("Contoh 5 data pertama:")
print(data.head())
print("\nDistribusi Sentimen:")
print(data["Sentimen"].value_counts())

Contoh 5 data pertama:
   No                                            Kalimat                Aspek  \
0   1  Pengajar memiliki kompetensi memberikan perhat...        Kualitas Guru   
1   2  Pengalaman guru masih perlu ditingkatkan. Meng...        Kualitas Guru   
2   3  Beberapa guru masih perlu ditingkatkan. Mengur...        Kualitas Guru   
3   4  Prestasi santri meraih prestasi akademik dan n...             Prestasi   
4   5  Proses belajar mengajar sangat efektif dan men...  Kualitas Pengajaran   

  Sentimen             Pesantren  
0  Positif  As-Salafiyah Parappe  
1  Negatif    Syekh Hasan Yamani  
2  Negatif  As-Salafiyah Parappe  
3  Positif            Al-Ikhlash  
4  Positif            Al-Ikhlash  

Distribusi Sentimen:
Sentimen
Positif    108
Negatif    108
Netral     108
Name: count, dtype: int64


In [5]:
# text processing
# Fungsi cleaning teks
def clean_text(text):
    text = re.sub(r'http\S+|www\S+|@\w+|#\w+', '', text)  # Hapus URL, mention, hashtag
    text = re.sub(r'[^\w\s]', '', text)  # Hapus tanda baca
    text = text.lower()  # Case folding
    
    # Stopword removal
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    text = stopword.remove(text)
    
    # Stemming
    stemmer = StemmerFactory().create_stemmer()
    text = stemmer.stem(text)
    return text

# Apply cleaning
data["Cleaned_Text"] = data["Kalimat"].apply(clean_text)
print("\nContoh teks setelah cleaning:")
print(data[["Kalimat", "Cleaned_Text"]].head())


Contoh teks setelah cleaning:
                                             Kalimat  \
0  Pengajar memiliki kompetensi memberikan perhat...   
1  Pengalaman guru masih perlu ditingkatkan. Meng...   
2  Beberapa guru masih perlu ditingkatkan. Mengur...   
3  Prestasi santri meraih prestasi akademik dan n...   
4  Proses belajar mengajar sangat efektif dan men...   

                                        Cleaned_Text  
0  ajar milik kompetensi beri perhati penuh santr...  
1       alam guru perlu tingkat kurang kualitas ajar  
2   beberapa guru perlu tingkat kurang kualitas ajar  
3  prestasi santri raih prestasi akademik nonakad...  
4  proses ajar ajar sangat efektif tarik ajar lan...  


In [6]:
# pembagian train data dan testing data

# Encoding sentimen (Negatif:0, Netral:1, Positif:2)
sentimen_map = {"Negatif": 0, "Netral": 1, "Positif": 2}
data["Sentimen_Encoded"] = data["Sentimen"].map(sentimen_map)

# Split data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(
    data["Cleaned_Text"], 
    data["Sentimen_Encoded"], 
    test_size=0.2, 
    stratify=data["Sentimen_Encoded"],
    random_state=42
)
print("\nJumlah data training dan testing:")
print(f"Training: {len(X_train)}, Testing: {len(X_test)}")
print("Distribusi label training:")
print(y_train.value_counts())


Jumlah data training dan testing:
Training: 259, Testing: 65
Distribusi label training:
Sentimen_Encoded
0    87
2    86
1    86
Name: count, dtype: int64


In [7]:
# oversampling pada vector
# Gabungkan X_train dan y_train
train_data = pd.DataFrame({"text": X_train, "label": y_train})

# Pisahkan per kelas
df_netral = train_data[train_data["label"] == 1]
df_positif = train_data[train_data["label"] == 2]
df_negatif = train_data[train_data["label"] == 0]

# Oversampling positif & negatif
df_positif_oversampled = resample(df_positif, replace=True, n_samples=len(df_netral), random_state=42)
df_negatif_oversampled = resample(df_negatif, replace=True, n_samples=len(df_netral), random_state=42)

# Gabungkan kembali
train_data_balanced = pd.concat([df_netral, df_positif_oversampled, df_negatif_oversampled])
print("\nDistribusi label setelah oversampling:")
print(train_data_balanced["label"].value_counts())


Distribusi label setelah oversampling:
label
1    86
2    86
0    86
Name: count, dtype: int64
