# **1. Import Library**

In [47]:
import pandas as pd
import re
import string
import nltk
import numpy as np
import joblib
import emoji
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder




# **2. Load Dataset**

In [48]:
rs = pd.read_csv("dataset/final-dataset.csv", dtype=str) 
rs.head()

Unnamed: 0,Nama,Rating,Ulasan
0,Nia Devi,4 bintang,"Dari sblm lahir, lahiran trs baby juga di rawa..."
1,titania purnamasari,1 bintang,"Gedung elit, parkir di basement sulit. Baru am..."
2,Farid Add,5 bintang,Sungguh luar biasa bagi saya untuk pelayanan t...
3,Susi Mashanafi,1 bintang,"Tes napza disini, udah selesai tinggal nunggu ..."
4,Basuki Dwi Putranto,5 bintang,Selamat atas telah beroperasinya Gedung Baru K...


# **3. Memahami Struktur Data**

In [49]:
rs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3461 entries, 0 to 3460
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Nama    3461 non-null   object
 1   Rating  3461 non-null   object
 2   Ulasan  3461 non-null   object
dtypes: object(3)
memory usage: 81.2+ KB


# **4. Mengecheck data kosong**

In [50]:
#mengecheck data hilang
rs.isnull().sum()

Nama      0
Rating    0
Ulasan    0
dtype: int64

# **5. Ekstraksi fitur dan pelabelan data**

In [51]:
# Download stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\03ann\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\03ann\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [52]:

def clean_rating(rating):
    if pd.isna(rating) or rating.strip() == "":  
        return None
    angka = re.findall(r"\d+", rating) 
    return int(angka[0]) if angka else None 

rs["Rating"] = rs["Rating"].astype(str).apply(clean_rating)


In [54]:
# Pastikan rating dalam bentuk string dulu
rs["Rating"] = rs["Rating"].astype(str)

# Ekstrak angka dari string "5 bintang", "2 bintang", dst
rs["Rating"] = rs["Rating"].str.extract('(\d)').astype(float)

# Label sentimen
def label_sentiment(rating):
    if rating >= 4:
        return "Positif"
    elif rating == 3:
        return "Netral"
    else:
        return "Negatif"

rs["Sentimen"] = rs["Rating"].apply(label_sentiment)


In [55]:
print(rs["Sentimen"].value_counts())

Sentimen
Negatif    1353
Positif    1347
Netral      761
Name: count, dtype: int64


In [56]:
print(rs["Rating"].value_counts())

Rating
1.0    1353
5.0    1012
3.0     761
4.0     335
Name: count, dtype: int64


In [57]:
# Fungsi untuk membersihkan teks
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = emoji.demojize(text, delimiters=(" ", " "))

    # Normalisasi slang / kata tidak baku
    slang_map = {
        "ga": "tidak",
        "gak": "tidak",
        "nggak": "tidak",
        "ngga": "tidak",
        "cuman": "cuma",
        "udh": "sudah",
        "dr": "dari",
        "bgt": "banget",
    }
    for slang, formal in slang_map.items():
        text = re.sub(rf'\b{slang}\b', formal, text)

    # Gabungkan frasa negasi umum
    negation_patterns = [
        (r'\btidak nyaman\b', 'tidak_nyaman'),
        (r'\btidak enak\b', 'tidak_enak'),
        (r'\btidak bagus\b', 'tidak_bagus'),
        (r'\btidak bersih\b', 'tidak_bersih'),
        (r'\btidak ramah\b', 'tidak_ramah'),
        (r'\btidak kelar\b', 'tidak_kelar'),
        (r'\btidak jelas\b', 'tidak_jelas'),
        (r'\bburuk sekali\b', 'buruk_sekali'),
        (r'\bjangan ke sini\b', 'jangan_ke_sini'),
    ]

    for pattern, replacement in negation_patterns:
        text = re.sub(pattern, replacement, text)

    text = re.sub(r'\s+', ' ', text).strip()
    return text


rs["Ulasan_Clean"] = rs["Ulasan"].astype(str).apply(clean_text)
print(rs[["Ulasan", "Ulasan_Clean"]].sample(5))


                                                 Ulasan  \
2662  Pelayanannya buruk sekali, pindah ke gedung ba...   
1054  Pelayanannya buruk sekali, pindah ke gedung ba...   
1063  Kami sadar hanya pasien kelas 3 PBI. Tapi kami...   
214   Pelayanannya buruk sekali, pindah ke gedung ba...   
2644  Selamat atas telah beroperasinya Gedung Baru K...   

                                           Ulasan_Clean  
2662  pelayanannya buruk_sekali pindah ke gedung bar...  
1054  pelayanannya buruk_sekali pindah ke gedung bar...  
1063  kami sadar hanya pasien kelas pbi tapi kami ti...  
214   pelayanannya buruk_sekali pindah ke gedung bar...  
2644  selamat atas telah beroperasinya gedung baru k...  


# *Tokenisasi Teks*

In [58]:
print(rs.columns)


Index(['Nama', 'Rating', 'Ulasan', 'Sentimen', 'Ulasan_Clean'], dtype='object')


In [59]:
# Inisialisasi tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(rs["Ulasan_Clean"]) # Melatih tokenizer
vocab_size = len(tokenizer.word_index) + 1


# Konversi teks menjadi urutan angka
sequences = tokenizer.texts_to_sequences(rs["Ulasan_Clean"])
# Padding sequences agar memiliki panjang yang sama
max_length = 50  # Panjang maksimum ulasan
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding="post", truncating="post", dtype="int32")
vocab_size = 5000
# Cek hasil tokenisasi
print(padded_sequences[:5])

[[  6  63  64  65  66  67  15   2  51   2  68   6  69  37   3  70  71  42
   46  72  73  74  75  76  12  77  78  18  13   4   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  7  79  14   2  80  81  10  82  83  14   2   7  10  56  84   3  53  85
   43  41  14  86  47  57  87  88   4   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [ 26  20  21  89  27  12   9  28   2  22  90  23  29   6  91  54  92  27
   93  94   5  95  96  44  97   9   8  26  20  21  98  99   4   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [100 101 102 103 104 105 106 107   8  30   3 108  30  55  48 109  34   3
  110  15 111 112 113 114 115   4   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [ 58  40 116 117   7  10 118 119  39  49   9   8 120  12 121  59 122   5
   23  28   2 123   5 124   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   

In [60]:
print("Word Index:", tokenizer.word_index)


Word Index: {'<OOV>': 1, 'di': 2, 'tidak': 3, '…': 4, 'dan': 5, 'dari': 6, 'gedung': 7, 'yang': 8, 'pelayanan': 9, 'baru': 10, 'dengan': 11, 'untuk': 12, 'pasien': 13, 'parkir': 14, 'juga': 15, 'kami': 16, 'ada': 17, 'ke': 18, 'hanya': 19, 'luar': 20, 'biasa': 21, 'bagian': 22, 'anak': 23, 'saja': 24, 'antrian': 25, 'sungguh': 26, 'saya': 27, 'terutama': 28, 'karena': 29, 'katanya': 30, 'ramah': 31, 'pelayanannya': 32, 'ruang': 33, 'jam': 34, 'saat': 35, 'staf': 36, 'fasilitas': 37, 'informasi': 38, 'dapat': 39, 'atas': 40, 'petugas': 41, 'beberapa': 42, 'jalan': 43, 'semua': 44, 'ac': 45, 'dokter': 46, 'orang': 47, 'menit': 48, 'memberikan': 49, 'bersih': 50, 'rawat': 51, 'banyak': 52, 'jelas': 53, 'sampai': 54, 'biasanya': 55, 'motor': 56, 'duduk': 57, 'selamat': 58, 'kesehatan': 59, 'perlu': 60, 'sama': 61, 'kelas': 62, 'sblm': 63, 'lahir': 64, 'lahiran': 65, 'trs': 66, 'baby': 67, 'nicu': 68, 'segi': 69, 'ragu': 70, 'cmn': 71, 'residen': 72, 'yg': 73, 'jaga': 74, 'kurang': 75, 'inf

In [61]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000, min_df=1, max_df=0.95,ngram_range=(1,2))  
X_tfidf = tfidf_vectorizer.fit_transform(rs["Ulasan_Clean"])

# Cek ukuran data
print("Shape TF-IDF Matrix:", X_tfidf.shape)
print("Kata-kata dalam TF-IDF:", tfidf_vectorizer.get_feature_names_out()[:50])  



Shape TF-IDF Matrix: (3461, 2350)
Kata-kata dalam TF-IDF: ['ac' 'ac dan' 'ac dengan' 'ac di' 'ac tidak' 'ada' 'ada ac' 'ada akses'
 'ada alarm' 'ada dapur' 'ada dekorasi' 'ada dispenser' 'ada fasilitas'
 'ada hiasan' 'ada jam' 'ada jendela' 'ada kalender' 'ada kamera'
 'ada kolam' 'ada layanan' 'ada lukisan' 'ada meja' 'ada pengharum'
 'ada ramah' 'ada ruang' 'ada sistem' 'ada tangga' 'ada tempat' 'ada unit'
 'ada yang' 'administrasi' 'administrasi cukup' 'agustus' 'agustus sampai'
 'air' 'air galon' 'air hangat' 'air mengalir' 'air mineral' 'air minum'
 'air panas' 'akhir' 'akhir pekan' 'akreditasi' 'akreditasi rutin' 'akses'
 'akses balkon' 'akses lift' 'alami' 'alami yang']


In [62]:
X_tfidf = tfidf_vectorizer.fit_transform(rs["Ulasan_Clean"])
print("Shape TF-IDF Matrix:", X_tfidf.shape)
y = rs["Sentimen"]

Shape TF-IDF Matrix: (3461, 2350)


# **6. Pelatihan machine learning**

In [63]:
# Pembagian data 80/20 dan 70/30
X_train_80, X_test_80, y_train_80, y_test_80 = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
X_train_70, X_test_70, y_train_70, y_test_70 = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

In [64]:
# Skema 1: Random Forest dengan split 80/20
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train_80, y_train_80)
y_pred_rf = model_rf.predict(X_test_80)
print("\n🎯 Random Forest (80/20) Classification Report:")
print(classification_report(y_test_80, y_pred_rf, zero_division=1))


🎯 Random Forest (80/20) Classification Report:
              precision    recall  f1-score   support

     Negatif       0.89      1.00      0.94       265
      Netral       1.00      0.60      0.75       147
     Positif       0.92      1.00      0.96       281

    accuracy                           0.91       693
   macro avg       0.93      0.87      0.88       693
weighted avg       0.92      0.91      0.91       693



In [65]:
# Skema 2: SVM dengan split 70/30
model_svm = SVC(kernel='linear', probability=True)
model_svm.fit(X_train_70, y_train_70)
y_pred_svm = model_svm.predict(X_test_70)
print("\n🎯 SVM (70/30) Classification Report:")
print(classification_report(y_test_70, y_pred_svm, zero_division=1))


🎯 SVM (70/30) Classification Report:
              precision    recall  f1-score   support

     Negatif       0.89      1.00      0.94       388
      Netral       1.00      0.59      0.74       234
     Positif       0.90      1.00      0.95       417

    accuracy                           0.91      1039
   macro avg       0.93      0.86      0.87      1039
weighted avg       0.92      0.91      0.90      1039



# **7. Pelatihan model deep learning**

In [66]:
# Label mapping dari sebelumnya
label_mapping = {'Negatif': 0, 'Netral': 1, 'Positif': 2}
labels = rs["Sentimen"].map(label_mapping).values
labels_encoded = to_categorical(labels, num_classes=3).astype("float32")

X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(
    padded_sequences, labels_encoded, test_size=0.2, random_state=42
)


In [67]:
embedding_dim = 128
lstm_units = 64

model_lstm = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=50),
    LSTM(64, return_sequences=True),
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(3, activation="softmax")
])


model_lstm.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model_lstm.summary()
y_int = rs["Sentimen"].map(label_mapping).values
class_weights = compute_class_weight('balanced', classes=np.unique(y_int), y=y_int)

# Training
model_lstm.fit(X_train_lstm, y_train_lstm, epochs=10, batch_size=32, validation_split=0.2)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 50, 128)           640000    
                                                                 
 lstm_6 (LSTM)               (None, 50, 64)            49408     
                                                                 
 dropout_8 (Dropout)         (None, 50, 64)            0         
                                                                 
 lstm_7 (LSTM)               (None, 64)                33024     
                                                                 
 dropout_9 (Dropout)         (None, 64)                0         
                                                                 
 dense_6 (Dense)             (None, 3)                 195       
                                                                 
Total params: 722627 (2.76 MB)
Trainable params: 72262

<keras.src.callbacks.History at 0x205c690ced0>

In [68]:
y_pred = model_lstm.predict(X_test_lstm)
y_pred_labels = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test_lstm, axis=1)

print(classification_report(y_true, y_pred_labels, target_names=["Negatif", "Netral", "Positif"]))

              precision    recall  f1-score   support

     Negatif       0.89      1.00      0.94       265
      Netral       1.00      0.60      0.75       147
     Positif       0.92      1.00      0.96       281

    accuracy                           0.91       693
   macro avg       0.93      0.87      0.88       693
weighted avg       0.92      0.91      0.91       693



In [69]:
def predict_sentiment(text, tokenizer, model, max_length=50, threshold=0.5):
    # Step 1: Bersihkan teks
    text_clean = clean_text(text)
    
    # Step 2: Tokenisasi & Padding
    seq = tokenizer.texts_to_sequences([text_clean])
    
    if len(seq[0]) == 0:
        return "Teks tidak dikenali oleh model (hasil tokenisasi kosong)."
    
    pad_seq = pad_sequences(seq, maxlen=max_length, padding='post')
    
    # Step 3: Prediksi
    pred = model.predict(pad_seq, verbose=0)
    label_index = np.argmax(pred)
    confidence = np.max(pred)

    # Step 4: Mapping label
    reverse_label_mapping = {0: "Negatif", 1: "Netral", 2: "Positif"}
    predicted_label = reverse_label_mapping[label_index]

    # Step 5: Output + Confidence Check
    if confidence < threshold:
        return f"Model kurang yakin (confidence: {confidence:.2f}). Prediksi: {predicted_label}"
    else:
        return f"{predicted_label} (confidence: {confidence:.2f})"


In [71]:
predict_sentiment("pelayanan bagus staf dan dokternya ramah", tokenizer, model_lstm)
# Output: Positif 


'Positif (confidence: 0.91)'

In [41]:
predict_sentiment("Tidak tersedia layanan antar-jemput", tokenizer, model_lstm)
# Output: Netral 


'Netral (confidence: 1.00)'

In [73]:
predict_sentiment("tidak ada ramah sopan santunnya jutek ketus dan staff di bagian register main game doang", tokenizer, model_lstm)
# Output: Negatif


'Negatif (confidence: 0.92)'

# **8. Export pkl**

In [74]:
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']