# **1. Import Library**

In [68]:
import pandas as pd
import re
import string
import nltk
import numpy as np
import joblib
import emoji
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder




# **2. Load Dataset**

In [69]:
rs = pd.read_csv("dataset/final-dataset.csv", dtype=str) 
rs.head()

Unnamed: 0,Nama,Rating,Ulasan
0,Nia Devi,4 bintang,"Dari sblm lahir, lahiran trs baby juga di rawa..."
1,titania purnamasari,1 bintang,"Gedung elit, parkir di basement sulit. Baru am..."
2,Farid Add,5 bintang,Sungguh luar biasa bagi saya untuk pelayanan t...
3,Susi Mashanafi,1 bintang,"Tes napza disini, udah selesai tinggal nunggu ..."
4,Basuki Dwi Putranto,5 bintang,Selamat atas telah beroperasinya Gedung Baru K...


# **3. Memahami Struktur Data**

In [70]:
rs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Nama    3000 non-null   object
 1   Rating  3000 non-null   object
 2   Ulasan  3000 non-null   object
dtypes: object(3)
memory usage: 70.4+ KB


# **4. Mengecheck data kosong**

In [71]:
#mengecheck data hilang
rs.isnull().sum()

Nama      0
Rating    0
Ulasan    0
dtype: int64

# **5. Ekstraksi fitur dan pelabelan data**

In [72]:
# Download stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\03ann\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\03ann\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [73]:

def clean_rating(rating):
    if pd.isna(rating) or rating.strip() == "":  
        return None
    angka = re.findall(r"\d+", rating) 
    return int(angka[0]) if angka else None 

rs["Rating"] = rs["Rating"].astype(str).apply(clean_rating)


In [74]:
# Pastikan rating dalam bentuk string dulu
rs["Rating"] = rs["Rating"].astype(str)

# Ekstrak angka dari string "5 bintang", "2 bintang", dst
rs["Rating"] = rs["Rating"].str.extract('(\d)').astype(float)

# Label sentimen
def label_sentiment(rating):
    if rating >= 4:
        return "Positif"
    else:
        return "Negatif"

rs["Sentimen"] = rs["Rating"].apply(label_sentiment)

#data netral atau 3 dijadikan satu dengan negatif karena jumlah terlalu sedikit dan memperngaruhi pelatihan model


In [75]:
print(rs["Sentimen"].value_counts())

Sentimen
Negatif    1653
Positif    1347
Name: count, dtype: int64


In [76]:
print(rs["Rating"].value_counts())

Rating
1.0    1353
5.0    1012
4.0     335
3.0     300
Name: count, dtype: int64


In [77]:
# Fungsi untuk membersihkan teks
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = emoji.demojize(text, delimiters=(" ", " "))

    # Normalisasi slang / kata tidak baku
    slang_map = {
        "ga": "tidak",
        "gak": "tidak",
        "nggak": "tidak",
        "ngga": "tidak",
        "cuman": "cuma",
        "udh": "sudah",
        "dr": "dari",
        "bgt": "banget",
    }
    for slang, formal in slang_map.items():
        text = re.sub(rf'\b{slang}\b', formal, text)

    # Gabungkan frasa negasi umum
    negation_patterns = [
        (r'\btidak nyaman\b', 'tidak_nyaman'),
        (r'\btidak enak\b', 'tidak_enak'),
        (r'\btidak bagus\b', 'tidak_bagus'),
        (r'\btidak bersih\b', 'tidak_bersih'),
        (r'\btidak ramah\b', 'tidak_ramah'),
        (r'\btidak kelar\b', 'tidak_kelar'),
        (r'\btidak jelas\b', 'tidak_jelas'),
        (r'\bburuk sekali\b', 'buruk_sekali'),
        (r'\bjangan ke sini\b', 'jangan_ke_sini'),
    ]

    for pattern, replacement in negation_patterns:
        text = re.sub(pattern, replacement, text)

    text = re.sub(r'\s+', ' ', text).strip()
    return text


rs["Ulasan_Clean"] = rs["Ulasan"].astype(str).apply(clean_text)
print(rs[["Ulasan", "Ulasan_Clean"]].sample(5))


                                                 Ulasan  \
2183  Kami sadar hanya pasien kelas 3 PBI. Tapi kami...   
978   Sungguh luar biasa bagi saya untuk pelayanan t...   
2743  Kami sadar hanya pasien kelas 3 PBI. Tapi kami...   
2826  Sungguh luar biasa bagi saya untuk pelayanan t...   
2662  Pelayanannya buruk sekali, pindah ke gedung ba...   

                                           Ulasan_Clean  
2183  kami sadar hanya pasien kelas pbi tapi kami ti...  
978   sungguh luar biasa bagi saya untuk pelayanan t...  
2743  kami sadar hanya pasien kelas pbi tapi kami ti...  
2826  sungguh luar biasa bagi saya untuk pelayanan t...  
2662  pelayanannya buruk_sekali pindah ke gedung bar...  


# *Tokenisasi Teks*

In [78]:
print(rs.columns)


Index(['Nama', 'Rating', 'Ulasan', 'Sentimen', 'Ulasan_Clean'], dtype='object')


In [79]:
# Inisialisasi tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(rs["Ulasan_Clean"]) # Melatih tokenizer
vocab_size = len(tokenizer.word_index) + 1


# Konversi teks menjadi urutan angka
sequences = tokenizer.texts_to_sequences(rs["Ulasan_Clean"])
# Padding sequences agar memiliki panjang yang sama
max_length = 50  # Panjang maksimum ulasan
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding="post", truncating="post", dtype="int32")
vocab_size = 5000
# Cek hasil tokenisasi
print(padded_sequences[:5])

[[  6  33  34  35  36  37  11   2  38   2  39   6  40  41   3  42  43  44
   45  46  47  48  49  50  12  51  52  17  13   4   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  7  53  14   2  54  55   8  56  57  14   2   7   8  58  59   3  60  61
   62  63  14  64  65  66  67  68   4   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [ 18  19  20  69  21  12   9  22   2  23  70  24  25   6  71  72  73  21
   74  75   5  76  77  78  79   9  10  18  19  20  80  81   4   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [ 82  83  84  85  86  87  88  89  10  26   3  90  26  91  92  93  94   3
   95  11  96  97  98  99 100   4   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [101 102 103 104   7   8 105 106 107 108   9  10 109  12 110 111 112   5
   24  22   2 113   5 114   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   

In [80]:
print("Word Index:", tokenizer.word_index)


Word Index: {'<OOV>': 1, 'di': 2, 'tidak': 3, '…': 4, 'dan': 5, 'dari': 6, 'gedung': 7, 'baru': 8, 'pelayanan': 9, 'yang': 10, 'juga': 11, 'untuk': 12, 'pasien': 13, 'parkir': 14, 'dengan': 15, 'kami': 16, 'ke': 17, 'sungguh': 18, 'luar': 19, 'biasa': 20, 'saya': 21, 'terutama': 22, 'bagian': 23, 'anak': 24, 'karena': 25, 'katanya': 26, 'ramah': 27, 'hanya': 28, 'saja': 29, 'antrian': 30, 'pelayanannya': 31, 'ada': 32, 'sblm': 33, 'lahir': 34, 'lahiran': 35, 'trs': 36, 'baby': 37, 'rawat': 38, 'nicu': 39, 'segi': 40, 'fasilitas': 41, 'ragu': 42, 'cmn': 43, 'beberapa': 44, 'dokter': 45, 'residen': 46, 'yg': 47, 'jaga': 48, 'kurang': 49, 'informatif': 50, 'kasi': 51, 'penjelasan': 52, 'elit': 53, 'basement': 54, 'sulit': 55, 'ambil': 56, 'karcis': 57, 'motor': 58, 'parkirnya': 59, 'jelas': 60, 'ngehalangi': 61, 'jalan': 62, 'petugas': 63, 'cuma': 64, 'orang': 65, 'duduk': 66, 'diplang': 67, 'keluar': 68, 'bagi': 69, 'bedah': 70, 'agustus': 71, 'sampai': 72, 'pebruari': 73, 'merasakan': 7

In [82]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000, min_df=1, max_df=0.95,ngram_range=(1,2))  
X_tfidf = tfidf_vectorizer.fit_transform(rs["Ulasan_Clean"])

# Cek ukuran data
print("Shape TF-IDF Matrix:", X_tfidf.shape)
print("Kata-kata dalam TF-IDF:", tfidf_vectorizer.get_feature_names_out()[:50])  



Shape TF-IDF Matrix: (3000, 353)
Kata-kata dalam TF-IDF: ['ac' 'ac dengan' 'ada' 'ada ramah' 'ada yang' 'agustus' 'agustus sampai'
 'ambil' 'ambil karcis' 'anak' 'anak karena' 'anak terutama' 'anjir'
 'anjir very' 'antrian' 'antrian karena' 'atas' 'atas telah' 'baby'
 'baby juga' 'bagi' 'bagi saya' 'bagian' 'bagian bedah' 'bagian register'
 'bagus' 'bagus staf' 'bahagia' 'bahagia itu' 'bali' 'bali dan' 'banyak'
 'baru' 'baru ambil' 'baru juga' 'baru kia' 'baru motor' 'basement'
 'basement sulit' 'beberapa' 'beberapa dokter' 'bedah' 'bedah anak'
 'benalu' 'benalu kami' 'ber' 'ber ac' 'berkat' 'berkat pelayanan'
 'beroperasinya']


In [83]:
X_tfidf = tfidf_vectorizer.fit_transform(rs["Ulasan_Clean"])
print("Shape TF-IDF Matrix:", X_tfidf.shape)
y = rs["Sentimen"]

Shape TF-IDF Matrix: (3000, 353)


# **6. Pelatihan machine learning**

In [84]:
# Pembagian data 80/20 dan 70/30
X_train_80, X_test_80, y_train_80, y_test_80 = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
X_train_70, X_test_70, y_train_70, y_test_70 = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

In [85]:
# Skema 1: Random Forest dengan split 80/20
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train_80, y_train_80)
y_pred_rf = model_rf.predict(X_test_80)
print("\n🎯 Random Forest (80/20) Classification Report:")
print(classification_report(y_test_80, y_pred_rf, zero_division=1))


🎯 Random Forest (80/20) Classification Report:
              precision    recall  f1-score   support

     Negatif       1.00      0.90      0.95       344
     Positif       0.89      1.00      0.94       256

    accuracy                           0.94       600
   macro avg       0.94      0.95      0.94       600
weighted avg       0.95      0.94      0.95       600



In [86]:
# Skema 2: SVM dengan split 70/30
model_svm = SVC(kernel='linear', probability=True)
model_svm.fit(X_train_70, y_train_70)
y_pred_svm = model_svm.predict(X_test_70)
print("\n🎯 SVM (70/30) Classification Report:")
print(classification_report(y_test_70, y_pred_svm, zero_division=1))


🎯 SVM (70/30) Classification Report:
              precision    recall  f1-score   support

     Negatif       1.00      0.90      0.95       500
     Positif       0.89      1.00      0.94       400

    accuracy                           0.95       900
   macro avg       0.95      0.95      0.95       900
weighted avg       0.95      0.95      0.95       900



# **7. Pelatihan model deep learning**

In [87]:
# Label mapping dari sebelumnya
label_mapping = {'Negatif': 0, 'Positif': 1}
labels = rs["Sentimen"].map(label_mapping).values
labels_encoded = to_categorical(labels, num_classes=3).astype("float32")

X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(
    padded_sequences, labels_encoded, test_size=0.2, random_state=42
)


In [88]:
embedding_dim = 128
lstm_units = 64

model_lstm = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=50),
    LSTM(64, return_sequences=True),
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(3, activation="softmax")
])


model_lstm.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model_lstm.summary()
y_int = rs["Sentimen"].map(label_mapping).values
class_weights = compute_class_weight('balanced', classes=np.unique(y_int), y=y_int)

# Training
model_lstm.fit(X_train_lstm, y_train_lstm, epochs=10, batch_size=32, validation_split=0.2)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 50, 128)           640000    
                                                                 
 lstm_4 (LSTM)               (None, 50, 64)            49408     
                                                                 
 dropout_4 (Dropout)         (None, 50, 64)            0         
                                                                 
 lstm_5 (LSTM)               (None, 64)                33024     
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 3)                 195       
                                                                 
Total params: 722627 (2.76 MB)
Trainable params: 72262

<keras.src.callbacks.History at 0x18fa566be50>

In [89]:
y_pred = model_lstm.predict(X_test_lstm)
y_pred_labels = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test_lstm, axis=1)

print(classification_report(y_true, y_pred_labels, target_names=["Negatif", "Positif"]))



              precision    recall  f1-score   support

     Negatif       1.00      0.90      0.95       344
     Positif       0.89      1.00      0.94       256

    accuracy                           0.94       600
   macro avg       0.94      0.95      0.94       600
weighted avg       0.95      0.94      0.95       600



In [90]:
def predict_sentiment(text, tokenizer, model, max_length=50, threshold=0.5):
    # Step 1: Bersihkan teks
    text_clean = clean_text(text)
    
    # Step 2: Tokenisasi & Padding
    seq = tokenizer.texts_to_sequences([text_clean])
    
    if len(seq[0]) == 0:
        return "Teks tidak dikenali oleh model (hasil tokenisasi kosong)."
    
    pad_seq = pad_sequences(seq, maxlen=max_length, padding='post')
    
    # Step 3: Prediksi
    pred = model.predict(pad_seq, verbose=0)
    label_index = np.argmax(pred)
    confidence = np.max(pred)

    # Step 4: Mapping label
    reverse_label_mapping = {0: "Negatif", 1: "Positif"}
    predicted_label = reverse_label_mapping[label_index]

    # Step 5: Output + Confidence Check
    if confidence < threshold:
        return f"Model kurang yakin (confidence: {confidence:.2f}). Prediksi: {predicted_label}"
    else:
        return f"{predicted_label} (confidence: {confidence:.2f})"


In [91]:
predict_sentiment("pelayanan bagus staf dan dokternya ramah", tokenizer, model_lstm)
# Output: Positif 


'Positif (confidence: 0.92)'

In [106]:
predict_sentiment("tidak ada ramah sopan santunnya jutek ketus dan staff di bagian register main game doang, buruk sekali", tokenizer, model_lstm)
# Output: Negatif


'Negatif (confidence: 1.00)'

# **8. Export pkl**

In [107]:
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']