<a href="https://colab.research.google.com/github/Daalleee/Natural-Language-Processing-NLP-/blob/main/Pertemuan_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import re
from nltk.tokenize import sent_tokenize
# Ganti 'data.csv' dengan nama file Anda
file_path='dataKlas.csv'
# === LOAD & PERSIAPAN DATA ===
# parsing manual
data = []
with open(file_path, 'r', encoding='utf-8') as f:
    next(f) # skip header
    for line in f:
        line = line.strip().strip('"')
        fields = line.split('\t')
        if len(fields) == 5:
          data.append(fields)
df = pd.DataFrame(data, columns=["text_id", "text_idn",
"text_eng", "sentiment", "emotion"])

In [4]:
# Bersihkan kolom 'emotion' dan 'sentiment'
df['emotion'] = df['emotion'].str.strip().str.lower()
df['sentiment'] = df['sentiment'].str.strip().str.lower()

# Tampilkan label unik
unique_sentiment = df['sentiment'].unique().tolist()
unique_emotion = df['emotion'].unique().tolist()
unique_sentiment, unique_emotion

# === PECAH JADI KALIMAT ===
import re

def split_sentences(text):
    # Pisah berdasarkan titik diikuti spasi atau akhir kalimat
    return [s.strip() for s in re.split(r'[.!?]+(?=\s|$)', text) if s.strip()]

texts_train = []
labels_train = []

for _, row in df.iterrows():
    # Lewati baris yang labelnya kosong
    if pd.isna(row['text_idn']) or pd.isna(row['sentiment']) or pd.isna(row['emotion']):
        continue

    sentences = split_sentences(row['text_idn'])
    for sentence in sentences:
        texts_train.append(sentence)
        label = row['sentiment'] + '+' + row['emotion']
        labels_train.append(label)


In [5]:
import random
import re

# Gabungkan teks dan label jadi satu list
data_pairs = list(zip(texts_train, labels_train))

random.seed(42)
random.shuffle(data_pairs)

# Split 80% train, 20% test
split_idx = int(0.8 * len(data_pairs))
train_data = data_pairs[:split_idx]
test_data = data_pairs[split_idx:]

texts_train_split = [x[0] for x in train_data]
labels_train_split = [x[1] for x in train_data]
texts_test_split = [x[0] for x in test_data]
labels_test_split = [x[1] for x in test_data]

# === TRAINING ===
classes = set(labels_train_split)
class_word_counts = {c: {} for c in classes}
total_words_in_class = {c: 0 for c in classes}
doc_count_per_class = {c: 0 for c in classes}

for text, label in zip(texts_train_split, labels_train_split):
    doc_count_per_class[label] += 1
    words = re.findall(r'\w+', text.lower())
    for w in words:
        class_word_counts[label][w] = class_word_counts[label].get(w, 0) + 1
        total_words_in_class[label] += 1

vocab = set()
for c in classes:
    vocab.update(class_word_counts[c].keys())

V = len(vocab)
total_docs = len(texts_train_split)

class_priors = {c: doc_count_per_class[c] / total_docs for c in classes}

print("Jumlah kelas:", classes)
print("Total kata unik (ukuran vocab):", V)
for c in classes:
    print(f"Kelas {c}: {doc_count_per_class[c]} dokumen, {total_words_in_class[c]} total kata")


Jumlah kelas: {'negative+sad', 'positive+happy', 'neutral+neutral', 'negative+anger', 'positive+love', 'negative+fear'}
Total kata unik (ukuran vocab): 1578
Kelas negative+sad: 93 dokumen, 1259 total kata
Kelas positive+happy: 62 dokumen, 566 total kata
Kelas neutral+neutral: 57 dokumen, 926 total kata
Kelas negative+anger: 84 dokumen, 940 total kata
Kelas positive+love: 50 dokumen, 383 total kata
Kelas negative+fear: 65 dokumen, 744 total kata


In [7]:
import math
import re

def predict(text):
    words = re.findall(r'\w+', text.lower())
    scores = {c: math.log(class_priors[c]) for c in classes}

    for c in classes:
        for w in words:
            count = class_word_counts[c].get(w, 0)
            prob_w_c = (count + 1) / (total_words_in_class[c] + V)
            scores[c] += math.log(prob_w_c)

    return max(scores, key=scores.get)

# Contoh prediksi
text = "kelebihan kamar luas dan ada balkon di setiap kamar yang menghadap kolam renang"
print(predict(text))


negative+fear


In [8]:
# Inisialisasi confusion matrix
confusion = {true: {pred: 0 for pred in classes} for true in classes}

correct = 0
for text, true_label in zip(texts_test_split, labels_test_split):
    pred_label = predict(text)
    confusion[true_label][pred_label] += 1
    if pred_label == true_label:
        correct += 1

# Hitung akurasi
accuracy = correct / len(texts_test_split)
print(f"\nAkurasi pada data uji: {accuracy:.2f}\n")

# Cetak confusion matrix
print("Confusion Matrix:")
header = "\t" + "\t".join(classes)
print(header)

for true in classes:
    row = [str(confusion[true][pred]) for pred in classes]
    print(f"{true}\t" + "\t".join(row))



Akurasi pada data uji: 0.32

Confusion Matrix:
	negative+sad	positive+happy	neutral+neutral	negative+anger	positive+love	negative+fear
negative+sad	9	2	0	2	1	2
positive+happy	4	6	1	0	3	0
neutral+neutral	4	1	4	8	0	0
negative+anger	11	2	2	3	0	6
positive+love	0	0	2	1	6	2
negative+fear	4	5	2	2	3	5


In [9]:
documents = texts_train

# Bangun vocabulary dan representasi dokumen
vocab = {}  # peta kata ke index
word_counts = []  # list of dict untuk hitungan kata per dokumen

for doc in documents:
    words = re.findall(r'\w+', doc.lower())
    # Hitung frekuensi kata di dokumen ini
    freq = {}
    for w in words:
        if w not in vocab:
            vocab[w] = len(vocab)  # tambahkan kata baru ke vocab
        idx = vocab[w]
        # Update hitungan kata (bisa simpan by index atau by word, di sini by index lebih efisien)
        freq[idx] = freq.get(idx, 0) + 1
    word_counts.append(freq)

D = len(documents)  # jumlah dokumen
V = len(vocab)      # ukuran kosakata

print("Total dokumen:", D)
print("Ukuran kosakata:", V)


Total dokumen: 514
Ukuran kosakata: 1863


In [10]:
import numpy as np

# Konversi dokumen-kata menjadi matriks dense numpy (D x V) untuk kemudahan NMF
# Warning: ini bisa besar. Jika V sangat besar, pertimbangkan pakai subset vocab.
D = len(documents)
V = len(vocab)

X = np.zeros((D, V))
for d, freq in enumerate(word_counts):
    for idx, count in freq.items():
        X[d, idx] = count

# Tentukan jumlah topik
K = 5
np.random.seed(0)
W = np.random.rand(D, K)
H = np.random.rand(K, V)

# Fungsi untuk menghitung rekonstruksi (opsional, bisa untuk memonitor error)
def reconstruct_error(X, W, H):
    return np.linalg.norm(X - W.dot(H))

# Iterasi update NMF
for it in range(50):
    # Update H
    numerator = W.T.dot(X)  # shape K x V
    denominator = W.T.dot(W).dot(H)  # shape K x V
    # Tambahkan kecil epsilon pada denominator untuk hindari div zero
    H *= numerator / (denominator + 1e-9)

    # Update W
    numerator = X.dot(H.T)  # shape D x K
    denominator = W.dot(H).dot(H.T)  # shape D x K
    W *= numerator / (denominator + 1e-9)

    if it % 10 == 0:
        err = reconstruct_error(X, W, H)
        print(f"Iterasi {it}, reconstruction error = {err:.2f}")


Iterasi 0, reconstruction error = 83.77
Iterasi 10, reconstruction error = 79.17
Iterasi 20, reconstruction error = 78.67
Iterasi 30, reconstruction error = 78.58
Iterasi 40, reconstruction error = 78.55


In [11]:
# Ambil top 10 kata untuk setiap topik
index_to_word = {idx: w for w, idx in vocab.items()}  # peta balik index ke kata

top_words_per_topic = {}

for k in range(K):
    # Urutkan index kata berdasarkan bobot H[k] dari terbesar ke terkecil
    top_indices = np.argsort(H[k, :])[::-1]  # descending sort
    top_words = [index_to_word[idx] for idx in top_indices[:10]]
    top_words_per_topic[k] = top_words
    print(f"Topik {k}: " + ", ".join(top_words))


Topik 0: saya, ini, kalo, game, bisa, gk, dulu, dari, malah, lagi
Topik 1: nya, juga, ada, jalan, masuk, ga, malah, udah, mod, dan
Topik 2: bisa, ke, dan, bug, sangat, map, gak, bagus, tidak, tapi
Topik 3: di, saya, tapi, perbaiki, yg, bus, bug, itu, pas, ada
Topik 4: yang, aku, ini, dan, jembatan, rawat, aja, w, buk, rasakan


In [12]:
import random

# Persiapkan struktur dokumen -> list of word indices per dokumen
documents_word_indices = []
for freq in word_counts:
    doc_indices = []
    for w_idx, count in freq.items():
        doc_indices.extend([w_idx] * count)
    documents_word_indices.append(doc_indices)

# Parameter LDA
K = 5
alpha = 0.1
beta = 0.01
D = len(documents_word_indices)
V = len(vocab)

# Inisialisasi count
doc_topic_counts = [[0] * K for _ in range(D)]
topic_word_counts = [[0] * V for _ in range(K)]
topic_counts = [0] * K
assignments = [[] for _ in range(D)]

# Random initialize topic for each token
random.seed(42)
for d, word_list in enumerate(documents_word_indices):
    for w_idx in word_list:
        k = random.randrange(K)
        assignments[d].append(k)
        doc_topic_counts[d][k] += 1
        topic_word_counts[k][w_idx] += 1
        topic_counts[k] += 1

# Gibbs sampling
iterations = 10
for it in range(iterations):
    for d, word_list in enumerate(documents_word_indices):
        for i, w_idx in enumerate(word_list):
            old_topic = assignments[d][i]

            # Decrement counts for old_topic
            doc_topic_counts[d][old_topic] -= 1
            topic_word_counts[old_topic][w_idx] -= 1
            topic_counts[old_topic] -= 1

            # Hitung distribusi posterior
            probabilities = []
            for k in range(K):
                p_doc = (doc_topic_counts[d][k] + alpha)
                p_word = (topic_word_counts[k][w_idx] + beta) / (topic_counts[k] + V * beta)
                probabilities.append(p_doc * p_word)

            # Normalisasi
            total_p = sum(probabilities)
            probabilities = [p / total_p for p in probabilities]

            # Sample new topic k berdasarkan distribusi probabilitas
            r = random.random()
            cumulative = 0.0
            new_topic = K - 1
            for k, p in enumerate(probabilities):
                cumulative += p
                if r <= cumulative:
                    new_topic = k
                    break

            # Assign new_topic
            assignments[d][i] = new_topic
            doc_topic_counts[d][new_topic] += 1
            topic_word_counts[new_topic][w_idx] += 1
            topic_counts[new_topic] += 1

    print(f"Iterasi {it + 1} selesai")


Iterasi 1 selesai
Iterasi 2 selesai
Iterasi 3 selesai
Iterasi 4 selesai
Iterasi 5 selesai
Iterasi 6 selesai
Iterasi 7 selesai
Iterasi 8 selesai
Iterasi 9 selesai
Iterasi 10 selesai


In [13]:
# Ekstrak top 10 kata per topik dari hasil LDA
index_to_word = {idx: w for w, idx in vocab.items()}

for k in range(K):
    # Ambil 10 kata dengan count tertinggi di topik k
    top_indices = sorted(range(V), key=lambda v: topic_word_counts[k][v], reverse=True)
    top_words = [index_to_word[idx] for idx in top_indices[:10]]
    print(f"Topik {k}: " + ", ".join(top_words))


Topik 0: yang, dan, ada, tidak, saya, lagi, jalan, lebih, game, ini
Topik 1: di, aplikasi, ini, saya, juga, ada, tapi, ke, aja, sama
Topik 2: gak, nya, malah, dan, update, yg, dari, mau, ya, sekarang
Topik 3: di, nya, bisa, game, saya, tolong, bug, perbaiki, ada, bagus
Topik 4: aku, nya, ini, suka, ga, bagus, sendiri, dan, untuk, sangat
