<a href="https://colab.research.google.com/github/Daalleee/Natural-Language-Processing-NLP-/blob/main/Pertemuan_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Naive Bayes 1**

In [11]:
from collections import Counter
import math

# Data Latihan
spam_docs = ["Promo besar hari ini diskon 50%!", "Gratis hadiah untuk pelanggan setia!", "Segera klaim hadiah spesial kamu!"]
not_spam_docs = ["Halo, bagaimana kabarmu hari ini?", "Jangan lupa meeting besok pukul 10.00", "Dokumen penting sudah dikirim ke email"]

# Tokenisasi
def tokenize(text):
    return text.lower().replace("!", "").replace("%", "").split()

spam_words = [word for doc in spam_docs for word in tokenize(doc)]
not_spam_words = [word for doc in not_spam_docs for word in tokenize(doc)]

# Hitung Probabilitas
spam_counts = Counter(spam_words)
not_spam_counts = Counter(not_spam_words)
V = len(set(spam_words + not_spam_words))
total_spam = len(spam_words)
total_not_spam = len(not_spam_words)

def get_prob(word, category_counts, total_count):
    return (category_counts.get(word, 0) + 1) / (total_count + V)

# Prediksi Email Baru
email = "Hadiah besar gratis untuk kamu!"
words = tokenize(email)
spam_prob = math.prod([get_prob(word, spam_counts, total_spam) for word in words])
not_spam_prob = math.prod([get_prob(word, not_spam_counts, total_not_spam) for word in words])
print("Spam Probability:", spam_prob)
print("Not Spam Probability:", not_spam_prob)
print("Prediction:", "Spam" if spam_prob > not_spam_prob else "Not Spam")

Spam Probability: 2.0929167208772063e-07
Not Spam Probability: 3.9245856642232505e-09
Prediction: Spam


## **Naive Bayes 2**

In [12]:
import numpy as np
from collections import Counter
import math

# Dataset
texts = [
    "Promo besar hari ini diskon 50%!", "Gratis hadiah untuk pelanggan setia!", "Segera klaim hadiah spesial kamu!", # spam
    "Halo, bagaimana kabarmu hari ini?", "Jangan lupa meeting besok pukul 10.00", "Dokumen penting sudah dikirim ke email" # Not spam
]
labels = ["spam", "spam", "spam", "not spam", "not spam", "not spam"]

# 1. Tokenisasi dan Stopword Removal
stopwords = ["ke"]
def preprocess(text):
    tokens = text.lower().split()
    return [token for token in tokens if token not in stopwords]
processed_texts = [preprocess(text) for text in texts]

# 2. Buat Vocabulary
vocab = {}
index = 0
for text in processed_texts:
    for token in text:
        if token not in vocab:
            vocab[token] = index
            index += 1

# 3. Konversi ke Vektor (Bag of Words)
def text_to_vector(tokens, vocab):
    vector = [0] * len(vocab)
    for token in tokens:
        if token in vocab:
            vector[vocab[token]] += 1
    return vector

X = [text_to_vector(tokens, vocab) for tokens in processed_texts]
y = [1 if label == "not spam" else 0 for label in labels]

class NaiveBayes:
    def __init__(self, alpha=1):
        self.alpha = alpha # Laplace smoothing

    def fit(self, X, y):
        n_samples, n_features = len(X), len(X[0])
        self.classes = np.unique(y)
        n_classes = len(self.classes)
        self.priors = np.zeros(n_classes)
        for c in self.classes:
            self.priors[c] = (sum(y == c) ) / (n_samples )
            #self.priors[c] = (sum(y == c) + self.alpha) / (n_samples + self.alpha * n_classes)
        self.likelihoods = np.zeros((n_classes, n_features))
        for c in self.classes:
            X_c = [X[i] for i in range(n_samples) if y[i] == c]
            total_words_c = sum(sum(x) for x in X_c)
            for j in range(n_features):
                count_j = sum(x[j] for x in X_c)
                self.likelihoods[c][j] = (count_j + self.alpha) / (total_words_c + self.alpha * n_features)
        print(self.likelihoods)

    def predict(self, X):
        predictions = []
        for x in X:
            posteriors = []
            for c in self.classes:
                prior = self.priors[c]
                likelihood = math.prod(self.likelihoods[c][j] for j, val in enumerate(x) if val > 0)
                posteriors.append(prior * likelihood)
            predictions.append(self.classes[np.argmax(posteriors)])
            print(posteriors)
        return predictions

# Contoh Penggunaan
nb = NaiveBayes(alpha=1)
nb.fit(X, y)
test_text = preprocess("Hadiah besar gratis untuk kamu!")
test_vector = text_to_vector(test_text, vocab)
print(nb.predict([test_vector]))

[[0.04347826 0.04347826 0.04347826 0.04347826 0.04347826 0.04347826
  0.04347826 0.06521739 0.04347826 0.04347826 0.04347826 0.04347826
  0.04347826 0.04347826 0.04347826 0.02173913 0.02173913 0.02173913
  0.02173913 0.02173913 0.02173913 0.02173913 0.02173913 0.02173913
  0.02173913 0.02173913 0.02173913 0.02173913 0.02173913 0.02173913]
 [0.02173913 0.02173913 0.04347826 0.02173913 0.02173913 0.02173913
  0.02173913 0.02173913 0.02173913 0.02173913 0.02173913 0.02173913
  0.02173913 0.02173913 0.02173913 0.04347826 0.04347826 0.04347826
  0.04347826 0.04347826 0.04347826 0.04347826 0.04347826 0.04347826
  0.04347826 0.04347826 0.04347826 0.04347826 0.04347826 0.04347826]]
[np.float64(1.1652579733553663e-07), np.float64(2.4276207778236796e-09)]
[np.int64(0)]


## **SVM**

In [13]:
import numpy as np

texts_train = [
    "Promo besar hari ini diskon 50%!", "Gratis hadiah untuk pelanggan setia!", "Segera klaim hadiah spesial kamu!",
    "Halo, bagaimana kabarmu hari ini?", "Jangan lupa meeting besok pukul 10.00", "Dokumen penting sudah dikirim ke email"
]
labels_train = ["spam", "spam", "spam", "not spam", "not spam", "not spam"]
texts_test = [
    "Kamu mendapatkan hadiah besar!", "Besok ada ujian penting!"
]
stopwords = ["ke", "di", "dari", "untuk", "pada"]

def preprocess(text):
    tokens = text.lower().split()
    return [token for token in tokens if token not in stopwords]

processed_train = [preprocess(text) for text in texts_train]
processed_test = [preprocess(text) for text in texts_test]

vocab_train = {}
index = 0
for text in processed_train:
    for token in text:
        if token not in vocab_train:
            vocab_train[token] = index
            index += 1
print(f"Vocabulary dari training: {vocab_train}")

def text_to_vector(tokens, vocab):
    vector = [0] * len(vocab) # Gunakan panjang vocab dari training
    for token in tokens:
        if token in vocab:
            vector[vocab[token]] += 1
    return vector

X_train = np.array([text_to_vector(tokens, vocab_train) for tokens in processed_train])
X_test = np.array([text_to_vector(tokens, vocab_train) for tokens in processed_test]) # Gunakan vocab_train
y = np.array([1 if label == "not spam" else -1 for label in labels_train])

print("\n X_train (BoW dari training data):")
print(X_train)
print("\n X_test (BoW dari testing data):")
print(X_test)

# Inisialisasi bobot dan bias
w = np.zeros(X_train.shape[1]) # Panjang bobot = jumlah fitur
b = 0
eta = 1 # Learning rate
epochs = 5 # Jumlah iterasi pembelajaran

# Training SVM menggunakan SGD dengan Subgradien Hinge Loss
for epoch in range(epochs):
    print(f"\n Epoch {epoch + 1}")
    for i in range(len(X_train)):
        # Mengecek kondisi Hinge Loss: y * (w·x + b) < 1
        if y[i] * (np.dot(w, X_train[i]) + b) < 1:
            # Update bobot jika salah klasifikasi atau berada dalam margin
            w = w + eta * y[i] * X_train[i]
            b = b + eta * y[i]
            print(f" ➜ Update: w = {w}, b = {b}")
        else:
            print(f" ➜ Tidak ada update untuk sampel ke-{i+1}")

# Hasil akhir
print("\nModel SVM selesai dilatih!")
print(f"Bobot akhir: {w}")
print(f"Bias akhir: {b}")

# Contoh Prediksi
def predict(X_test):
    return np.sign(np.dot(X_test, w) + b)

predictions = predict(X_test)
print("\nPrediksi untuk sampel uji:")
print(predictions)
for i in range(len(predictions)):
    label = "Spam" if predictions[i] == 1 else "Not Spam"
    print(f" - Sampel {i+1}: {label}")

Vocabulary dari training: {'promo': 0, 'besar': 1, 'hari': 2, 'ini': 3, 'diskon': 4, '50%!': 5, 'gratis': 6, 'hadiah': 7, 'pelanggan': 8, 'setia!': 9, 'segera': 10, 'klaim': 11, 'spesial': 12, 'kamu!': 13, 'halo,': 14, 'bagaimana': 15, 'kabarmu': 16, 'ini?': 17, 'jangan': 18, 'lupa': 19, 'meeting': 20, 'besok': 21, 'pukul': 22, '10.00': 23, 'dokumen': 24, 'penting': 25, 'sudah': 26, 'dikirim': 27, 'email': 28}

 X_train (BoW dari training data):
[[1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1]]

 X_test (BoW dari testing data):
[[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]]

 Epoch 1
 ➜ Update: w = [

## **Random Forest**

In [14]:
import random

# Dataset
data = [
    {"hadiah": 1, "besar": 1, "gratis": 1, "untukmu": 1, "hari": 1, "ini": 1, "besok": 0, "kita": 0, "ada": 0, "pertemuan": 0, "kampus": 0, "selamat": 0, "anda": 0, "memenangkan": 0, "jangan": 0, "lupa": 0, "tugas": 0, "harus": 0, "dikumpulkan": 0, "terpilih": 0, "spesial": 0, "label": 1},
    {"hadiah": 0, "besar": 0, "gratis": 0, "untukmu": 0, "hari": 0, "ini": 0, "besok": 1, "kita": 1, "ada": 1, "pertemuan": 1, "kampus": 1, "selamat": 0, "anda": 0, "memenangkan": 0, "jangan": 0, "lupa": 0, "tugas": 0, "harus": 0, "dikumpulkan": 0, "terpilih": 0, "spesial": 0, "label": -1},
    {"hadiah": 1, "besar": 1, "gratis": 0, "untukmu": 0, "hari": 0, "ini": 0, "besok": 0, "kita": 0, "ada": 0, "pertemuan": 0, "kampus": 0, "selamat": 1, "anda": 1, "memenangkan": 1, "jangan": 0, "lupa": 0, "tugas": 0, "harus": 0, "dikumpulkan": 0, "terpilih": 0, "spesial": 0, "label": 1},
    {"hadiah": 0, "besar": 0, "gratis": 0, "untukmu": 0, "hari": 0, "ini": 0, "besok": 0, "kita": 0, "ada": 0, "pertemuan": 0, "kampus": 0, "selamat": 0, "anda": 0, "memenangkan": 0, "jangan": 1, "lupa": 1, "tugas": 1, "harus": 1, "dikumpulkan": 1, "terpilih": 0, "spesial": 0, "label": -1},
    {"hadiah": 1, "besar": 0, "gratis": 0, "untukmu": 0, "hari": 0, "ini": 0, "besok": 0, "kita": 0, "ada": 0, "pertemuan": 0, "kampus": 0, "selamat": 0, "anda": 1, "memenangkan": 0, "jangan": 0, "lupa": 0, "tugas": 0, "harus": 0, "dikumpulkan": 0, "terpilih": 1, "spesial": 1, "label": 1}
]

# Fungsi untuk menghitung Gini Impurity
def gini_impurity(data):
    total = len(data)
    if total == 0:
        return 0
    spam_count = sum(1 for row in data if row["label"] == 1)
    not_spam_count = total - spam_count
    p_spam = spam_count / total
    p_not_spam = not_spam_count / total
    return 1 - (p_spam ** 2 + p_not_spam ** 2)

# Fungsi untuk membagi dataset berdasarkan fitur tertentu
def split_data(data, feature):
    left = [row for row in data if row[feature] == 0]
    right = [row for row in data if row[feature] == 1]
    return left, right

# Fungsi untuk menemukan fitur terbaik untuk split
def best_split(data):
    best_feature = None
    best_gini = 1
    best_left, best_right = None, None
    for feature in data[0].keys():
        if feature == "label":
            continue
        left, right = split_data(data, feature)
        gini_left = gini_impurity(left)
        gini_right = gini_impurity(right)
        gini_split = (len(left) / len(data)) * gini_left + (len(right) / len(data)) * gini_right
        if gini_split < best_gini:
            best_gini = gini_split
            best_feature = feature
            best_left, best_right = left, right
    return best_feature, best_left, best_right

# Kelas Decision Tree
class DecisionTree:
    def __init__(self, depth=2):
        self.depth = depth
        self.tree = None

    def build_tree(self, data, depth=0):
        if len(set(row["label"] for row in data)) == 1 or depth >= self.depth:
            return {"prediction": max(set(row["label"] for row in data), key=[row["label"] for row in data].count)}
        feature, left, right = best_split(data)
        if not left or not right:
            return {"prediction": max(set(row["label"] for row in data), key=[row["label"] for row in data].count)}
        return {
            "feature": feature,
            "left": self.build_tree(left, depth + 1),
            "right": self.build_tree(right, depth + 1)
        }

    def fit(self, data):
        self.tree = self.build_tree(data)

    def predict_one(self, row, node):
        if "prediction" in node:
            return node["prediction"]
        if row[node["feature"]] == 0:
            return self.predict_one(row, node["left"])
        else:
            return self.predict_one(row, node["right"])

    def predict(self, data):
        return [self.predict_one(row, self.tree) for row in data]

class RandomForest:
    def __init__(self, n_trees=3, sample_size=0.8, depth=2):
        self.n_trees = n_trees
        self.sample_size = sample_size
        self.depth = depth
        self.trees = []

    def bootstrap_sample(self, data):
        return random.sample(data, int(len(data) * self.sample_size))

    def fit(self, data):
        for _ in range(self.n_trees):
            sample = self.bootstrap_sample(data)
            tree = DecisionTree(depth=self.depth)
            tree.fit(sample)
            self.trees.append(tree)

    def predict_one(self, row):
        predictions = [tree.predict_one(row, tree.tree) for tree in self.trees]
        return max(set(predictions), key=predictions.count)

    def predict(self, data):
        return [self.predict_one(row) for row in data]

rf = RandomForest(n_trees=3, depth=2)
rf.fit(data)

# Data uji
test_sms = {"hadiah": 1, "besar": 0, "gratis": 1, "untukmu": 0, "hari": 0, "ini": 0, "besok": 0, "kita": 0, "ada": 0, "pertemuan": 0, "kampus": 0, "selamat": 0, "anda": 1, "memenangkan": 1, "jangan": 0, "lupa": 0, "tugas": 0, "harus": 0, "dikumpulkan": 0, "terpilih": 0, "spesial": 0}

# Prediksi menggunakan Random Forest
prediction = rf.predict_one(test_sms)
print("Prediksi:", "Spam" if prediction == 1 else "Not Spam")

Prediksi: Spam


## **Logistic Regression**

In [15]:
import math

# Dataset (fitur dan label)
X = [
    [1, 0], # Hadiah besar menanti kamu! → Spam
    [0, 1], # Tugas harus dikumpulkan! → Not Spam
    [1, 0], # Gratis hadiah untukmu! → Spam
    [0, 1], # Jangan lupa tugas kuliah! → Not Spam
    [1, 0], # Selamat! Kamu mendapat hadiah! → Spam
]
y = [1, 0, 1, 0, 1] # 1 = Spam, 0 = Not Spam

# Fungsi sigmoid
def sigmoid(z):
    return 1 / (1 + math.exp(-z))

# Fungsi prediksi
def predict(X, w, b):
    preds = []
    for x in X:
        z = sum(w[i] * x[i] for i in range(len(x))) + b
        preds.append(sigmoid(z))
    return preds

# Fungsi pembaruan bobot dengan Gradient Descent
def train_logistic_regression(X, y, lr=0.1, epochs=100):
    w = [0.0 for _ in range(len(X[0]))] # inisialisasi bobot
    b = 0.0 # bias
    for epoch in range(epochs):
        for i in range(len(X)):
            z = sum(w[j] * X[i][j] for j in range(len(X[0]))) + b
            pred = sigmoid(z)
            error = y[i] - pred
            # update bobot dan bias
            for j in range(len(X[0])):
                w[j] += lr * error * X[i][j]
            b += lr * error
    return w, b

# Training
weights, bias = train_logistic_regression(X, y, lr=0.1, epochs=100)

# Prediksi contoh baru
test_input = [0, 1] # Teks: "Tugas penting menanti kamu"
z = sum(weights[i] * test_input[i] for i in range(len(test_input))) + bias
prob = sigmoid(z)
print("Probabilitas Spam:", round(prob, 4))
print("Prediksi:", "Spam" if prob >= 0.5 else "Not Spam")

Probabilitas Spam: 0.0521
Prediksi: Not Spam


## **BERT dengan Logistic Regression**

In [16]:
import torch
from transformers import AutoTokenizer, AutoModel
import math

# 1. Load IndoBERT (tanpa classifier head)
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
bert = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")

# 2. Dataset: teks dan label
texts = [
    "Hadiah besar menanti kamu!", # Spam
    "Tugas harus dikumpulkan!", # Not Spam
    "Gratis hadiah untukmu!", # Spam
    "Jangan lupa tugas kuliah!", # Not Spam
    "Selamat! Kamu mendapat hadiah!" # Spam
]
labels = [1, 0, 1, 0, 1]

# 3. Fungsi: Ambil embedding [CLS] dari BERT
def get_cls_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = bert(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy() # shape: (768,)
    return cls_embedding

# 4. Ambil semua embedding
X = [get_cls_embedding(text) for text in texts]
y = labels

# 5. Fungsi Sigmoid
def sigmoid(z):
    return 1 / (1 + math.exp(-z))

# 6. Logistic Regression Manual
def train_logistic_regression(X, y, lr=0.01, epochs=10):
    w = [0.0] * len(X[0]) # 768 dimensi
    b = 0.0
    for epoch in range(epochs):
        for i in range(len(X)):
            z = sum(w[j] * X[i][j] for j in range(len(w))) + b
            pred = sigmoid(z)
            error = y[i] - pred
            for j in range(len(w)):
                w[j] += lr * error * X[i][j]
            b += lr * error
    return w, b

# 7. Latih model
weights, bias = train_logistic_regression(X, y)

# 8. Prediksi input baru
def predict(text):
    x = get_cls_embedding(text)
    z = sum(weights[i] * x[i] for i in range(len(x))) + bias
    prob = sigmoid(z)
    return prob, "Spam" if prob >= 0.5 else "Not Spam"

# 9. Uji prediksi
test_text = "Tugas penting menanti kamu!"
prob, label = predict(test_text)
print(f"Probabilitas Spam: {prob:.4f} → Prediksi: {label}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Probabilitas Spam: 0.4866 → Prediksi: Not Spam
