<a href="https://colab.research.google.com/github/Daalleee/Natural-Language-Processing-NLP-/blob/main/Pertemuan_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict, Counter
import math

# Fetch the Punkt tokenizer data (only needs to happen once per environment)
nltk.download('punkt')
nltk.download('punkt_tab')

# Training corpus
corpus = [
    "I love cats",
    "I love dogs",
    "I hate spiders",
    "She loves cats",
    "He loves dogs",
    "I love natural language processing"
]

# Lowercase every sentence, tokenize it, and flatten the results into a
# single token stream shared by the counting cells below.
tokens = [
    token
    for sentence in corpus
    for token in word_tokenize(sentence.lower())
]

print("Tokens:", tokens)


Tokens: ['i', 'love', 'cats', 'i', 'love', 'dogs', 'i', 'hate', 'spiders', 'she', 'loves', 'cats', 'he', 'loves', 'dogs', 'i', 'love', 'natural', 'language', 'processing']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# How often each individual token occurs
unigram_counts = Counter(tokens)

# How often each pair of adjacent tokens occurs. `tokens` is one flat stream,
# so pairs that straddle two corpus sentences (e.g. ('cats', 'i')) are counted.
bigram_counts = defaultdict(int)
for bigram in zip(tokens, tokens[1:]):
    bigram_counts[bigram] += 1

print("Unigram:", dict(unigram_counts))
print("Bigram:", dict(bigram_counts))


Unigram: {'i': 4, 'love': 3, 'cats': 2, 'dogs': 2, 'hate': 1, 'spiders': 1, 'she': 1, 'loves': 2, 'he': 1, 'natural': 1, 'language': 1, 'processing': 1}
Bigram: {('i', 'love'): 3, ('love', 'cats'): 1, ('cats', 'i'): 1, ('love', 'dogs'): 1, ('dogs', 'i'): 2, ('i', 'hate'): 1, ('hate', 'spiders'): 1, ('spiders', 'she'): 1, ('she', 'loves'): 1, ('loves', 'cats'): 1, ('cats', 'he'): 1, ('he', 'loves'): 1, ('loves', 'dogs'): 1, ('love', 'natural'): 1, ('natural', 'language'): 1, ('language', 'processing'): 1}


In [None]:
def get_bigram_prob(word_prev, word_curr, unigram_counts, bigram_counts):
    """Return the maximum-likelihood bigram probability P(word_curr | word_prev).

    The estimate is C(word_prev, word_curr) / C(word_prev).

    Args:
        word_prev: The conditioning (previous) token.
        word_curr: The token whose probability is requested.
        unigram_counts: Mapping from token to its count.
        bigram_counts: Mapping from (previous, current) token pairs to counts.

    Returns:
        The probability as a float, or 0.0 when ``word_prev`` has a zero or
        missing count.
    """
    # Read with .get() rather than indexing: indexing a defaultdict silently
    # inserts a zero-count entry for every unseen pair that is queried, and
    # indexing a plain dict raises KeyError.
    denominator = unigram_counts.get(word_prev, 0)
    if denominator == 0:
        return 0.0
    return bigram_counts.get((word_prev, word_curr), 0) / denominator

# Example usage: two continuations seen after 'love' and one unseen word
love_examples = [
    ("P(cats | love) =", 'cats'),
    ("P(dogs | love) =", 'dogs'),
    ("P(pizza | love) =", 'pizza'),
]
for example_label, example_word in love_examples:
    print(example_label, get_bigram_prob('love', example_word, unigram_counts, bigram_counts))


P(cats | love) = 0.3333333333333333
P(dogs | love) = 0.3333333333333333
P(pizza | love) = 0.0


In [None]:
def predict_next_word(prev_word, vocab, unigram_counts, bigram_counts, top_k=5):
    """Rank vocabulary words by how likely they are to follow ``prev_word``.

    Args:
        prev_word: The token that precedes the word being predicted.
        vocab: Iterable of candidate next words.
        unigram_counts: Mapping from token to its count.
        bigram_counts: Mapping from (previous, current) token pairs to counts.
        top_k: Maximum number of candidates to return. Defaults to 5;
            ``None`` returns every candidate with non-zero probability.

    Returns:
        A list of ``(word, probability)`` tuples with probability > 0, sorted
        from most to least likely. Ties keep the order in which the words
        appear in ``vocab`` because the sort is stable.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k is not None and top_k < 0:
        raise ValueError("top_k must be non-negative or None")

    scored = (
        (word, get_bigram_prob(prev_word, word, unigram_counts, bigram_counts))
        for word in vocab
    )
    # A dict keeps one entry per word even if `vocab` contains duplicates.
    candidates = dict((word, prob) for word, prob in scored if prob > 0)

    # Sort candidates from the highest probability down
    ranked = sorted(candidates.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_k]

# Candidate next words: every distinct token seen in training
vocab = list(unigram_counts)
predictions = predict_next_word(
    'love',
    vocab,
    unigram_counts,
    bigram_counts,
)
print("Prediksi setelah 'love':", predictions)


Prediksi setelah 'love': [('cats', 0.3333333333333333), ('dogs', 0.3333333333333333), ('natural', 0.3333333333333333)]


## Soal 1: Hitung probabilitas bigram

In [None]:
# Exercise 1: bigram probabilities for three (previous, current) pairs
soal1_queries = [
    ("P(love | i) =", 'i', 'love'),
    ("P(hate | i) =", 'i', 'hate'),
    ("P(spiders | hate) =", 'hate', 'spiders'),
]
for soal1_label, soal1_prev, soal1_curr in soal1_queries:
    print(soal1_label, get_bigram_prob(soal1_prev, soal1_curr, unigram_counts, bigram_counts))


P(love | i) = 0.75
P(hate | i) = 0.25
P(spiders | hate) = 1.0


## Soal 2: Model trigram

In [None]:
# Count every run of three consecutive tokens in the flat token stream
trigram_counts = defaultdict(int)
for trigram in zip(tokens, tokens[1:], tokens[2:]):
    trigram_counts[trigram] += 1

def get_trigram_prob(w1, w2, w3, bigram_counts, trigram_counts):
    """Return the maximum-likelihood trigram probability P(w3 | w1, w2).

    The estimate is C(w1, w2, w3) / C(w1, w2).

    Args:
        w1: First token of the two-word history.
        w2: Second token of the two-word history.
        w3: The token whose probability is requested.
        bigram_counts: Mapping from (w1, w2) token pairs to counts.
        trigram_counts: Mapping from (w1, w2, w3) token triples to counts.

    Returns:
        The probability as a float, or 0.0 when the history ``(w1, w2)`` has
        a zero or missing count.
    """
    # Read with .get() rather than indexing: indexing a defaultdict silently
    # inserts a zero-count entry for every unseen n-gram that is queried, and
    # indexing a plain dict raises KeyError.
    denominator = bigram_counts.get((w1, w2), 0)
    if denominator == 0:
        return 0.0
    return trigram_counts.get((w1, w2, w3), 0) / denominator

# Example: two continuations of the history ('i', 'love')
trigram_examples = [
    ("P(cats | i,love) =", "cats"),
    ("P(dogs | i,love) =", "dogs"),
]
for trigram_label, trigram_word in trigram_examples:
    print(trigram_label, get_trigram_prob("i", "love", trigram_word, bigram_counts, trigram_counts))


P(cats | i,love) = 0.3333333333333333
P(dogs | i,love) = 0.3333333333333333


In [None]:
def get_bigram_prob_smoothed(word_prev, word_curr, unigram_counts, bigram_counts, V):
    """Return the add-one (Laplace) smoothed bigram probability P(word_curr | word_prev).

    The estimate is (C(word_prev, word_curr) + 1) / (C(word_prev) + V).

    Args:
        word_prev: The conditioning (previous) token.
        word_curr: The token whose probability is requested.
        unigram_counts: Mapping from token to its count.
        bigram_counts: Mapping from (previous, current) token pairs to counts.
        V: Vocabulary size added to the denominator.

    Returns:
        The smoothed probability as a float.
    """
    # Read with .get() rather than indexing: indexing a defaultdict silently
    # inserts a zero-count entry for every unseen pair that is queried, and
    # indexing a plain dict raises KeyError.
    numerator = bigram_counts.get((word_prev, word_curr), 0) + 1
    denominator = unigram_counts.get(word_prev, 0) + V
    return numerator / denominator

# Vocabulary size: the number of distinct tokens seen in training
V = len(unigram_counts)
print(
    "P(pizza | love) dengan smoothing =",
    get_bigram_prob_smoothed('love', 'pizza', unigram_counts, bigram_counts, V),
)


P(pizza | love) dengan smoothing = 0.06666666666666667


## Soal 3: Perbandingan perplexity dengan smoothing dan tanpa smoothing

In [None]:
def calculate_perplexity(sentence, unigram_counts, bigram_counts, V, smoothed=True):
    """Compute the bigram perplexity of ``sentence`` (base-2).

    The sentence is lowercased and tokenized, every adjacent token pair is
    scored with either the add-one smoothed or the unsmoothed bigram
    probability, and the summed log2-probability is turned into a perplexity.

    Args:
        sentence: The text to evaluate.
        unigram_counts: Mapping from token to its count.
        bigram_counts: Mapping from (previous, current) token pairs to counts.
        V: Vocabulary size used by the smoothed estimate.
        smoothed: Use add-one smoothing when True, the raw estimate otherwise.

    Returns:
        The perplexity as a float. A single-token sentence has no bigram to
        score and yields 1.0.

    Raises:
        ValueError: If the sentence contains no tokens (this used to surface
            as a bare ZeroDivisionError).
    """
    words = word_tokenize(sentence.lower())
    N = len(words)
    if N == 0:
        raise ValueError("Cannot compute perplexity of a sentence with no tokens")

    # Stand-in probability for zero-probability bigrams so log2(0) is avoided
    floor_prob = 1e-10

    log_prob_sum = 0.0
    for prev_word, curr_word in zip(words, words[1:]):
        if smoothed:
            prob = get_bigram_prob_smoothed(prev_word, curr_word, unigram_counts, bigram_counts, V)
        else:
            prob = get_bigram_prob(prev_word, curr_word, unigram_counts, bigram_counts)
        # math.log2 is the dedicated (and usually more accurate) form of log(x, 2)
        log_prob_sum += math.log2(prob if prob > 0 else floor_prob)

    # NOTE(review): the exponent is normalised by the token count N although
    # only N - 1 bigram transitions are scored (there is no sentence-start
    # marker). Kept unchanged so existing results stay comparable; confirm
    # whether N - 1 was intended.
    return 2 ** (-log_prob_sum / N)

# Trial run: score the same sentence without and with smoothing
test_sentence = "I love cats"
pp_unsmoothed, pp_smoothed = (
    calculate_perplexity(test_sentence, unigram_counts, bigram_counts, V, smoothed=use_smoothing)
    for use_smoothing in (False, True)
)
print(f"Perplexity (tanpa smoothing): {pp_unsmoothed:.2f}")
print(f"Perplexity (dengan smoothing): {pp_smoothed:.2f}")


Perplexity (tanpa smoothing): 1.59
Perplexity (dengan smoothing): 3.11


## Soal 4: Ubah nilai V

In [None]:
# Sentences ranging from fully seen to entirely out-of-vocabulary
sentences = [
    "I love cats",
    "I love pizza",
    "I hate cats",
    "Random words xyz abc",
    "nama saya dale"
]
print("\nPerbandingan Perplexity:")
# Score every sentence with the smoothed model, then report them in order
smoothed_perplexities = [
    calculate_perplexity(text, unigram_counts, bigram_counts, V, smoothed=True)
    for text in sentences
]
for sent, pp in zip(sentences, smoothed_perplexities):
    print(f"{sent:20} → {pp:.2f}")



Perbandingan Perplexity:
I love cats          → 3.11
I love pizza         → 3.91
I hate cats          → 4.70
Random words xyz abc → 6.45
nama saya dale       → 5.24


## Soal 5: Implementasi Add-k smoothing

In [None]:
def get_bigram_prob_addk(w_prev, w_curr, unigram_counts, bigram_counts, V, k=0.5):
    """Return the add-k smoothed bigram probability P(w_curr | w_prev).

    The estimate is (C(w_prev, w_curr) + k) / (C(w_prev) + k * V).

    Args:
        w_prev: The conditioning (previous) token.
        w_curr: The token whose probability is requested.
        unigram_counts: Mapping from token to its count.
        bigram_counts: Mapping from (previous, current) token pairs to counts.
        V: Vocabulary size; scaled by ``k`` in the denominator.
        k: Pseudo-count added to every bigram. Defaults to 0.5.

    Returns:
        The smoothed probability as a float.
    """
    # Read with .get() rather than indexing: indexing a defaultdict silently
    # inserts a zero-count entry for every unseen pair that is queried, and
    # indexing a plain dict raises KeyError.
    numerator = bigram_counts.get((w_prev, w_curr), 0) + k
    denominator = unigram_counts.get(w_prev, 0) + k * V
    return numerator / denominator

# Same unseen pair as the add-one example, now with k = 0.5
print(
    "P(pizza | love) dengan Add-k smoothing =",
    get_bigram_prob_addk('love', 'pizza', unigram_counts, bigram_counts, V, k=0.5),
)


P(pizza | love) dengan Add-k smoothing = 0.05555555555555555


In [None]:
# Keep only the words that occur at least `min_freq` times in training
min_freq = 2
vocab_frequent = {
    word
    for word in unigram_counts
    if unigram_counts[word] >= min_freq
}
print("Kosakata umum:", vocab_frequent)

def tokenize_with_unk(sentence, vocab):
    """Tokenize ``sentence`` and replace out-of-vocabulary words with '<UNK>'.

    Args:
        sentence: The text to tokenize; it is lowercased first.
        vocab: Container of known words, checked with ``in``.

    Returns:
        A list of tokens in which every word not found in ``vocab`` has been
        replaced by the string '<UNK>'.
    """
    mapped = []
    for word in word_tokenize(sentence.lower()):
        if word in vocab:
            mapped.append(word)
        else:
            mapped.append('<UNK>')
    return mapped

# Demo: only 'i' and 'love' are frequent enough to survive the mapping
sentence = "I love turtles and xyz"
tokens_unk = tokenize_with_unk(sentence=sentence, vocab=vocab_frequent)
print("Dengan <UNK>:", tokens_unk)


Kosakata umum: {'loves', 'dogs', 'cats', 'love', 'i'}
Dengan <UNK>: ['i', 'love', '<UNK>', '<UNK>', '<UNK>']


In [None]:
# Map every rare training token to '<UNK>' and recount. Plain copies of the
# original counts never contain '<UNK>' and still carry the rare words, so
# the smoothed probabilities would not sum to 1 over the closed vocabulary
# (frequent words + '<UNK>') that V_unk describes.
tokens_train_unk = [word if word in vocab_frequent else '<UNK>' for word in tokens]

unigram_counts_unk = Counter(tokens_train_unk)
bigram_counts_unk = defaultdict(int)
for bigram_unk in zip(tokens_train_unk, tokens_train_unk[1:]):
    bigram_counts_unk[bigram_unk] += 1

vocab_with_unk = list(vocab_frequent) + ['<UNK>']
V_unk = len(vocab_with_unk)

prob_unk = get_bigram_prob_smoothed('love', '<UNK>', unigram_counts_unk, bigram_counts_unk, V_unk)
print(f"P(<UNK> | love) = {prob_unk:.4f}")


P(<UNK> | love) = 0.1111


In [None]:
from nltk.stem import PorterStemmer
# NOTE(review): the stemming below only calls PorterStemmer; the wordnet and
# omw-1.4 downloads are presumably for a lemmatization step elsewhere — confirm.
nltk.download('wordnet')
nltk.download('omw-1.4')

stemmer = PorterStemmer()

# Reduce every training token to its Porter stem
tokens_stemmed = list(map(stemmer.stem, tokens))
print("Stemmed tokens:", tokens_stemmed)
# Expected start of the output: ['i', 'love', 'cat', 'i', 'love', 'dog', ...]


Stemmed tokens: ['i', 'love', 'cat', 'i', 'love', 'dog', 'i', 'hate', 'spider', 'she', 'love', 'cat', 'he', 'love', 'dog', 'i', 'love', 'natur', 'languag', 'process']


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# NOTE(review): this rebuilds the counts from the unstemmed `tokens`, exactly
# like the earlier counting cell; if counts over `tokens_stemmed` were
# intended here, the input needs to change — confirm.

# Token frequencies
unigram_counts = Counter(tokens)

# Frequencies of adjacent token pairs
bigram_counts = defaultdict(int)
for bigram in zip(tokens, tokens[1:]):
    bigram_counts[bigram] += 1

print("Unigram:", dict(unigram_counts))
print("Bigram:", dict(bigram_counts))


Unigram: {'i': 4, 'love': 3, 'cats': 2, 'dogs': 2, 'hate': 1, 'spiders': 1, 'she': 1, 'loves': 2, 'he': 1, 'natural': 1, 'language': 1, 'processing': 1}
Bigram: {('i', 'love'): 3, ('love', 'cats'): 1, ('cats', 'i'): 1, ('love', 'dogs'): 1, ('dogs', 'i'): 2, ('i', 'hate'): 1, ('hate', 'spiders'): 1, ('spiders', 'she'): 1, ('she', 'loves'): 1, ('loves', 'cats'): 1, ('cats', 'he'): 1, ('he', 'loves'): 1, ('loves', 'dogs'): 1, ('love', 'natural'): 1, ('natural', 'language'): 1, ('language', 'processing'): 1}
