In [None]:
import nltk

# Fetch the NLTK resources used further down: WordNet for the lemmatizer,
# the Punkt model for word_tokenize, and the English stop-word list.
nltk.download('wordnet')
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the tokenizer from 'punkt_tab' instead of
# the pickled 'punkt' model; download both so word_tokenize works on old and
# new installations alike.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import string
import nltk
from nltk.util import ngrams
from collections import Counter
import math
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict

In [None]:
# A. Preprocess the text
def preprocess_text(text):
    """Lower-case ``text`` and delete every character that is not kept.

    Kept characters are ASCII letters, whitespace and the punctuation marks
    ``. , ; ! ?``. Anything else (digits, quotes, accented letters, ...) is
    removed without inserting a replacement.
    """
    lowered = text.lower()
    disallowed = re.compile(r'[^a-zA-Z\s.,;!?]')
    return disallowed.sub('', lowered)

In [None]:
# B. Tokenization
def tokenize(text):
    """Split ``text`` into tokens by delegating to ``nltk.word_tokenize``."""
    token_list = nltk.word_tokenize(text)
    return token_list

In [None]:
# C. Remove stop words
def remove_stopwords(tokens):
    """Drop every token found in NLTK's English stop-word list.

    Tokens are compared by exact string equality, and the surviving tokens
    keep their original relative order.
    """
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    kept = []
    for token in tokens:
        if token in english_stopwords:
            continue
        kept.append(token)
    return kept

In [None]:
# D. Perform lemmatization
def perform_lemmatization(tokens):
    """Return the WordNet lemma of each token, preserving token order.

    A fresh ``WordNetLemmatizer`` is created per call and ``lemmatize`` is
    invoked with its default arguments for every token.
    """
    lemmatize = nltk.WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tokens))

In [None]:
# E. Build n-gram model and calculate probabilities
def build_ngram_model(tokens, n):
    """Estimate the relative frequency of every n-gram found in ``tokens``.

    Args:
        tokens: Sequence of tokens the n-gram window slides over.
        n: Window size (e.g. 3 for trigrams).

    Returns:
        dict mapping each distinct n-gram to its count divided by the total
        number of n-grams observed. Empty when no n-gram was produced.
    """
    frequency = Counter(ngrams(tokens, n))
    grand_total = sum(frequency.values())
    model = {}
    for gram, occurrences in frequency.items():
        model[gram] = occurrences / grand_total
    return model

In [None]:
# F. Simple probability for prediction
def simple_prob_predict_next_word(input_text, ngram_probabilities, n):
    """Rank candidate next words by the stored probability of their n-gram.

    The input text is run through preprocess_text, tokenize,
    remove_stopwords and perform_lemmatization; its last ``n - 1`` tokens
    form the context that a stored n-gram must start with to be a candidate.

    Args:
        input_text: Raw text whose continuation should be predicted.
        ngram_probabilities: Mapping of n-gram tuple -> probability, as
            returned by ``build_ngram_model``.
        n: Order of the model; must be >= 1.

    Returns:
        List of ``(word, probability)`` pairs sorted by descending
        probability. Empty when no stored n-gram starts with the context.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    tokens = preprocess_text(input_text)
    tokens = tokenize(tokens)
    tokens = remove_stopwords(tokens)
    tokens = perform_lemmatization(tokens)

    # A unigram model has no context. tokens[-(n - 1):] would become
    # tokens[-0:], i.e. *every* token, so n == 1 is handled explicitly.
    ngram_prefix = tuple(tokens[-(n - 1):]) if n > 1 else ()

    predictions = [
        (ngram[-1], prob)
        for ngram, prob in ngram_probabilities.items()
        if ngram[:n - 1] == ngram_prefix
    ]

    predictions.sort(key=lambda pair: pair[1], reverse=True)
    return predictions

# G. Bayesian prediction
def bayesian_predict_next_word(input_text, ngram_probabilities, n):
    """Rank candidate next words using add-one adjusted n-gram scores.

    The input text is run through preprocess_text, tokenize,
    remove_stopwords and perform_lemmatization; its last ``n - 1`` tokens
    form the context. Every stored n-gram that starts with that context is
    scored as ``(prob + 1) / (context_mass + len(ngram_probabilities))``,
    where ``context_mass`` is the summed probability of all matching n-grams.

    Args:
        input_text: Raw text whose continuation should be predicted.
        ngram_probabilities: Mapping of n-gram tuple -> probability, as
            returned by ``build_ngram_model``.
        n: Order of the model; must be >= 1.

    Returns:
        List of ``(word, score)`` pairs sorted by descending score. Empty
        when no stored n-gram starts with the context.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    tokens = preprocess_text(input_text)
    tokens = tokenize(tokens)
    tokens = remove_stopwords(tokens)
    tokens = perform_lemmatization(tokens)

    # A unigram model has no context. tokens[-(n - 1):] would become
    # tokens[-0:], i.e. *every* token, so n == 1 is handled explicitly.
    ngram_prefix = tuple(tokens[-(n - 1):]) if n > 1 else ()

    # Filter n-grams that match the prefix
    relevant_ngrams = {
        ngram: prob
        for ngram, prob in ngram_probabilities.items()
        if ngram[:n - 1] == ngram_prefix
    }

    # The denominator is the same for every candidate, so compute it once.
    denominator = sum(relevant_ngrams.values())
    normaliser = denominator + len(ngram_probabilities)

    # NOTE(review): the +1 is added to a probability, not to a count, so the
    # scores of different candidates differ only marginally and generally do
    # not sum to 1. Confirm whether count-based Laplace smoothing was
    # intended; that would require the raw n-gram counts.
    predictions = [
        (ngram[-1], (prob + 1) / normaliser)
        for ngram, prob in relevant_ngrams.items()
    ]

    predictions.sort(key=lambda pair: pair[1], reverse=True)
    return predictions

In [None]:
# I. Helper that turns predictions into plain text containing only the words
def get_formatted_predictions(predictions, max_words):
    """Join the words of the first ``max_words`` predictions with spaces.

    Args:
        predictions: Sliceable sequence of ``(word, probability)`` pairs.
        max_words: Maximum number of leading predictions to include.

    Returns:
        The selected words separated by single spaces; the probabilities
        are dropped.
    """
    # Keep only the leading slice, then discard the probability of each pair.
    top_words = (word for word, _ in predictions[:max_words])
    return ' '.join(top_words)

## **Buat di uji coba**

In [None]:
# Load and preprocess the text from the file
file_path = 'food.txt'
# Decode explicitly as UTF-8 so the corpus is read the same way on every
# platform instead of depending on the locale's default encoding.
# NOTE(review): assumes food.txt is UTF-8 encoded — confirm.
with open(file_path, 'r', encoding='utf-8') as file:
    text_data = file.read()

# Preprocess the text and tokenize, then drop stop words and lemmatize
preprocessed_text = preprocess_text(text_data)
tokens = tokenize(preprocessed_text)
tokens = remove_stopwords(tokens)
tokens = perform_lemmatization(tokens)

# Build n-gram model (here we use trigrams, n=3)
ngram_probabilities = build_ngram_model(tokens, n=3)

In [None]:
import pickle

# Persist the n-gram probability table to disk with pickle so it can be
# reloaded later without rebuilding the model.
with open('ngram_probabilities.pkl', 'wb') as model_file:
    pickle.dump(ngram_probabilities, model_file)

In [None]:
# Text whose continuation should be predicted
input_text = "my parent told me to eat"
# Upper bound on how many predicted words are shown
max_words = 5

# Rank candidate next words with both predictors (trigram model, n=3)
simple_prob_predictions = simple_prob_predict_next_word(input_text, ngram_probabilities, n=3)
bayesian_predictions = bayesian_predict_next_word(input_text, ngram_probabilities, n=3)

# Reduce each ranking to its top words joined into one string
simple_prob_output = get_formatted_predictions(simple_prob_predictions, max_words)
bayesian_output = get_formatted_predictions(bayesian_predictions, max_words)

# Print the input followed by the predicted words, one section per predictor
for heading, completion in (
    ("\nSimple Probability Predictions:", simple_prob_output),
    ("\nBayesian Predictions:", bayesian_output),
):
    print(heading)
    print(input_text, completion)


Simple Probability Predictions:
my parent told me to eat one couple smaller tart gluten

Bayesian Predictions:
my parent told me to eat one couple smaller tart gluten


In [None]:
# Show each of the top predictions together with its score, one per line
print("\nSimple Probability Predictions:")
print(*(f"{word}: {prob}" for word, prob in simple_prob_predictions[:max_words]), sep="\n")

print("\nBayesian Predictions:")
print(*(f"{word}: {prob}" for word, prob in bayesian_predictions[:max_words]), sep="\n")


Simple Probability Predictions:
one: 1.4487970493219774e-07
couple: 7.243985246609887e-08
smaller: 7.243985246609887e-08
tart: 7.243985246609887e-08
gluten: 7.243985246609887e-08

Bayesian Predictions:
one: 9.324113261848446e-08
couple: 9.324112586411156e-08
smaller: 9.324112586411156e-08
tart: 9.324112586411156e-08
gluten: 9.324112586411156e-08
