# 1. Problem

#### *problem disini

# 2. Corpus Resource

#### *problem disini

# 3. Methods

#### *problem disini

# 4. Code

#### *problem disini

In [4]:
import pandas as pd
import re
import nltk
from nltk.util import ngrams
from nltk import pos_tag, word_tokenize
from collections import defaultdict, Counter
import string
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on (tokenizer, POS tagger,
# stop-word list); nltk.download reports when a package is already present.
for _resource in ('punkt', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(_resource)

# Lookup sets consulted by is_valid_token when filtering tokens.
punctuation = set(string.punctuation)
stop_words = set(stopwords.words('english'))

# Load the title data from the CSV in the working directory.
df = pd.read_csv("preprocessed_titles.csv")

def clean_title(title):
    """Normalise a title for tokenisation.

    Removes bracketed tags such as ``[HTML]``, lower-cases the text and
    strips leading/trailing whitespace.

    Args:
        title: The raw title. Usually a string; missing values (``None`` or
            float NaN, as produced by ``pd.read_csv`` for empty cells) and
            other non-string scalars are tolerated.

    Returns:
        The cleaned title, or ``""`` for a missing value.
    """
    # NaN is the only float that is not equal to itself; treat it (and None)
    # as "no title" instead of letting re.sub raise TypeError.
    if title is None or (isinstance(title, float) and title != title):
        return ""
    # Coerce so numeric cells (e.g. a title that is just a year) still work.
    title = re.sub(r'\[.*?\]', '', str(title))  # remove tags like [HTML]
    return title.lower().strip()

def is_valid_token(token):
    """Return True when *token* is neither a stop word nor punctuation.

    Membership is checked against the module-level ``stop_words`` and
    ``punctuation`` sets.
    """
    if token in stop_words:
        return False
    return token not in punctuation

# Derived columns: cleaned first column, cleaned third column (as text), and
# the POS-tagged tokens of each cleaned title.
df['clean_title'] = df.iloc[:, 0].apply(clean_title)
df['clean_meta'] = df.iloc[:, 2].astype(str).apply(clean_title)
df['tokens_pos'] = df['clean_title'].apply(
    lambda text: pos_tag(word_tokenize(text))
)

# Build trigram model and vocabulary.
#   ngram_freq: "w1 w2" -> Counter of the words seen right after that pair
#   word_vocab: word -> number of occurrences across all titles
ngram_freq = defaultdict(Counter)
word_vocab = Counter()

for tagged_title in df['tokens_pos']:
    # Keep only content words; the POS tag itself is not used here.
    content_words = [tok for tok, _tag in tagged_title if is_valid_token(tok)]
    word_vocab.update(content_words)
    for first, second, following in ngrams(content_words, 3):
        ngram_freq[f"{first} {second}"][following] += 1

# Suggest full word completion if query ends with a partial word
def word_completion(partial_word, max_suggestions=3):
    """Return vocabulary words that start with *partial_word*.

    Candidates come from the module-level ``word_vocab`` and are ordered by
    descending frequency; at most ``max_suggestions`` are returned.
    """
    candidates = []
    for vocab_word in word_vocab:
        if vocab_word.startswith(partial_word):
            candidates.append(vocab_word)
    # Stable sort: equally frequent words keep their vocabulary order.
    candidates.sort(key=lambda vocab_word: word_vocab[vocab_word], reverse=True)
    return candidates[:max_suggestions]

# Phrase autocomplete using trigram model
def suggest_autocomplete(query, max_suggestions=5):
    """Complete the last word of *query* and propose next-word continuations.

    The final token of the query is treated as a (possibly partial) word and
    replaced by its most frequent completion from ``word_vocab``. The last
    two content words of the completed query are then looked up in the
    trigram table ``ngram_freq`` to append likely next words.

    Args:
        query: Raw text typed by the user.
        max_suggestions: Upper bound on the number of suggestions returned.

    Returns:
        A list of suggestion strings: first the query with its last word
        completed, then that text extended by one predicted next word each.
        The list is empty when ``max_suggestions`` is not positive, the query
        has no tokens, the last token is a stop word, punctuation or shorter
        than two characters, or no vocabulary word completes it.
    """
    if max_suggestions <= 0:
        return []

    words = word_tokenize(query.lower().strip())
    if not words:
        return []

    # The last token must be a plausible word fragment to be worth completing.
    last_word = words[-1]
    if not is_valid_token(last_word) or len(last_word) < 2:
        return []

    # Complete the (possibly partial) last word with its best vocabulary match.
    word_matches = word_completion(last_word, max_suggestions=1)
    if not word_matches:
        return []

    base = " ".join(words[:-1])
    full_query = (base + ' ' + word_matches[0]).strip()
    suggestions = [full_query]

    # The trigram keys were built from tokens with stop words and punctuation
    # removed, so the lookup context must be filtered the same way; otherwise
    # a query such as "state of the art" can never match a key.
    context = [tok for tok in word_tokenize(full_query) if is_valid_token(tok)]
    if len(context) < 2:
        # Every key is a two-word prefix; one word alone cannot match.
        return suggestions

    prefix = " ".join(context[-2:])
    # .get() avoids inserting an empty Counter into the defaultdict on a miss.
    next_words = ngram_freq.get(prefix, Counter())

    # Build combined suggestions (completed query + next token)
    for next_word, _ in next_words.most_common(max_suggestions - 1):
        suggestions.append(full_query + ' ' + next_word)

    return suggestions


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Christian\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Christian\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Christian\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Demo: run the autocomplete on a sample two-word query and show the result.
user_query = "machine learning"
suggestions = suggest_autocomplete(query=user_query)
print("Suggestions:", suggestions)

Suggestions: ['machine learning', 'machine learning python', 'machine learning trends', 'machine learning dynamical', 'machine learning art']


# 5. Performance Evaluation

### *pake apa

# 6. Conclusion and Future Work

#### *problem disini