In [1]:
pip install transformers nltk spacy


Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk
# Fetch the NLTK data packages. 'stopwords' backs the
# stopwords.words("english") lookup in a later cell; 'punkt' is tokenizer data
# that the cells visible here do not reference directly.
nltk.download('punkt')
# Kept as the cell's last expression, so its return value is echoed as the cell output.
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\reyri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\reyri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import re

def clean_text(text):
    """Normalize raw text for the later preprocessing steps.

    Digits and punctuation are removed, every run of whitespace is collapsed
    to a single space, surrounding whitespace is trimmed, and the result is
    lower-cased.

    Args:
        text: Input string.

    Returns:
        The cleaned, lower-cased string.
    """
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove punctuation: anything that is neither a word character nor
    # whitespace (underscores count as word characters and are kept)
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace runs before trimming: deleting digits/punctuation
    # can leave gaps such as "a  b" that strip() alone would not remove
    text = re.sub(r'\s+', ' ', text).strip()
    # Convert to lower case
    return text.lower()

# Sample text
text = "Hugging Face is amazing!!! NLP is evolving fast in 2025."
# cleaned_text is reused as input by the stemming and stop-word cells below
cleaned_text = clean_text(text)
print("Cleaned Text:", cleaned_text)


Cleaned Text: hugging face is amazing nlp is evolving fast in


In [7]:
import spacy

# Load the English language model
# NOTE(review): "en_core_web_sm" is not among the packages installed by the
# pip cell at the top — confirm it is installed in the target environment.
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(text):
    """Return the lemmas of ``text`` joined by single spaces.

    The text is processed with the module-level ``nlp`` pipeline and each
    resulting token contributes its ``lemma_``.
    """
    lemmas = (tok.lemma_ for tok in nlp(text))
    return " ".join(lemmas)

# Demo sentence with inflected forms ("foxes", "were", "jumping", "dogs")
text = "The quick brown foxes were jumping over the lazy dogs."
print("Lemmatized Text:", lemmatize_text(text))



Lemmatized Text: the quick brown fox be jump over the lazy dog .


In [8]:
from nltk.stem import PorterStemmer

# Module-level Porter stemmer instance used by stem_text below
stemmer = PorterStemmer()

def stem_text(text):
    """Stem each whitespace-separated word of ``text``.

    Every word is passed through the module-level ``stemmer`` and the stems
    are joined back together with single spaces.
    """
    stems = map(stemmer.stem, text.split())
    return " ".join(stems)

# Stem the cleaned sample text produced by the clean_text cell
stemmed_text = stem_text(cleaned_text)
print("Stemmed Text:", stemmed_text)


Stemmed Text: hug face is amaz nlp is evolv fast in


In [9]:
from nltk.corpus import stopwords

# English stop-word list from NLTK, held as a set for the membership test in remove_stopwords
stop_words = set(stopwords.words("english"))

def remove_stopwords(text):
    """Drop every whitespace-separated word of ``text`` found in ``stop_words``.

    The comparison is an exact match against the module-level ``stop_words``
    set; the surviving words are joined with single spaces.
    """
    kept_words = []
    for word in text.split():
        if word in stop_words:
            continue
        kept_words.append(word)
    return " ".join(kept_words)

# Filter stop words out of the cleaned sample text produced by the clean_text cell
filtered_text = remove_stopwords(cleaned_text)
print("Filtered Text:", filtered_text)


Filtered Text: hugging face amazing nlp evolving fast


In [10]:
from transformers import AutoTokenizer

# Load the pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Input texts: one short sentence and one longer than the max_length used below
texts = [
    "Hugging Face is amazing!",
    "This is an example of a very long text that exceeds the model's maximum length.",
]

# Tokenize with padding and truncation:
#   padding="max_length" pads short inputs up to max_length
#   truncation=True      cuts inputs that are too long
#   max_length=10        maximum number of tokens per input
#   return_tensors="pt"  returns PyTorch tensors
tokens = tokenizer(texts, padding="max_length", truncation=True, max_length=10, return_tensors="pt")

print(tokens)


{'input_ids': tensor([[  101, 17662,  2227,  2003,  6429,   999,   102,     0,     0,     0],
        [  101,  2023,  2003,  2019,  2742,  1997,  1037,  2200,  2146,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [11]:
def preprocess_text(text, tokenizer, max_length=10):
    """Run the full preprocessing pipeline on ``text`` and tokenize the result.

    The steps are applied in this order: ``clean_text``, ``lemmatize_text``,
    ``remove_stopwords``, then the supplied ``tokenizer``.

    Args:
        text: Raw input string.
        tokenizer: Callable that accepts the text plus ``padding``,
            ``truncation``, ``max_length`` and ``return_tensors`` keyword
            arguments.
        max_length: Forwarded to the tokenizer as ``max_length``.

    Returns:
        Whatever ``tokenizer`` returns for the preprocessed text.
    """
    # Cleaning -> lemmatization -> stop-word removal
    normalized = remove_stopwords(lemmatize_text(clean_text(text)))

    # Tokenization: pad/truncate to max_length and request PyTorch tensors
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(normalized, **encoding_options)

# Sample text
text = "Hugging Face makes NLP easy and fun in 2025!"

# Full preprocessing (uses the default max_length of preprocess_text)
processed_tokens = preprocess_text(text, tokenizer)
print(processed_tokens)


{'input_ids': tensor([[  101,  8549,  2227,  2191, 17953,  2361,  3733,  4569,   102,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}
