In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# One-time setup: fetch the NLTK data packages used by the cells below.
nltk.download("punkt_tab")  # tokenizer data (unzipped under tokenizers/)
nltk.download("stopwords")  # stop-word lists; the returned status is shown as the cell result

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\yufis\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yufis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# Sample text: two sentences separated by a newline.
# The first line keeps its trailing space so the string is unchanged.
text = (
    "Natural Language Processing is fascinating. \n"
    "It's a combination of linguistics and computer science."
)

In [12]:
# 1. Tokenization: split the text into individual tokens.
# Punctuation and the clitic "'s" come out as separate tokens (see the output below).
tokens = word_tokenize(text, language="english")
print("Tokens:", tokens)

Tokens: ['Natural', 'Language', 'Processing', 'is', 'fascinating', '.', 'It', "'s", 'a', 'combination', 'of', 'linguistics', 'and', 'computer', 'science', '.']


In [13]:
# 2. Stop-word removal: drop common words that carry little meaning ('is', 'a', ...).
stop_words = set(stopwords.words("english"))

# The comparison is case-insensitive, so capitalised tokens such as 'It' are dropped too.
# Tokens like '.' and "'s" are not matched by the list and remain (see the output below).
filtered_tokens = [
    token
    for token in tokens
    if token.lower() not in stop_words
]
print("Filtered Tokens:", filtered_tokens)

Filtered Tokens: ['Natural', 'Language', 'Processing', 'fascinating', '.', "'s", 'combination', 'linguistics', 'computer', 'science', '.']


In [14]:
# 3. Stemming: reduce each remaining token to its stem.
# The stems are lower-cased and need not be dictionary words (e.g. 'natur', 'scienc' below).
ps = PorterStemmer()
stemmed_tokens = list(map(ps.stem, filtered_tokens))
print("Stemmed Tokens:", stemmed_tokens)

Stemmed Tokens: ['natur', 'languag', 'process', 'fascin', '.', "'s", 'combin', 'linguist', 'comput', 'scienc', '.']


In [15]:
# 4. Bag of Words: represent each document by how often each vocabulary word occurs in it.
corpus = [
    "Natural Language Processing is fascinating.",
    "It's a combination of linguistics and computer science.",
    "NLP involves text analysis, sentiment analysis, and much more.",
]

# Learn the vocabulary from the corpus, then count occurrences per document.
# Each row of X is one document; each column is one vocabulary word.
vectorizer = CountVectorizer().fit(corpus)
X = vectorizer.transform(corpus)

# The vocabulary printed below is lower-cased and has no single-character words such as 'a'.
print("Bag of Words Representation:\n", X.toarray())
print("Vocabulary:\n", vectorizer.get_feature_names_out())

Bag of Words Representation:
 [[0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 1 0 0 0]
 [0 1 1 1 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0]
 [2 1 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 1 1]]
Vocabulary:
 ['analysis' 'and' 'combination' 'computer' 'fascinating' 'involves' 'is'
 'it' 'language' 'linguistics' 'more' 'much' 'natural' 'nlp' 'of'
 'processing' 'science' 'sentiment' 'text']
