<a href="https://colab.research.google.com/github/Daalleee/Natural-Language-Processing-NLP-/blob/main/Pertemuan_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# NLP Feature Engineering: One-Hot, BoW, TF-IDF + Kata Informatif
from collections import Counter
import math
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Dataset: seven short Indonesian sentences about machine learning / AI.
corpus = [
    "Machine learning adalah cabang dari kecerdasan buatan",
    "Deep learning merupakan bagian dari machine learning",
    "Kecerdasan buatan berkembang sangat cepat",
    "Machine learning digunakan dalam berbagai aplikasi kecerdasan buatan",
    "AI dan kecerdasan buatan semakin populer",
    "Deep learning digunakan untuk computer vision dan NLP",
    "Machine learning sangat penting dalam analisis data"
]
# Fold everything to lower case so "Machine" and "machine" become one token.
corpus = list(map(str.lower, corpus))
# Whitespace tokenisation, one token list per document.
tokenized = [sentence.split() for sentence in corpus]
# Vocabulary: every distinct token across all documents, sorted alphabetically.
vocab = sorted({token for tokens in tokenized for token in tokens})

In [None]:
# One-Hot: for each document, flag every vocabulary term as present (1) or absent (0).
one_hot = []
for tokens in tokenized:
    one_hot.append([int(term in tokens) for term in vocab])
print("\n=== One-Hot ===")
one_hot_table = pd.DataFrame(data=one_hot, columns=vocab)
print(one_hot_table)


=== One-Hot ===
   adalah  ai  analisis  aplikasi  bagian  berbagai  berkembang  buatan  \
0       1   0         0         0       0         0           0       1   
1       0   0         0         0       1         0           0       0   
2       0   0         0         0       0         0           1       1   
3       0   0         0         1       0         1           0       1   
4       0   1         0         0       0         0           0       1   
5       0   0         0         0       0         0           0       0   
6       0   0         1         0       0         0           0       0   

   cabang  cepat  ...  learning  machine  merupakan  nlp  penting  populer  \
0       1      0  ...         1        1          0    0        0        0   
1       0      0  ...         1        1          1    0        0        0   
2       0      1  ...         0        0          0    0        0        0   
3       0      0  ...         1        1          0    0        0     

In [None]:
# Bag of Words: raw occurrence count of every vocabulary term in each document.
# A Counter returns 0 for terms that never occur, so plain indexing is enough.
bow = [
    [freq[term] for term in vocab]
    for freq in map(Counter, tokenized)
]
print("\n=== Bag of Words ===")
bow_table = pd.DataFrame(data=bow, columns=vocab)
print(bow_table)


=== Bag of Words ===
   adalah  ai  analisis  aplikasi  bagian  berbagai  berkembang  buatan  \
0       1   0         0         0       0         0           0       1   
1       0   0         0         0       1         0           0       0   
2       0   0         0         0       0         0           1       1   
3       0   0         0         1       0         1           0       1   
4       0   1         0         0       0         0           0       1   
5       0   0         0         0       0         0           0       0   
6       0   0         1         0       0         0           0       0   

   cabang  cepat  ...  learning  machine  merupakan  nlp  penting  populer  \
0       1      0  ...         1        1          0    0        0        0   
1       0      0  ...         2        1          1    0        0        0   
2       0      1  ...         0        0          0    0        0        0   
3       0      0  ...         1        1          0    0        0

In [None]:
# Manual TF-IDF using the plain textbook formula:
#   tf(t, d)  = count of t in d / number of tokens in d
#   idf(t)    = ln(N / number of documents containing t)
# scikit-learn's TfidfVectorizer defaults (smoothed IDF, L2-normalised rows)
# differ from this, so the numbers here do not match the library cell.
N = len(corpus)

# Document frequency: in how many documents does each term appear?
df_counts = {}
for term in vocab:
    df_counts[term] = sum(term in tokens for tokens in tokenized)

# Inverse document frequency (natural log, no smoothing).
idf = {}
for term in vocab:
    idf[term] = math.log(N / df_counts[term])

# One {term: weight} mapping per document, keyed in vocabulary order.
tfidf_manual = []
for tokens in tokenized:
    freq = Counter(tokens)
    length = len(tokens)
    weights = {}
    for term in vocab:
        weights[term] = (freq[term] / length) * idf[term]
    tfidf_manual.append(weights)

print("\n=== TF-IDF Manual ===")
tfidf_manual_table = pd.DataFrame(tfidf_manual)
print(tfidf_manual_table)


=== TF-IDF Manual ===
     adalah        ai  analisis  aplikasi    bagian  berbagai  berkembang  \
0  0.277987  0.000000  0.000000  0.000000  0.000000  0.000000    0.000000   
1  0.000000  0.000000  0.000000  0.000000  0.277987  0.000000    0.000000   
2  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000    0.389182   
3  0.000000  0.000000  0.000000  0.243239  0.000000  0.243239    0.000000   
4  0.000000  0.324318  0.000000  0.000000  0.000000  0.000000    0.000000   
5  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000    0.000000   
6  0.000000  0.000000  0.277987  0.000000  0.000000  0.000000    0.000000   

     buatan    cabang     cepat  ...  learning   machine  merupakan       nlp  \
0  0.079945  0.277987  0.000000  ...  0.048067  0.079945   0.000000  0.000000   
1  0.000000  0.000000  0.000000  ...  0.096135  0.079945   0.277987  0.000000   
2  0.111923  0.000000  0.389182  ...  0.000000  0.000000   0.000000  0.000000   
3  0.069952  0.000000  0.000000  ...

In [None]:
# TF-IDF with the library (scikit-learn TfidfVectorizer, default settings)
vec = TfidfVectorizer()
X = vec.fit_transform(corpus)
print("\n=== TF-IDF (sklearn) ===")
# Densify the sparse result and label the columns with the learned terms.
sklearn_terms = vec.get_feature_names_out()
sklearn_table = pd.DataFrame(data=X.toarray(), columns=sklearn_terms)
print(sklearn_table)


=== TF-IDF (sklearn) ===
     adalah        ai  analisis  aplikasi    bagian  berbagai  berkembang  \
0  0.492744  0.000000  0.000000  0.000000  0.000000  0.000000    0.000000   
1  0.000000  0.000000  0.000000  0.000000  0.450729  0.000000    0.000000   
2  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000    0.538538   
3  0.000000  0.000000  0.000000  0.456069  0.000000  0.456069    0.000000   
4  0.000000  0.474152  0.000000  0.000000  0.000000  0.000000    0.000000   
5  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000    0.000000   
6  0.000000  0.000000  0.445049  0.000000  0.000000  0.000000    0.000000   

     buatan    cabang     cepat  ...  learning   machine  merupakan       nlp  \
0  0.303540  0.492744  0.000000  ...  0.265893  0.303540   0.000000  0.000000   
1  0.000000  0.000000  0.000000  ...  0.486441  0.277658   0.450729  0.000000   
2  0.331750  0.000000  0.538538  ...  0.000000  0.000000   0.000000  0.000000   
3  0.280948  0.000000  0.000000  

In [None]:
# Most informative words per document (top 5 by TF-IDF weight)
TOP_K = 5
print("\n=== Kata paling informatif per dokumen ===")
df_lib = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
for i, row in df_lib.iterrows():
    # Drop zero weights so a short document never lists words it does not contain.
    # nlargest keeps the first occurrence on ties, so equally weighted words
    # come out in column order instead of an arbitrary sort order.
    top = row[row > 0].nlargest(TOP_K)
    # Convert to plain rounded floats: under NumPy 2 the raw values print as
    # `np.float64(...)`, which makes the tuples hard to read.
    pairs = [(term, round(float(weight), 4)) for term, weight in top.items()]
    print(f"Dokumen {i+1}: {pairs}")


=== Kata paling informatif per dokumen ===
Dokumen 1: [('adalah', np.float64(0.4927443298951514)), ('cabang', np.float64(0.4927443298951514)), ('dari', np.float64(0.40902011034963787)), ('buatan', np.float64(0.30354006824883034)), ('kecerdasan', np.float64(0.30354006824883034))]
Dokumen 2: [('learning', np.float64(0.486441183247248)), ('bagian', np.float64(0.4507292123704172)), ('merupakan', np.float64(0.4507292123704172)), ('deep', np.float64(0.3741439545753512)), ('dari', np.float64(0.3741439545753512))]
Dokumen 3: [('cepat', np.float64(0.5385378369314237)), ('berkembang', np.float64(0.5385378369314237)), ('sangat', np.float64(0.4470326539039362)), ('buatan', np.float64(0.3317497571438829)), ('kecerdasan', np.float64(0.3317497571438829))]
Dokumen 4: [('aplikasi', np.float64(0.45606932936755257)), ('berbagai', np.float64(0.45606932936755257)), ('dalam', np.float64(0.3785767102884673)), ('digunakan', np.float64(0.3785767102884673)), ('kecerdasan', np.float64(0.280947556295335))]
Dokum

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

# Same seven sentences as the `corpus` literal earlier in the notebook, kept in
# their original casing; this list is the input to the scikit-learn vectorizer
# cells that follow.
docs = [
    "Machine learning adalah cabang dari kecerdasan buatan",
    "Deep learning merupakan bagian dari machine learning",
    "Kecerdasan buatan berkembang sangat cepat",
    "Machine learning digunakan dalam berbagai aplikasi kecerdasan buatan",
    "AI dan kecerdasan buatan semakin populer",
    "Deep learning digunakan untuk computer vision dan NLP",
    "Machine learning sangat penting dalam analisis data"
]




In [None]:
# --- One-hot encoding (binary presence) ---
# binary=True caps every count at 1, so the matrix only records presence.
vectorizer_oh = CountVectorizer(binary=True)
X_oh = vectorizer_oh.fit_transform(docs)  # sparse matrix
oh_terms = vectorizer_oh.get_feature_names_out()
oh_dense = X_oh.toarray()
print("Vocabulary (One-hot):", oh_terms)
print("Matrix One-hot (dense):\n", oh_dense)



Vocabulary (One-hot): ['adalah' 'ai' 'analisis' 'aplikasi' 'bagian' 'berbagai' 'berkembang'
 'buatan' 'cabang' 'cepat' 'computer' 'dalam' 'dan' 'dari' 'data' 'deep'
 'digunakan' 'kecerdasan' 'learning' 'machine' 'merupakan' 'nlp' 'penting'
 'populer' 'sangat' 'semakin' 'untuk' 'vision']
Matrix One-hot (dense):
 [[1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 1 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 1 0 1 0 0 1 0 0 0 0 1 1]
 [0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 0 1 0 1 0 0 0]]


In [None]:
# --- Bag-of-Words (Count) ---
# binary=False keeps the real term counts (e.g. a word used twice scores 2).
vectorizer_bow = CountVectorizer(binary=False)
X_bow = vectorizer_bow.fit_transform(docs)
bow_terms = vectorizer_bow.get_feature_names_out()
bow_dense = X_bow.toarray()
print("Vocabulary (BoW):", bow_terms)
print("Matrix BoW (dense):\n", bow_dense)



Vocabulary (BoW): ['adalah' 'ai' 'analisis' 'aplikasi' 'bagian' 'berbagai' 'berkembang'
 'buatan' 'cabang' 'cepat' 'computer' 'dalam' 'dan' 'dari' 'data' 'deep'
 'digunakan' 'kecerdasan' 'learning' 'machine' 'merupakan' 'nlp' 'penting'
 'populer' 'sangat' 'semakin' 'untuk' 'vision']
Matrix BoW (dense):
 [[1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 2 1 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 1 0 1 0 0 1 0 0 0 0 1 1]
 [0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 0 1 0 1 0 0 0]]


In [None]:
# --- TF-IDF ---
# Fit on the documents and weight each term by TF-IDF (library defaults).
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(docs)
tfidf_terms = vectorizer_tfidf.get_feature_names_out()
tfidf_matrix = X_tfidf.toarray()
print("Vocabulary (TF-IDF):", tfidf_terms)
print("Matrix TF-IDF:\n", tfidf_matrix)

Vocabulary (TF-IDF): ['adalah' 'ai' 'analisis' 'aplikasi' 'bagian' 'berbagai' 'berkembang'
 'buatan' 'cabang' 'cepat' 'computer' 'dalam' 'dan' 'dari' 'data' 'deep'
 'digunakan' 'kecerdasan' 'learning' 'machine' 'merupakan' 'nlp' 'penting'
 'populer' 'sangat' 'semakin' 'untuk' 'vision']
Matrix TF-IDF:
 [[0.49274433 0.         0.         0.         0.         0.
  0.         0.30354007 0.49274433 0.         0.         0.
  0.         0.40902011 0.         0.         0.         0.30354007
  0.26589261 0.30354007 0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.45072921 0.
  0.         0.         0.         0.         0.         0.
  0.         0.37414395 0.         0.37414395 0.         0.
  0.48644118 0.27765794 0.45072921 0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.53853784 0.33174976 0.         0.53853784 0.       

In [None]:
# Print the highest-weighted TF-IDF terms of every document.
feature_names = vectorizer_tfidf.get_feature_names_out()
tfidf_dense = X_tfidf.toarray()

# How many top-weighted words to show per document (set once, not per iteration).
top_n = 3

for doc_idx, row in enumerate(tfidf_dense):
    print(f"\nDokumen {doc_idx+1}:")
    # Stable sort on the negated weights gives descending order while keeping
    # tied weights in feature order; reversing an unstable ascending sort
    # returned ties in an arbitrary order.
    top_indices = np.argsort(-row, kind="stable")[:top_n]
    for idx in top_indices:
        if row[idx] <= 0:
            # Weights are sorted descending, so everything after this is also
            # zero: the document has fewer than top_n distinct terms.
            break
        print(f"  {feature_names[idx]}: {row[idx]:.4f}")



Dokumen 1:
  adalah: 0.4927
  cabang: 0.4927
  dari: 0.4090

Dokumen 2:
  learning: 0.4864
  bagian: 0.4507
  merupakan: 0.4507

Dokumen 3:
  berkembang: 0.5385
  cepat: 0.5385
  sangat: 0.4470

Dokumen 4:
  aplikasi: 0.4561
  berbagai: 0.4561
  dalam: 0.3786

Dokumen 5:
  semakin: 0.4742
  ai: 0.4742
  populer: 0.4742

Dokumen 6:
  vision: 0.3966
  untuk: 0.3966
  nlp: 0.3966

Dokumen 7:
  penting: 0.4450
  data: 0.4450
  analisis: 0.4450
