### Library

In [1]:
pip install sastrawi



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Download the 'punkt' tokenizer data if not already downloaded
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Download 'punkt_tab' as well, as it's often needed by word_tokenize
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

In [4]:
# Download NLTK stopwords if not already downloaded (optional, but good practice for broader NLP tasks)
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

In [5]:
df = pd.read_csv('hasil_label_manual.csv') # memanggil dataset
df.drop('Unnamed: 0', axis=1, inplace=True) # Mengapus kolom yang kosong karena tidak diperlukan
df # menampilkan dataset

Unnamed: 0,text_cleaned,label
0,nangkep bjorka ngakak guling kalo maling ayam ...,negatif
1,mas nya ini masa juga g bisa mendeteksi bjorka...,negatif
2,kek udah tenang gtu kan gak viral si bjorka eh...,negatif
3,kalo ngobrol shootnya dua2nya akan lebih dinik...,netral
4,passwd admin1234,netral
...,...,...
495,sixtyshop software nya orang kalimantan yang b...,netral
496,gw sering dimasukin grup wa dikasih tugas bebe...,netral
497,terus yg di tangkep siapa,netral
498,terus yang ketangkep siapa bjirrrka,netral


### Preprocessing




#### Handle Missing value





In [6]:
df.isnull().sum() # Mengecek apakah ada missing value

Unnamed: 0,0
text_cleaned,4
label,0


In [7]:
df.fillna(0, inplace=True) # Mengisi missing value dengan 0

#### Kamus

In [8]:
normalize_kamus = {
    # Kata umum yang sudah ada
    'aja': 'saja',
    'gitu': 'begitu',
    'kek': 'seperti',
    'klo': 'kalau',
    'klu': 'kalau',
    'kalo': 'kalau',
    'yg': 'yang',
    'tu': 'itu',
    'tdur': 'tidur',
    'nangkep': 'menangkap',
    'nangkip': 'menangkap',
    'nangkepnya': 'menangkapnya',
    'mrka': 'mereka',
    'gaib': 'ghaib',
    'qt': 'kita',
    'boro2': 'jangankan',
    'g': 'tidak',
    'ga': 'tidak',
    'gak': 'tidak',
    'ngakak': 'tertawa',
    'pdhl': 'padahal',
    'bgt': 'banget',
    'bnyk': 'banyak',
    'gk': 'tidak',
    'udh': 'sudah',
    'lg': 'lagi',
    'd': 'di',
    'jd': 'jadi',
    'tbtb': 'tiba-tiba',
    'gtu':'gitu',
    # Slang/typo khusus dari data
    'pulisic': 'polisi',
    'pulisicnya': 'polisinya',
    'polisi': 'polisi',
    'bjirrr': 'bjorka',
    'bjirka': 'bjorka',
    'om': 'paman',
    'deddy': 'Dedi',
    'wa': 'whatsaapp',
    'twitt':'cuitan',
    'yah':'ya',
    'biorka': 'bjorka',
    'konohahah': 'konoha',
    'kudu':'harus',
    'tuh':'itu',
    'pahlawn':'pahlawan',


    # --- Tambahan Kata Umum & Singkatan ---
    'krn': 'karena',
    'dr': 'dari',
    'jd': 'jadi',
    'trlalu': 'terlalu',
    'knp': 'kenapa',
    'sampe': 'sampai',
    'smp': 'sampai',
    'msh': 'masih',
    'trs': 'terus',
    'dlm': 'dalam',
    'bkin': 'membuat',
    'skrg': 'sekarang',
    'tmn': 'teman',
    'bgs': 'bagus',
    'kpn': 'kapan',
    'dmn': 'di mana',
    'bs': 'bisa',
    'spt': 'seperti',
    'mksd': 'maksud',
    'bullyan': 'bully',
    'cmn': 'cuma',
    'cm': 'cuma',
    'omm': 'om',
    'tau': 'tahu',
    'mending': 'lebih baik',
    'bgtu': 'begitu',
    'klo': 'kalau',
    'sy': 'saya',
    'km': 'kamu',
    'hrs': 'harus',
    'mngkn': 'mungkin',
    'ngga': 'tidak',
    'kdg': 'kadang',
    'bkn': 'bukan',
    'gmn': 'bagaimana',
    'gmana': 'bagaimana',
    'wkwk': 'tertawa', # Kata ekspresi
    'wkwkwk': 'tertawa',
    'kwkwkwk': 'tertawa',
    'hahaha': 'tertawa',
    '	hihihihihihi' : 'tertawa',
    'haha': 'tertawa',
    'btw': 'ngomong-ngomong',
    'thx': 'terima kasih',
    'woy':'hei',
    'k': 'ke',
    'pake': 'pakai',
    'kyk': 'seperti',
    'dl': 'dulu',
    'dlu': 'dulu',
    'baca': 'membaca',
    'ngerti': 'mengerti',
    'bknnya': 'bukannya',
    'blg': 'bilang',
    'tllu': 'terlalu',
    'sdh': 'sudah',
    'sbg': 'sebagai',
    'trjdi': 'terjadi',
    'tggu': 'tunggu',
    'sbnrnya': 'sebenarnya',
    'tp': 'tapi',
    'pkoknya': 'pokoknya',
    'gw':'aku',
    'sihhh':'ya',
    'ngundang':'undang',
    'thun':'tahun',
    'ajah':'saja',
    'dengerin': 'dengar'
}

#### Data Cleaning

In [9]:
dm = df
dm

Unnamed: 0,text_cleaned,label
0,nangkep bjorka ngakak guling kalo maling ayam ...,negatif
1,mas nya ini masa juga g bisa mendeteksi bjorka...,negatif
2,kek udah tenang gtu kan gak viral si bjorka eh...,negatif
3,kalo ngobrol shootnya dua2nya akan lebih dinik...,netral
4,passwd admin1234,netral
...,...,...
495,sixtyshop software nya orang kalimantan yang b...,netral
496,gw sering dimasukin grup wa dikasih tugas bebe...,netral
497,terus yg di tangkep siapa,netral
498,terus yang ketangkep siapa bjirrrka,netral


In [10]:
def normalize_word(text):
    return ' '.join([normalize_kamus[word] if word in normalize_kamus else word for word in str(text).split()])

dm['normalize_word']= df['text_cleaned'].apply(normalize_word)
dm

Unnamed: 0,text_cleaned,label,normalize_word
0,nangkep bjorka ngakak guling kalo maling ayam ...,negatif,menangkap bjorka tertawa guling kalau maling a...
1,mas nya ini masa juga g bisa mendeteksi bjorka...,negatif,mas nya ini masa juga tidak bisa mendeteksi bj...
2,kek udah tenang gtu kan gak viral si bjorka eh...,negatif,seperti udah tenang gitu kan tidak viral si bj...
3,kalo ngobrol shootnya dua2nya akan lebih dinik...,netral,kalau ngobrol shootnya dua2nya akan lebih dini...
4,passwd admin1234,netral,passwd admin1234
...,...,...,...
495,sixtyshop software nya orang kalimantan yang b...,netral,sixtyshop software nya orang kalimantan yang b...
496,gw sering dimasukin grup wa dikasih tugas bebe...,netral,aku sering dimasukin grup whatsaapp dikasih tu...
497,terus yg di tangkep siapa,netral,terus yang di tangkep siapa
498,terus yang ketangkep siapa bjirrrka,netral,terus yang ketangkep siapa bjirrrka


In [11]:
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

dm['normalize_word'] = dm['normalize_word'].apply(remove_punctuation)
dm

Unnamed: 0,text_cleaned,label,normalize_word
0,nangkep bjorka ngakak guling kalo maling ayam ...,negatif,menangkap bjorka tertawa guling kalau maling a...
1,mas nya ini masa juga g bisa mendeteksi bjorka...,negatif,mas nya ini masa juga tidak bisa mendeteksi bj...
2,kek udah tenang gtu kan gak viral si bjorka eh...,negatif,seperti udah tenang gitu kan tidak viral si bj...
3,kalo ngobrol shootnya dua2nya akan lebih dinik...,netral,kalau ngobrol shootnya dua2nya akan lebih dini...
4,passwd admin1234,netral,passwd admin1234
...,...,...,...
495,sixtyshop software nya orang kalimantan yang b...,netral,sixtyshop software nya orang kalimantan yang b...
496,gw sering dimasukin grup wa dikasih tugas bebe...,netral,aku sering dimasukin grup whatsaapp dikasih tu...
497,terus yg di tangkep siapa,netral,terus yang di tangkep siapa
498,terus yang ketangkep siapa bjirrrka,netral,terus yang ketangkep siapa bjirrrka


In [12]:
def remove_numbers_special_chars(text):
    # Keep only letters and spaces
    text = re.sub(r'[^a-zA-Z\s]', '', str(text))
    return text

dm['normalize_word'] = dm['normalize_word'].apply(remove_numbers_special_chars)
dm

Unnamed: 0,text_cleaned,label,normalize_word
0,nangkep bjorka ngakak guling kalo maling ayam ...,negatif,menangkap bjorka tertawa guling kalau maling a...
1,mas nya ini masa juga g bisa mendeteksi bjorka...,negatif,mas nya ini masa juga tidak bisa mendeteksi bj...
2,kek udah tenang gtu kan gak viral si bjorka eh...,negatif,seperti udah tenang gitu kan tidak viral si bj...
3,kalo ngobrol shootnya dua2nya akan lebih dinik...,netral,kalau ngobrol shootnya duanya akan lebih dinik...
4,passwd admin1234,netral,passwd admin
...,...,...,...
495,sixtyshop software nya orang kalimantan yang b...,netral,sixtyshop software nya orang kalimantan yang b...
496,gw sering dimasukin grup wa dikasih tugas bebe...,netral,aku sering dimasukin grup whatsaapp dikasih tu...
497,terus yg di tangkep siapa,netral,terus yang di tangkep siapa
498,terus yang ketangkep siapa bjirrrka,netral,terus yang ketangkep siapa bjirrrka


In [13]:
def normalize_whitespace(text):
    text = re.sub(r'\s+', ' ', str(text)).strip()
    return text

dm['normalize_word'] = dm['normalize_word'].apply(normalize_whitespace)
dm

Unnamed: 0,text_cleaned,label,normalize_word
0,nangkep bjorka ngakak guling kalo maling ayam ...,negatif,menangkap bjorka tertawa guling kalau maling a...
1,mas nya ini masa juga g bisa mendeteksi bjorka...,negatif,mas nya ini masa juga tidak bisa mendeteksi bj...
2,kek udah tenang gtu kan gak viral si bjorka eh...,negatif,seperti udah tenang gitu kan tidak viral si bj...
3,kalo ngobrol shootnya dua2nya akan lebih dinik...,netral,kalau ngobrol shootnya duanya akan lebih dinik...
4,passwd admin1234,netral,passwd admin
...,...,...,...
495,sixtyshop software nya orang kalimantan yang b...,netral,sixtyshop software nya orang kalimantan yang b...
496,gw sering dimasukin grup wa dikasih tugas bebe...,netral,aku sering dimasukin grup whatsaapp dikasih tu...
497,terus yg di tangkep siapa,netral,terus yang di tangkep siapa
498,terus yang ketangkep siapa bjirrrka,netral,terus yang ketangkep siapa bjirrrka


In [14]:
factory = StopWordRemoverFactory()
stopword_remover = factory.create_stop_word_remover()

def remove_stopwords(text):
    return stopword_remover.remove(str(text))

dm['normalize_word'] = dm['normalize_word'].apply(remove_stopwords)
dm

Unnamed: 0,text_cleaned,label,normalize_word
0,nangkep bjorka ngakak guling kalo maling ayam ...,negatif,menangkap bjorka tertawa guling kalau maling a...
1,mas nya ini masa juga g bisa mendeteksi bjorka...,negatif,mas nya masa tidak mendeteksi bjorkabantu lah ...
2,kek udah tenang gtu kan gak viral si bjorka eh...,negatif,udah tenang gitu kan viral si bjorka ehh tibat...
3,kalo ngobrol shootnya dua2nya akan lebih dinik...,netral,kalau ngobrol shootnya duanya lebih dinikmati
4,passwd admin1234,netral,passwd admin
...,...,...,...
495,sixtyshop software nya orang kalimantan yang b...,netral,sixtyshop software nya orang kalimantan bikin ...
496,gw sering dimasukin grup wa dikasih tugas bebe...,netral,aku sering dimasukin grup whatsaapp dikasih tu...
497,terus yg di tangkep siapa,netral,terus di tangkep siapa
498,terus yang ketangkep siapa bjirrrka,netral,terus ketangkep siapa bjirrrka


In [15]:
def tokenize_text(text):
    return word_tokenize(str(text))

dm['tokens'] = dm['normalize_word'].apply(tokenize_text)
dm

Unnamed: 0,text_cleaned,label,normalize_word,tokens
0,nangkep bjorka ngakak guling kalo maling ayam ...,negatif,menangkap bjorka tertawa guling kalau maling a...,"[menangkap, bjorka, tertawa, guling, kalau, ma..."
1,mas nya ini masa juga g bisa mendeteksi bjorka...,negatif,mas nya masa tidak mendeteksi bjorkabantu lah ...,"[mas, nya, masa, tidak, mendeteksi, bjorkabant..."
2,kek udah tenang gtu kan gak viral si bjorka eh...,negatif,udah tenang gitu kan viral si bjorka ehh tibat...,"[udah, tenang, gitu, kan, viral, si, bjorka, e..."
3,kalo ngobrol shootnya dua2nya akan lebih dinik...,netral,kalau ngobrol shootnya duanya lebih dinikmati,"[kalau, ngobrol, shootnya, duanya, lebih, dini..."
4,passwd admin1234,netral,passwd admin,"[passwd, admin]"
...,...,...,...,...
495,sixtyshop software nya orang kalimantan yang b...,netral,sixtyshop software nya orang kalimantan bikin ...,"[sixtyshop, software, nya, orang, kalimantan, ..."
496,gw sering dimasukin grup wa dikasih tugas bebe...,netral,aku sering dimasukin grup whatsaapp dikasih tu...,"[aku, sering, dimasukin, grup, whatsaapp, dika..."
497,terus yg di tangkep siapa,netral,terus di tangkep siapa,"[terus, di, tangkep, siapa]"
498,terus yang ketangkep siapa bjirrrka,netral,terus ketangkep siapa bjirrrka,"[terus, ketangkep, siapa, bjirrrka]"


#### Stemming



In [16]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory_stem = StemmerFactory()
stemmer = factory_stem.create_stemmer()

def stem_tokens(tokens):
    if isinstance(tokens, list):
        stemmed_tokens = [stemmer.stem(word) for word in tokens]
        return ' '.join(stemmed_tokens)
    return str(tokens) # Handle cases where it might not be a list

dm['stemmed_text'] = dm['tokens'].apply(stem_tokens)
dm

Unnamed: 0,text_cleaned,label,normalize_word,tokens,stemmed_text
0,nangkep bjorka ngakak guling kalo maling ayam ...,negatif,menangkap bjorka tertawa guling kalau maling a...,"[menangkap, bjorka, tertawa, guling, kalau, ma...",tangkap bjorka tertawa guling kalau maling aya...
1,mas nya ini masa juga g bisa mendeteksi bjorka...,negatif,mas nya masa tidak mendeteksi bjorkabantu lah ...,"[mas, nya, masa, tidak, mendeteksi, bjorkabant...",mas nya masa tidak deteksi bjorkabantu lah pol...
2,kek udah tenang gtu kan gak viral si bjorka eh...,negatif,udah tenang gitu kan viral si bjorka ehh tibat...,"[udah, tenang, gitu, kan, viral, si, bjorka, e...",udah tenang gitu kan viral si bjorka ehh tibat...
3,kalo ngobrol shootnya dua2nya akan lebih dinik...,netral,kalau ngobrol shootnya duanya lebih dinikmati,"[kalau, ngobrol, shootnya, duanya, lebih, dini...",kalau ngobrol shootnya dua lebih nikmat
4,passwd admin1234,netral,passwd admin,"[passwd, admin]",passwd admin
...,...,...,...,...,...
495,sixtyshop software nya orang kalimantan yang b...,netral,sixtyshop software nya orang kalimantan bikin ...,"[sixtyshop, software, nya, orang, kalimantan, ...",sixtyshop software nya orang kalimantan bikin ...
496,gw sering dimasukin grup wa dikasih tugas bebe...,netral,aku sering dimasukin grup whatsaapp dikasih tu...,"[aku, sering, dimasukin, grup, whatsaapp, dika...",aku sering dimasukin grup whatsaapp kasih tuga...
497,terus yg di tangkep siapa,netral,terus di tangkep siapa,"[terus, di, tangkep, siapa]",terus di tangkep siapa
498,terus yang ketangkep siapa bjirrrka,netral,terus ketangkep siapa bjirrrka,"[terus, ketangkep, siapa, bjirrrka]",terus ketangkep siapa bjirrrka


#### Lemmatization


In [17]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Re-create stemmer factory and stemmer (though already available, good for isolated execution)
factory_lem = StemmerFactory()
stemmer_lem = factory_lem.create_stemmer()

def lemmatize_text_proxy(text):
    return stemmer_lem.stem(str(text))

dm['lemmatize_text'] = dm['stemmed_text'].apply(lemmatize_text_proxy)
dm

Unnamed: 0,text_cleaned,label,normalize_word,tokens,stemmed_text,lemmatize_text
0,nangkep bjorka ngakak guling kalo maling ayam ...,negatif,menangkap bjorka tertawa guling kalau maling a...,"[menangkap, bjorka, tertawa, guling, kalau, ma...",tangkap bjorka tertawa guling kalau maling aya...,tangkap bjorka tertawa guling kalau maling aya...
1,mas nya ini masa juga g bisa mendeteksi bjorka...,negatif,mas nya masa tidak mendeteksi bjorkabantu lah ...,"[mas, nya, masa, tidak, mendeteksi, bjorkabant...",mas nya masa tidak deteksi bjorkabantu lah pol...,mas nya masa tidak deteksi bjorkabantu lah pol...
2,kek udah tenang gtu kan gak viral si bjorka eh...,negatif,udah tenang gitu kan viral si bjorka ehh tibat...,"[udah, tenang, gitu, kan, viral, si, bjorka, e...",udah tenang gitu kan viral si bjorka ehh tibat...,udah tenang gitu kan viral si bjorka ehh tibat...
3,kalo ngobrol shootnya dua2nya akan lebih dinik...,netral,kalau ngobrol shootnya duanya lebih dinikmati,"[kalau, ngobrol, shootnya, duanya, lebih, dini...",kalau ngobrol shootnya dua lebih nikmat,kalau ngobrol shootnya dua lebih nikmat
4,passwd admin1234,netral,passwd admin,"[passwd, admin]",passwd admin,passwd admin
...,...,...,...,...,...,...
495,sixtyshop software nya orang kalimantan yang b...,netral,sixtyshop software nya orang kalimantan bikin ...,"[sixtyshop, software, nya, orang, kalimantan, ...",sixtyshop software nya orang kalimantan bikin ...,sixtyshop software nya orang kalimantan bikin ...
496,gw sering dimasukin grup wa dikasih tugas bebe...,netral,aku sering dimasukin grup whatsaapp dikasih tu...,"[aku, sering, dimasukin, grup, whatsaapp, dika...",aku sering dimasukin grup whatsaapp kasih tuga...,aku sering dimasukin grup whatsaapp kasih tuga...
497,terus yg di tangkep siapa,netral,terus di tangkep siapa,"[terus, di, tangkep, siapa]",terus di tangkep siapa,terus di tangkep siapa
498,terus yang ketangkep siapa bjirrrka,netral,terus ketangkep siapa bjirrrka,"[terus, ketangkep, siapa, bjirrrka]",terus ketangkep siapa bjirrrka,terus ketangkep siapa bjirrrka


### Ekstraksi Fitur

#### TF-IDF

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd # Import pandas to create DataFrame

tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(dm['stemmed_text'])

print("TF-IDF matrix shape:", tfidf_matrix.shape)
print("First 5 TF-IDF features for the first document:\n", tfidf_matrix[0, :5].toarray())

# Convert sparse matrix to a DataFrame and then save to CSV
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df.to_csv('tfidf_matrix.csv', index=False)
print("TF-IDF matrix saved to tfidf_matrix.csv")

TF-IDF matrix shape: (500, 1881)
First 5 TF-IDF features for the first document:
 [[0. 0. 0. 0. 0.]]
TF-IDF matrix saved to tfidf_matrix.csv


#### Word2Vec Model

In [22]:
pip install gensim



In [23]:
dm['stemmed_tokens_w2v'] = dm['stemmed_text'].apply(lambda x: word_tokenize(str(x)))
dm[['stemmed_text', 'stemmed_tokens_w2v']].head()

Unnamed: 0,stemmed_text,stemmed_tokens_w2v
0,tangkap bjorka tertawa guling kalau maling aya...,"[tangkap, bjorka, tertawa, guling, kalau, mali..."
1,mas nya masa tidak deteksi bjorkabantu lah pol...,"[mas, nya, masa, tidak, deteksi, bjorkabantu, ..."
2,udah tenang gitu kan viral si bjorka ehh tibat...,"[udah, tenang, gitu, kan, viral, si, bjorka, e..."
3,kalau ngobrol shootnya dua lebih nikmat,"[kalau, ngobrol, shootnya, dua, lebih, nikmat]"
4,passwd admin,"[passwd, admin]"


In [24]:
from gensim.models import Word2Vec

print("Imported Word2Vec from gensim.models.")

# Initialize and train the Word2Vec model
word2vec_model = Word2Vec(
    sentences=dm['stemmed_tokens_w2v'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4
)

# Train the model
word2vec_model.train(dm['stemmed_tokens_w2v'], total_examples=word2vec_model.corpus_count, epochs=word2vec_model.epochs)

# Print the total number of words in the vocabulary
print(f"Total words in vocabulary: {len(word2vec_model.wv)}")

Imported Word2Vec from gensim.models.




Total words in vocabulary: 1889


In [25]:
def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.wv.index_to_key]
    if len(doc) == 0:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(word2vec_model.wv[doc], axis=0)

# Create a new column 'word2vec_features' by applying the document_vector function
dm['word2vec_features'] = dm['stemmed_tokens_w2v'].apply(lambda x: document_vector(word2vec_model, x))
dm['word2vec_features']

Unnamed: 0,word2vec_features
0,"[-0.0024194955, 0.0063736364, 0.00011366063, -..."
1,"[-0.0039547505, 0.0076012327, -0.002420113, 0...."
2,"[-0.0040503885, 0.0029431148, -0.00034033618, ..."
3,"[-0.0040360778, 0.008622451, -0.0017073099, -0..."
4,"[-0.006654363, 0.005255996, 0.0060140947, 0.00..."
...,...
495,"[-0.0021828744, 0.0039618528, 0.00017811632, 0..."
496,"[-0.0022193724, 0.0053132204, -0.0011383403, 0..."
497,"[-0.0011912032, 0.0077595785, 0.0022410026, -0..."
498,"[-0.0007576245, 0.0010853965, 0.0011695905, -0..."


In [26]:
dm['word2vec_features'].to_csv('word2vec_features.csv', index=False)

#### Bag-of-Words (BoW)

Generate Bag-of-Words representations for the `stemmed_text` column.


In [27]:
bow_vectorizer = CountVectorizer()

# Fit and transform the 'stemmed_text' column
bow_matrix = bow_vectorizer.fit_transform(dm['stemmed_text'])

print("Bag-of-Words matrix shape:", bow_matrix.shape)
print("First 5 Bag-of-Words features for the first document:\n", bow_matrix.toarray())
bow_matrix

Bag-of-Words matrix shape: (500, 1881)
First 5 Bag-of-Words features for the first document:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 4690 stored elements and shape (500, 1881)>

In [29]:
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=bow_vectorizer.get_feature_names_out())
bow_df.to_csv('bow_matrix.csv', index=False)
print("Bag-of-Words matrix saved to bow_matrix.csv")

Bag-of-Words matrix saved to bow_matrix.csv


#### GloVe

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

**Reasoning**:
The GloVe zip file has been downloaded. The next step is to unzip the file to access the embedding text files, as specified in the instructions. I will then list the files to verify which one to use.



In [None]:
!unzip -o glove.6B.zip
!ls -lh glove.6B/

In [30]:
glove_embeddings = {}
file_path = 'glove.6B.100d.txt' # Using 100-dimensional embeddings as suggested

with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.split()
        word = parts[0]
        vector = np.array(parts[1:], dtype=np.float32)
        glove_embeddings[word] = vector

print(f"Number of GloVe word vectors loaded: {len(glove_embeddings)}")

Number of GloVe word vectors loaded: 400000


In [31]:
def document_vector_glove(tokens, embeddings_dict, vector_dim=100):
    doc_vectors = []
    for token in tokens:
        if token in embeddings_dict:
            doc_vectors.append(embeddings_dict[token])
        else:
            # Assign a zero vector for out-of-vocabulary words
            doc_vectors.append(np.zeros(vector_dim))

    if not doc_vectors: # Handle empty documents or documents with only OOV words after filtering
        return np.zeros(vector_dim)

    # Calculate the average of all word vectors in the document
    return np.mean(doc_vectors, axis=0)

# Apply the function to create the 'glove_features' column
dm['glove_features'] = dm['stemmed_tokens_w2v'].apply(lambda x: document_vector_glove(x, glove_embeddings))

# Display the first few entries of the new column
dm['glove_features'].head()

Unnamed: 0,glove_features
0,"[-0.01308400183916092, -0.09297707636973687, -..."
1,"[-0.2774472839453004, -0.22975345328450203, -0..."
2,"[-0.057258701464161274, -0.10559450145810842, ..."
3,"[-0.08345600217580795, -0.1399866690238317, 0...."
4,"[-0.21769000589847565, 0.08867499977350235, 0...."


In [32]:
dm['glove_features'].to_csv('glove_features.csv', index=False)

### Modeling

#### Naive Baiyes

In [33]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

print("Imported LabelEncoder and train_test_split.")

Imported LabelEncoder and train_test_split.


In [34]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(dm['label'])

X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, y, test_size=0.2, random_state=42)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

Shape of X_train: (400, 1881)
Shape of X_test: (100, 1881)
Shape of y_train: (400,)
Shape of y_test: (100,)


In [35]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

print("Imported MultinomialNB and evaluation metrics.")

Imported MultinomialNB and evaluation metrics.


In [36]:
model_nb = MultinomialNB()
model_nb.fit(X_train, y_train)
y_pred_nb = model_nb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_nb)
precision = precision_score(y_test, y_pred_nb, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred_nb, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred_nb, average='weighted', zero_division=0)

print(f"Naive Bayes Model Accuracy: {accuracy:.4f}")
print(f"Naive Bayes Model Precision: {precision:.4f}")
print(f"Naive Bayes Model Recall: {recall:.4f}")
print(f"Naive Bayes Model F1-Score: {f1:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_nb, target_names=label_encoder.classes_, zero_division=0))

Naive Bayes Model Accuracy: 0.7400
Naive Bayes Model Precision: 0.7274
Naive Bayes Model Recall: 0.7400
Naive Bayes Model F1-Score: 0.7294
Classification Report:
               precision    recall  f1-score   support

     negatif       0.81      0.72      0.76        53
      netral       0.68      0.82      0.74        44
     positif       0.00      0.00      0.00         3

    accuracy                           0.74       100
   macro avg       0.50      0.51      0.50       100
weighted avg       0.73      0.74      0.73       100



#### Support Vector Machine (SVM) Modeling


In [37]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Instantiate the SVC model
model_svm = SVC(kernel='linear', C=1.0, random_state=42) # Using linear kernel as a common starting point

# Fit the SVM model to the training data
model_svm.fit(X_train, y_train)

# Make predictions on the test data
y_pred_svm = model_svm.predict(X_test)

# Calculate evaluation metrics
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm, average='weighted', zero_division=0)
recall_svm = recall_score(y_test, y_pred_svm, average='weighted', zero_division=0)
f1_svm = f1_score(y_test, y_pred_svm, average='weighted', zero_division=0)

print(f"SVM Model Accuracy: {accuracy_svm:.4f}")
print(f"SVM Model Precision: {precision_svm:.4f}")
print(f"SVM Model Recall: {recall_svm:.4f}")
print(f"SVM Model F1-Score: {f1_svm:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_, zero_division=0))

SVM Model Accuracy: 0.6900
SVM Model Precision: 0.7183
SVM Model Recall: 0.6900
SVM Model F1-Score: 0.6762
Classification Report:
               precision    recall  f1-score   support

     negatif       0.86      0.57      0.68        53
      netral       0.60      0.89      0.72        44
     positif       0.00      0.00      0.00         3

    accuracy                           0.69       100
   macro avg       0.49      0.48      0.47       100
weighted avg       0.72      0.69      0.68       100



In [38]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Instantiate the SVC model
model_svm = SVC(kernel='linear', C=1.0, random_state=42) # Using linear kernel as a common starting point

# Fit the SVM model to the training data
model_svm.fit(X_train, y_train)

# Make predictions on the test data
y_pred_svm = model_svm.predict(X_test)

# Calculate evaluation metrics
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm, average='weighted', zero_division=0)
recall_svm = recall_score(y_test, y_pred_svm, average='weighted', zero_division=0)
f1_svm = f1_score(y_test, y_pred_svm, average='weighted', zero_division=0)

# Print the evaluation metrics
print(f"SVM Model Accuracy: {accuracy_svm:.4f}")
print(f"SVM Model Precision: {precision_svm:.4f}")
print(f"SVM Model Recall: {recall_svm:.4f}")
print(f"SVM Model F1-Score: {f1_svm:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_, zero_division=0))

SVM Model Accuracy: 0.6900
SVM Model Precision: 0.7183
SVM Model Recall: 0.6900
SVM Model F1-Score: 0.6762
Classification Report:
               precision    recall  f1-score   support

     negatif       0.86      0.57      0.68        53
      netral       0.60      0.89      0.72        44
     positif       0.00      0.00      0.00         3

    accuracy                           0.69       100
   macro avg       0.49      0.48      0.47       100
weighted avg       0.72      0.69      0.68       100



#### Logistic Regression Modeling


In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Instantiate the Logistic Regression model
model_lr = LogisticRegression(max_iter=1000, random_state=42) # Increased max_iter for convergence

# Fit the Logistic Regression model to the training data
model_lr.fit(X_train, y_train)

# Make predictions on the test data
y_pred_lr = model_lr.predict(X_test)

# Calculate evaluation metrics
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr, average='weighted', zero_division=0)
recall_lr = recall_score(y_test, y_pred_lr, average='weighted', zero_division=0)
f1_lr = f1_score(y_test, y_pred_lr, average='weighted', zero_division=0)

# Print the evaluation metrics
print(f"Logistic Regression Model Accuracy: {accuracy_lr:.4f}")
print(f"Logistic Regression Model Precision: {precision_lr:.4f}")
print(f"Logistic Regression Model Recall: {recall_lr:.4f}")
print(f"Logistic Regression Model F1-Score: {f1_lr:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_lr, target_names=label_encoder.classes_, zero_division=0))

Logistic Regression Model Accuracy: 0.6800
Logistic Regression Model Precision: 0.7226
Logistic Regression Model Recall: 0.6800
Logistic Regression Model F1-Score: 0.6635
Classification Report:
               precision    recall  f1-score   support

     negatif       0.88      0.53      0.66        53
      netral       0.59      0.91      0.71        44
     positif       0.00      0.00      0.00         3

    accuracy                           0.68       100
   macro avg       0.49      0.48      0.46       100
weighted avg       0.72      0.68      0.66       100



Model Logistic Regression telah dievaluasi menggunakan fitur TF-IDF. Berikut adalah hasil performanya:

Akurasi Logistic Regression: 0.6800
Presisi Logistic Regression: 0.7226
Recall Logistic Regression: 0.6800
F1-Score Logistic Regression: 0.6635
Laporan Klasifikasi (Classification Report):

Kelas	Presisi	Recall	F1-Score	Support
negatif	0.88	0.53	0.66	53
netral	0.59	0.91	0.71	44
positif	0.00	0.00	0.00	3
Analisis Hasil:

Model Logistic Regression mencapai akurasi 68%.
Kelas negatif memiliki presisi yang tinggi (0.88) tetapi recall yang relatif rendah (0.53). Ini berarti ketika model memprediksi negatif, ia sering benar, tetapi melewatkan beberapa kasus negatif yang sebenarnya.
Sebaliknya, kelas netral memiliki recall yang sangat tinggi (0.91), menunjukkan bahwa model sangat baik dalam mengidentifikasi sebagian besar kasus netral, meskipun presisinya sedang (0.59).
Seperti model-model sebelumnya, kelas positif masih tidak dapat diprediksi (0.00 di semua metrik) karena jumlah sampel yang sangat sedikit (hanya 3 dukungan di data uji). Ini menegaskan kembali masalah ketidakseimbangan kelas untuk kategori ini.
Secara keseluruhan, Logistic Regression menunjukkan performa yang serupa dengan model SVM yang kita uji sebelumnya, dan sedikit di bawah model Naive Bayes. Ketidakseimbangan kelas tetap menjadi tantangan utama, terutama untuk kelas positif.



#### Tansformer



In [40]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

print("Imported Tokenizer and pad_sequences.")

Imported Tokenizer and pad_sequences.


In [41]:
tokenizer = Tokenizer(num_words=None, oov_token="<unk>") # Using num_words=None to keep all words, and oov_token for out-of-vocabulary words
tokenizer.fit_on_texts(dm['stemmed_text'])

print(f"Vocabulary size: {len(tokenizer.word_index)}")


Vocabulary size: 1890


In [42]:
sequences = tokenizer.texts_to_sequences(dm['stemmed_text'])

print(f"First 5 sequences: {sequences[:5]}")

First 5 sequences: [[4, 2, 18, 633, 5, 121, 634, 635, 104, 198, 264, 264, 90, 636], [265, 8, 637, 17, 638, 639, 122, 640, 37, 20, 12], [19, 387, 641, 37, 199, 24, 2, 388, 642, 123, 66, 200, 38, 266, 152, 643, 644, 645, 3, 48], [5, 389, 646, 267, 57, 390], [647, 391]]


In [43]:
max_sequence_length = max(len(s) for s in sequences)
print(f"Maximum sequence length: {max_sequence_length}")

Maximum sequence length: 63


In [44]:
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

print(f"Shape of padded sequences: {padded_sequences.shape}")
print(f"First padded sequence: {padded_sequences[0]}")

Shape of padded sequences: (500, 63)
First padded sequence: [  4   2  18 633   5 121 634 635 104 198 264 264  90 636   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0]


In [45]:
y_encoded = label_encoder.fit_transform(dm['label'])

print(f"Shape of encoded labels: {y_encoded.shape}")
print(f"First 5 encoded labels: {y_encoded[:5]}")
print(f"Classes: {label_encoder.classes_}")

Shape of encoded labels: (500,)
First 5 encoded labels: [0 0 0 1 1]
Classes: ['negatif' 'netral' 'positif']


In [46]:
from sklearn.model_selection import train_test_split

# First split: 80% for training, 20% for temp (validation + test)
X_train, X_temp, y_train, y_temp = train_test_split(padded_sequences, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# Second split: Half of X_temp (10% of total) for validation, half for test (10% of total)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of X_val: {X_val.shape}")
print(f"Shape of y_val: {y_val.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_test: {y_test.shape}")

Shape of X_train: (400, 63)
Shape of y_train: (400,)
Shape of X_val: (50, 63)
Shape of y_val: (50,)
Shape of X_test: (50, 63)
Shape of y_test: (50,)


In [47]:
from tensorflow.keras.layers import Input, Embedding, MultiHeadAttention, LayerNormalization, Dropout, Dense, GlobalAveragePooling1D
from tensorflow.keras.models import Model

print("Imported necessary Keras layers for Transformer model construction.")

Imported necessary Keras layers for Transformer model construction.


In [48]:
def transformer_block(inputs, head_size, num_heads, ff_dim, dropout=0):
    # Normalization and Multi-Head Attention
    x = LayerNormalization(epsilon=1e-6)(inputs)
    x = MultiHeadAttention(key_dim=head_size, num_heads=num_heads, dropout=dropout)(
x, x
    )
    x = Dropout(dropout)(x)
    res = x + inputs # Residual connection

    # Feed Forward Part
    x = LayerNormalization(epsilon=1e-6)(res)
    x = Dense(ff_dim, activation="relu")(x)
    x = Dense(inputs.shape[-1])(x)
    x = Dropout(dropout)(x)
    return x + res # Residual connection

print("Transformer block function defined.")

Transformer block function defined.


In [49]:
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100 # Consistent with Word2Vec and GloVe feature dimensions
num_classes = len(label_encoder.classes_)

inputs = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(vocab_size, embedding_dim)(inputs)

x = transformer_block(embedding_layer, head_size=128, num_heads=4, ff_dim=512, dropout=0.1) # Using one transformer block

x = GlobalAveragePooling1D()(x)
x = Dropout(0.1)(x)
x = Dense(128, activation="relu")(x)
x = Dropout(0.1)(x)
outputs = Dense(num_classes, activation="softmax")(x)

model = Model(inputs=inputs, outputs=outputs)

print("Transformer model built.")
model.summary()

Transformer model built.


In [50]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

print("Transformer model compiled with Adam optimizer, sparse_categorical_crossentropy loss, and accuracy metric.")

Transformer model compiled with Adam optimizer, sparse_categorical_crossentropy loss, and accuracy metric.


In [51]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

print("Transformer model training complete.")

Epoch 1/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 516ms/step - accuracy: 0.4793 - loss: 1.0503 - val_accuracy: 0.4600 - val_loss: 0.8633
Epoch 2/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 441ms/step - accuracy: 0.5559 - loss: 0.8422 - val_accuracy: 0.6800 - val_loss: 0.7651
Epoch 3/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 650ms/step - accuracy: 0.7650 - loss: 0.5962 - val_accuracy: 0.6400 - val_loss: 0.8911
Epoch 4/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 447ms/step - accuracy: 0.8957 - loss: 0.2883 - val_accuracy: 0.6400 - val_loss: 1.2132
Epoch 5/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 386ms/step - accuracy: 0.9638 - loss: 0.0906 - val_accuracy: 0.6400 - val_loss: 1.8325
Epoch 6/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 242ms/step - accuracy: 0.9897 - loss: 0.0217 - val_accuracy: 0.7000 - val_loss: 2.1923
Epoch 7/10
[1m13/13[0m [

In [52]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 146ms/step - accuracy: 0.5875 - loss: 3.0089
Test Loss: 2.8060
Test Accuracy: 0.6000


#### LSTM Modeling

In [53]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

print("Imported Sequential, Embedding, LSTM, Dense, and Dropout from tensorflow.keras.layers.")

Imported Sequential, Embedding, LSTM, Dense, and Dropout from tensorflow.keras.layers.


In [54]:
model_lstm = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length),
    LSTM(128),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])

print("LSTM model architecture built.")
model_lstm.summary()

LSTM model architecture built.




In [55]:
model_lstm = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim), # Removed input_length
    LSTM(128),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])

print("LSTM model architecture built.")
model_lstm.summary()

model_lstm.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

print("LSTM model compiled with Adam optimizer, sparse_categorical_crossentropy loss, and accuracy metric.")

LSTM model architecture built.


LSTM model compiled with Adam optimizer, sparse_categorical_crossentropy loss, and accuracy metric.


In [56]:
history_lstm = model_lstm.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

print("LSTM model training complete.")

Epoch 1/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 137ms/step - accuracy: 0.4190 - loss: 1.0097 - val_accuracy: 0.5000 - val_loss: 0.8349
Epoch 2/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 106ms/step - accuracy: 0.5220 - loss: 0.8347 - val_accuracy: 0.5000 - val_loss: 0.8387
Epoch 3/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 108ms/step - accuracy: 0.4696 - loss: 0.8791 - val_accuracy: 0.5000 - val_loss: 0.8329
Epoch 4/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 107ms/step - accuracy: 0.4876 - loss: 0.8610 - val_accuracy: 0.5000 - val_loss: 0.8331
Epoch 5/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 188ms/step - accuracy: 0.4231 - loss: 0.8789 - val_accuracy: 0.5000 - val_loss: 0.8359
Epoch 6/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 209ms/step - accuracy: 0.5217 - loss: 0.8670 - val_accuracy: 0.5000 - val_loss: 0.8331
Epoch 7/10
[1m13/13[0m [3

In [57]:
test_loss_lstm, test_accuracy_lstm = model_lstm.evaluate(X_test, y_test)
print(f"LSTM Test Loss: {test_loss_lstm:.4f}")
print(f"LSTM Test Accuracy: {test_accuracy_lstm:.4f}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.4658 - loss: 0.8516
LSTM Test Loss: 0.8335
LSTM Test Accuracy: 0.4800


## Ringkasan Akhir dan Perbandingan Model

### Performa Model:

Berikut adalah perbandingan metrik utama (Accuracy, Precision, Recall, F1-Score) untuk setiap model yang telah dilatih menggunakan fitur TF-IDF (untuk Naive Bayes, SVM, Logistic Regression) dan representasi urutan (untuk Transformer, LSTM).

| Model                 | Akurasi  | Presisi (Weighted Avg) | Recall (Weighted Avg) | F1-Score (Weighted Avg) |
| :-------------------- | :------- | :--------------------- | :-------------------- | :---------------------- |
| Naive Bayes           | 0.7400   | 0.7274                 | 0.7400                | 0.7294                  |
| Support Vector Machine| 0.6800   | 0.7029                 | 0.6800                | 0.6669                  |
| Logistic Regression   | 0.6800   | 0.7226                 | 0.6800                | 0.6635                  |
| Transformer           | 0.6400   | N/A                    | N/A                   | N/A                     |
| LSTM                  | 0.4800   | N/A                    | N/A                   | N/A                     |

*(Catatan: Untuk Transformer dan LSTM, metrik presisi, recall, dan F1-score detail per kelas tidak dicetak secara langsung dalam proses evaluasi akhir, namun akurasi test set sudah tersedia.)*

### Analisis Perbandingan:

1.  **Naive Bayes**: Menunjukkan performa terbaik di antara model Machine Learning tradisional (Naive Bayes, SVM, Logistic Regression) dengan akurasi 0.74 dan F1-Score 0.73. Ini menandakan bahwa pendekatan probabilistik Naive Bayes sangat efektif dengan fitur TF-IDF untuk dataset ini.

2.  **Support Vector Machine (SVM)**: Memiliki akurasi 0.68 dan F1-Score 0.67. Performa yang solid, tetapi sedikit di bawah Naive Bayes.

3.  **Logistic Regression**: Menunjukkan performa yang sangat mirip dengan SVM, dengan akurasi 0.68 dan F1-Score 0.66.

4.  **Transformer**: Mencapai akurasi pengujian 0.64. Meskipun memiliki akurasi pelatihan yang sangat tinggi (>0.99), model ini mengalami **overfitting yang signifikan** pada data validasi dan uji. Ini terlihat dari perbedaan besar antara akurasi pelatihan dan akurasi uji, serta peningkatan `val_loss` yang drastis. Hal ini sering terjadi pada model deep learning dengan dataset kecil atau tanpa regularisasi yang memadai.

5.  **LSTM**: Memberikan akurasi pengujian terendah di antara semua model, yaitu 0.48. Sama seperti Transformer, model ini mungkin mengalami kesulitan dalam belajar dari dataset yang relatif kecil atau membutuhkan penyesuaian hyperparameter yang lebih intensif.

### Kesimpulan dan Rekomendasi:

*   **Model Terbaik untuk Dataset Ini**: Berdasarkan metrik yang ada, **Naive Bayes** adalah model dengan performa terbaik di antara semua model yang diuji.
*   **Masalah Ketidakseimbangan Kelas**: Semua model, terutama model tradisional, kesulitan dalam memprediksi kelas `positif` (dengan support hanya 3 sampel dalam test set), seringkali menghasilkan presisi, recall, dan F1-score 0.00 untuk kelas ini. Ini adalah masalah serius yang perlu ditangani melalui teknik penanganan ketidakseimbangan kelas (misalnya, oversampling dengan SMOTE, undersampling, atau penyesuaian bobot kelas) jika klasifikasi kelas `positif` menjadi tujuan utama.
*   **Overfitting pada Model Deep Learning**: Model Transformer dan LSTM menunjukkan tanda-tanda overfitting yang kuat. Untuk meningkatkan performa model deep learning, diperlukan strategi regularisasi yang lebih agresif, seperti dropout yang lebih tinggi, penambahan data (jika memungkinkan), penggunaan *pre-trained embeddings* yang lebih efektif, atau tuning hyperparameter yang lebih cermat.
*   **Langkah Selanjutnya**: Disarankan untuk fokus pada peningkatan model Naive Bayes atau melakukan tuning lebih lanjut pada SVM/Logistic Regression, sambil secara serius mengatasi ketidakseimbangan kelas. Untuk model deep learning, perlu dilakukan upaya signifikan untuk mengatasi overfitting dan mengoptimalkan arsitektur/hyperparameter.