# KLASIFIKASI DATA

## IMPORT LIBRARY

In [None]:
import pandas as pd
import numpy as np
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from gensim.models import Word2Vec

from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [20]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\62877\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\62877\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\62877\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## LOAD DATASET

In [21]:
df = pd.read_csv('data_label_manual.csv')

print("Kolom dataset:", df.columns)
df.head()

Kolom dataset: Index(['clean_text', 'sentimen'], dtype='object')


Unnamed: 0,clean_text,sentimen
0,yes betul sebagai pelatih timnas menangani tim...,positif
1,gua gak peduli siapa yang main yang penting ti...,positif
2,peluang timnas indonesia ke piala dunia masih ...,positif
3,berharap menang dari jepang dianggap gak masuk...,positif
4,iya iya gak boleh ngarep menang dari jepang ga...,positif


## PREPROCESSING (Stemming & Lemmatization)

In [None]:
stop_words = set(stopwords.words('indonesian'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip()

    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words]

    tokens = [stemmer.stem(t) for t in tokens]

    tokens = [lemmatizer.lemmatize(t) for t in tokens]

    return " ".join(tokens)

df['clean_text'] = df.iloc[:, 0].apply(preprocess_text)

## SPLIT DATA

In [23]:
X = df['clean_text']
y = df.iloc[:, 1]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## EKSTRASI FITUR

### BOW

In [24]:
bow = CountVectorizer(max_features=5000)
X_train_bow = bow.fit_transform(X_train)
X_test_bow = bow.transform(X_test)

### TF-IDF

In [25]:
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

### Word2Vec

In [26]:
sentences = [text.split() for text in X_train]

w2v = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4
)

def w2v_vector(text):
    vectors = [w2v.wv[word] for word in text.split() if word in w2v.wv]
    if len(vectors) == 0:
        return np.zeros(100)
    return np.mean(vectors, axis=0)

X_train_w2v = np.array([w2v_vector(text) for text in X_train])
X_test_w2v = np.array([w2v_vector(text) for text in X_test])

## KLASIFIKASI

### NAIVE BAES

In [31]:
nb = MultinomialNB()

nb.fit(X_train_bow, y_train)
pred_nb_bow = nb.predict(X_test_bow)

print("Naive Bayes menggunakan BoW")
print(classification_report(y_test, pred_nb_bow))

Naive Bayes menggunakan BoW
              precision    recall  f1-score   support

     negatif       0.62      0.80      0.70        20
      netral       1.00      0.40      0.57        10
     positif       0.70      0.70      0.70        20

    accuracy                           0.68        50
   macro avg       0.77      0.63      0.66        50
weighted avg       0.73      0.68      0.67        50



In [34]:
nb.fit(X_train_tfidf, y_train)
pred_nb_tfidf = nb.predict(X_test_tfidf)

print("Naive Bayes menggunakan TF-IDF")
print(classification_report(y_test, pred_nb_tfidf))

Naive Bayes menggunakan TF-IDF
              precision    recall  f1-score   support

     negatif       0.58      0.75      0.65        20
      netral       1.00      0.30      0.46        10
     positif       0.62      0.65      0.63        20

    accuracy                           0.62        50
   macro avg       0.73      0.57      0.58        50
weighted avg       0.68      0.62      0.61        50



### DECISION TREE

In [35]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_tfidf, y_train)
pred_dt = dt.predict(X_test_tfidf)

print("Decision Tree menggunakan TF-IDF")
print(classification_report(y_test, pred_dt))

Decision Tree menggunakan TF-IDF
              precision    recall  f1-score   support

     negatif       0.79      0.75      0.77        20
      netral       0.53      0.80      0.64        10
     positif       0.69      0.55      0.61        20

    accuracy                           0.68        50
   macro avg       0.67      0.70      0.67        50
weighted avg       0.70      0.68      0.68        50



### SVM

In [36]:
svm = SVC(kernel='linear')
svm.fit(X_train_tfidf, y_train)
pred_svm = svm.predict(X_test_tfidf)

print("SVM menggunakan TF-IDF")
print(classification_report(y_test, pred_svm))

SVM menggunakan TF-IDF
              precision    recall  f1-score   support

     negatif       0.65      0.75      0.70        20
      netral       1.00      0.40      0.57        10
     positif       0.65      0.75      0.70        20

    accuracy                           0.68        50
   macro avg       0.77      0.63      0.66        50
weighted avg       0.72      0.68      0.67        50



### LOGISTIC REGRESSION

In [37]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_tfidf, y_train)
pred_lr = lr.predict(X_test_tfidf)

print("Logistic Regression menggunakan TF-IDF")
print(classification_report(y_test, pred_lr))


Logistic Regression menggunakan TF-IDF
              precision    recall  f1-score   support

     negatif       0.62      0.75      0.68        20
      netral       1.00      0.40      0.57        10
     positif       0.68      0.75      0.71        20

    accuracy                           0.68        50
   macro avg       0.77      0.63      0.66        50
weighted avg       0.72      0.68      0.67        50



In [None]:
lr_w2v = LogisticRegression(max_iter=1000)
lr_w2v.fit(X_train_w2v, y_train)
pred_w2v = lr_w2v.predict(X_test_w2v)

print("Logistic Regression menggunakan Word2Vec")
print(classification_report(y_test, pred_w2v))

Logistic Regression menggunakan Word2Vec
              precision    recall  f1-score   support

     negatif       0.46      0.65      0.54        20
      netral       0.00      0.00      0.00        10
     positif       0.36      0.40      0.38        20

    accuracy                           0.42        50
   macro avg       0.28      0.35      0.31        50
weighted avg       0.33      0.42      0.37        50



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
