# **Perbandingan Algoritma Support Vector Machine, Rule-Based Classifier, dan Gradient Boosted Decision Tree dalam Analisis Sentimen**


---

KELOMPOK REKOGNISI


> Fachri Kurniansyah (M0721025)


> Felix (M0721028)

# Import Library

In [1]:
import asyncio
import matplotlib.pyplot as plt
import nest_asyncio
import nltk
import optuna
import pandas as pd
import re

from abc import ABC, abstractmethod
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

nest_asyncio.apply()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\celle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\celle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Import Data

In [3]:
column_names = ['text', 'label']
train_data = pd.read_csv('train_preprocess.tsv', sep='\t', header=None, names=column_names)
test_data = pd.read_csv('test_preprocess.tsv', sep='\t', header=None, names=column_names)

In [4]:
train_data

Unnamed: 0,text,label
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative
...,...,...
10995,tidak kecewa,positive
10996,enak rasa masakan nya apalagi kepiting yang me...,positive
10997,hormati partai-partai yang telah berkoalisi,neutral
10998,"pagi pagi di tol pasteur sudah macet parah , b...",negative


In [5]:
test_data

Unnamed: 0,text,label
0,kemarin gue datang ke tempat makan baru yang a...,negative
1,kayak nya sih gue tidak akan mau balik lagi ke...,negative
2,"kalau dipikir-pikir , sebenarnya tidak ada yan...",negative
3,ini pertama kalinya gua ke bank buat ngurusin ...,negative
4,waktu sampai dengan gue pernah disuruh ibu lat...,negative
...,...,...
495,kata nya tidur yang baik itu minimal enam jam ...,neutral
496,indonesia itu ada di benua asia .,neutral
497,salah satu kegemaran anak remaja indonesia sek...,neutral
498,melihat warna hijau bisa bikin mata jadi lebih...,positive


# Preprocessing Data

In [6]:
stopword_factory = StopWordRemoverFactory()
stopword_remover = stopword_factory.create_stop_word_remover()
stop_words = set(stopwords.words('indonesian'))
factory = StopWordRemoverFactory()
stop_words_sastrawi = set(factory.get_stop_words())
stop_words = stop_words.union(stop_words_sastrawi)
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

def clean_text(text):
    if type(text) == float:
        return ""
    temp = text.lower()
    temp = re.sub(r'^RT\s+', '', temp, flags=re.IGNORECASE).strip()
    temp = re.sub("@\S+","", temp)                # Remove mentions
    temp = re.sub("#[A-Za-z0-9_]+","", temp)      # Remove hashtags
    temp = re.sub(r"https\S+","", temp)           # Remove URLs
    temp = re.sub('[()!?]', '', temp)             # Remove specific punctuations
    temp = re.sub("\[.*?\]","", temp)             # Remove text inside square brackets
    temp = re.sub("[^a-z0-9\s]", "", temp)        # Remove non-alphanumeric characters (preserve spaces)
    temp = re.sub(r'[0-9]', '', temp)             # Remove digits
    temp = re.sub('\s+', ' ', temp).strip()       # Replace multiple spaces with a single space and strip leading/trailing spaces
    temp = ' '.join([word for word in temp.split() if word not in stop_words])
    temp = stemmer.stem(temp)
    return temp

train_data['clean'] = train_data['text'].apply(lambda x: clean_text(x))
test_data['clean'] = test_data['text'].apply(lambda x: clean_text(x))

  temp = re.sub("@\S+","", temp)                # Remove mentions
  temp = re.sub("\[.*?\]","", temp)             # Remove text inside square brackets
  temp = re.sub("[^a-z0-9\s]", "", temp)        # Remove non-alphanumeric characters (preserve spaces)
  temp = re.sub('\s+', ' ', temp).strip()       # Replace multiple spaces with a single space and strip leading/trailing spaces


# TF-IDF


In [7]:
X_train, y_train = train_data['clean'], train_data['label']
X_test, y_test = test_data['clean'], test_data['label']
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [8]:
print("Distribusi label:\n", y_train.value_counts())

Distribusi label:
 label
positive    6416
negative    3436
neutral     1148
Name: count, dtype: int64


# Machine Learning Modelling

In [9]:
class BaseModel(ABC):
    def __init__(self, name):
        self.name = name
        self.best_params = None
        self.model = None

    @abstractmethod
    def tune(self, X_train, y_train):
        pass

    @abstractmethod
    def train(self, X_train, y_train):
        pass

    def evaluate(self, X_test, y_test):
        y_pred = self.model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        return accuracy, report

In [10]:
class SVMModel(BaseModel):
    def __init__(self):
        super().__init__('SVM')

    def tune(self, X_train, y_train):
        def objective(trial):
            C = trial.suggest_loguniform('C', 1e-3, 1e3)
            kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
            gamma = trial.suggest_loguniform('gamma', 1e-4, 1e1)
            degree = trial.suggest_int('degree', 2, 5) if kernel == 'poly' else 3

            model = SVC(C=C, kernel=kernel, gamma=gamma, degree=degree, random_state=42)
            score = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
            return score.mean()

        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=30)
        self.best_params = study.best_params

    def train(self, X_train, y_train):
        self.model = SVC(**self.best_params, random_state=42)
        self.model.fit(X_train, y_train)


In [11]:
class GBDTModel(BaseModel):
    def __init__(self):
        super().__init__('GBDT')

    def tune(self, X_train, y_train):
        def objective(trial):
            # Convert sparse to dense
            X_train_dense = X_train.toarray()

            # Hyperparameter tuning
            learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.2)
            max_depth = trial.suggest_int('max_depth', 3, 10)
            max_iter = trial.suggest_int('max_iter', 50, 200)

            model = HistGradientBoostingClassifier(
                learning_rate=learning_rate, max_depth=max_depth, max_iter=max_iter, random_state=42
            )
            score = cross_val_score(model, X_train_dense, y_train, cv=3, scoring='accuracy')
            return score.mean()

        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=30)
        self.best_params = study.best_params

    def train(self, X_train, y_train):
        # Convert sparse to dense
        X_train_dense = X_train.toarray()

        self.model = HistGradientBoostingClassifier(**self.best_params, random_state=42)
        self.model.fit(X_train_dense, y_train)


In [12]:
class RuleBasedModel(BaseModel):
    def __init__(self):
        super().__init__('Rule-Based')

    def tune(self, X_train, y_train):
        """
        Tune the DummyClassifier to select the best strategy.
        """
        def objective(trial):
            # Suggest a strategy to evaluate
            strategy = trial.suggest_categorical(
                'strategy', ['most_frequent', 'prior', 'stratified', 'uniform']
            )
            # Create and evaluate the model
            model = DummyClassifier(strategy=strategy)
            score = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
            return score.mean()

        # Use Optuna to find the best strategy
        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=10)  # Limited trials since it's lightweight

        # Save the best parameters
        self.best_params = {'strategy': study.best_params['strategy']}
        print(f"Best strategy for Rule-Based: {self.best_params['strategy']}")

    def train(self, X_train, y_train):
        """
        Train the DummyClassifier with the selected strategy.
        """
        self.model = DummyClassifier(strategy=self.best_params['strategy'])
        self.model.fit(X_train, y_train)

In [13]:
async def process_model(model, X_train, y_train, X_test, y_test):
    print(f"Tuning {model.name}...")
    await asyncio.to_thread(model.tune, X_train, y_train)
    print(f"Best parameters for {model.name}: {model.best_params}")

    print(f"Training {model.name}...")
    await asyncio.to_thread(model.train, X_train, y_train)

    print(f"Evaluating {model.name}...")
    accuracy, report = model.evaluate(X_test, y_test)
    print(f"Accuracy for {model.name}: {accuracy}")
    print(report)
    return model.name, accuracy

async def main(models, X_train, y_train, X_test, y_test):
    tasks = [process_model(model, X_train, y_train, X_test, y_test) for model in models]
    results = await asyncio.gather(*tasks)
    return results


In [None]:
# Instantiate models
svm_model = SVMModel()
gbdt_model = GBDTModel()
rule_based_model = RuleBasedModel()

models = [svm_model, gbdt_model, rule_based_model]

# Run all models asynchronously
results = asyncio.run(main(models, X_train_tfidf, y_train, X_test_tfidf, y_test))

# Compare Results
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy'])
print(results_df.sort_values(by='Accuracy', ascending=False))

[I 2024-12-25 13:43:12,302] A new study created in memory with name: no-name-cd4c0887-6c2a-4bac-b32d-900ff9b85e14
  C = trial.suggest_loguniform('C', 1e-3, 1e3)
[I 2024-12-25 13:43:12,303] A new study created in memory with name: no-name-d91bffe8-48ba-45ae-90a0-c5dd0785e54b
  gamma = trial.suggest_loguniform('gamma', 1e-4, 1e1)
[I 2024-12-25 13:43:12,304] A new study created in memory with name: no-name-57037961-46c9-460d-b613-9aebc841500d
[I 2024-12-25 13:43:12,358] Trial 0 finished with value: 0.5832727203840468 and parameters: {'strategy': 'prior'}. Best is trial 0 with value: 0.5832727203840468.


[I 2024-12-25 13:43:12,380] Trial 1 finished with value: 0.4505461810668107 and parameters: {'strategy': 'stratified'}. Best is trial 0 with value: 0.5832727203840468.
[I 2024-12-25 13:43:12,399] Trial 2 finished with value: 0.34472737760833433 and parameters: {'strategy': 'uniform'}. Best is trial 0 with value: 0.5832727203840468.
[I 2024-12-25 13:43:12,416] Trial 3 finished with value: 0.5832727203840468 and parameters: {'strategy': 'most_frequent'}. Best is trial 0 with value: 0.5832727203840468.
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.2)
[I 2024-12-25 13:43:12,441] Trial 4 finished with value: 0.4486356271832254 and parameters: {'strategy': 'stratified'}. Best is trial 0 with value: 0.5832727203840468.


Tuning SVM...
Tuning GBDT...
Tuning Rule-Based...


[I 2024-12-25 13:43:12,458] Trial 5 finished with value: 0.34518279918311245 and parameters: {'strategy': 'uniform'}. Best is trial 0 with value: 0.5832727203840468.
[I 2024-12-25 13:43:12,481] Trial 6 finished with value: 0.4501824041885197 and parameters: {'strategy': 'stratified'}. Best is trial 0 with value: 0.5832727203840468.
[I 2024-12-25 13:43:12,499] Trial 7 finished with value: 0.5832727203840468 and parameters: {'strategy': 'most_frequent'}. Best is trial 0 with value: 0.5832727203840468.
[I 2024-12-25 13:43:12,517] Trial 8 finished with value: 0.45272690827640377 and parameters: {'strategy': 'stratified'}. Best is trial 0 with value: 0.5832727203840468.
[I 2024-12-25 13:43:12,535] Trial 9 finished with value: 0.33690884025669837 and parameters: {'strategy': 'uniform'}. Best is trial 0 with value: 0.5832727203840468.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(av

Best strategy for Rule-Based: prior
Best parameters for Rule-Based: {'strategy': 'prior'}
Training Rule-Based...
Evaluating Rule-Based...
Accuracy for Rule-Based: 0.416
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       204
     neutral       0.00      0.00      0.00        88
    positive       0.42      1.00      0.59       208

    accuracy                           0.42       500
   macro avg       0.14      0.33      0.20       500
weighted avg       0.17      0.42      0.24       500



[I 2024-12-25 13:43:31,228] Trial 0 finished with value: 0.8471813775993087 and parameters: {'C': 681.5753710157569, 'kernel': 'rbf', 'gamma': 0.0011028492644940728}. Best is trial 0 with value: 0.8471813775993087.
  C = trial.suggest_loguniform('C', 1e-3, 1e3)
  gamma = trial.suggest_loguniform('gamma', 1e-4, 1e1)
[I 2024-12-25 13:44:02,439] Trial 1 finished with value: 0.5832727203840468 and parameters: {'C': 4.815398881421919, 'kernel': 'rbf', 'gamma': 0.00032308548110339673}. Best is trial 0 with value: 0.8471813775993087.
  C = trial.suggest_loguniform('C', 1e-3, 1e3)
  gamma = trial.suggest_loguniform('gamma', 1e-4, 1e1)
[I 2024-12-25 13:44:31,681] Trial 2 finished with value: 0.5832727203840468 and parameters: {'C': 1.075280970650158, 'kernel': 'sigmoid', 'gamma': 0.00026448695406225273}. Best is trial 0 with value: 0.8471813775993087.
  C = trial.suggest_loguniform('C', 1e-3, 1e3)
  gamma = trial.suggest_loguniform('gamma', 1e-4, 1e1)
[I 2024-12-25 13:44:59,071] Trial 3 finishe

Best parameters for SVM: {'C': 681.5753710157569, 'kernel': 'rbf', 'gamma': 0.0011028492644940728}
Training SVM...
Evaluating SVM...
Accuracy for SVM: 0.744
              precision    recall  f1-score   support

    negative       0.70      0.88      0.78       204
     neutral       0.68      0.47      0.55        88
    positive       0.82      0.73      0.77       208

    accuracy                           0.74       500
   macro avg       0.74      0.69      0.70       500
weighted avg       0.75      0.74      0.74       500



[I 2024-12-25 14:02:30,365] Trial 3 finished with value: 0.7781812276848511 and parameters: {'learning_rate': 0.016155406661892423, 'max_depth': 10, 'max_iter': 79}. Best is trial 0 with value: 0.8333631624918492.
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.2)
[I 2024-12-25 14:08:50,641] Trial 4 finished with value: 0.8134540216623662 and parameters: {'learning_rate': 0.04007830204595277, 'max_depth': 6, 'max_iter': 194}. Best is trial 0 with value: 0.8333631624918492.
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.2)
[I 2024-12-25 14:13:30,712] Trial 5 finished with value: 0.7999089057667872 and parameters: {'learning_rate': 0.03787848938179212, 'max_depth': 6, 'max_iter': 126}. Best is trial 0 with value: 0.8333631624918492.
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.2)
[I 2024-12-25 14:17:24,205] Trial 6 finished with value: 0.8168178481815347 and parameters: {'learning_rate': 0.07512218844443345, 'max_depth': 10,

In [None]:
results_df.to_csv("results.csv", index=False)