In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Preprocessing function
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('indonesian'))
    stop_words = stop_words - {'tidak', 'kurang', 'bukan', 'tak', 'belum', 'enggak', 'enggaklah', 'janganlah', 'jangan', 'jgn', 'jgnlah', 'nggak', 'nggaklah', 'ga', 'gak', 'gaklah', 'tdk', 'tdklah', 'taklah', 'kuranglah', 'bukanlah', 'tidaklah'}
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

[nltk_data] Downloading package punkt to C:\Users\S W I F T
[nltk_data]     X\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\S W I F T
[nltk_data]     X\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\S W I F T
[nltk_data]     X\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\S W I F T
[nltk_data]     X\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load data
train_data = pd.read_csv('data/train_preprocess.tsv', sep='\t', header=None, names=['text', 'label'])
valid_data = pd.read_csv('data/valid_preprocess.tsv', sep='\t', header=None, names=['text', 'label'])
test_data = pd.read_csv('data/test_preprocess.tsv', sep='\t', header=None, names=['text', 'label'])

# Preprocess data
X_train = train_data['text'].apply(preprocess_text)
y_train = train_data['label']

X_valid = valid_data['text'].apply(preprocess_text)
y_valid = valid_data['label']

X_test = test_data['text'].apply(preprocess_text)
y_test = test_data['label']

In [3]:
# Use Bag of Words
vectorizer = CountVectorizer(lowercase=True, ngram_range=(1,2))
X_train_bow = vectorizer.fit_transform(X_train)
X_valid_bow = vectorizer.transform(X_valid)
X_test_bow = vectorizer.transform(X_test)

# Logistic Regression

In [4]:
# Logistic Regression model
lr_model = LogisticRegression(max_iter=200)
lr_model.fit(X_train_bow, y_train)

y_valid_pred = lr_model.predict(X_valid_bow)
print('Validation Accuracy:', accuracy_score(y_valid, y_valid_pred))
print('Validation Classification Report:\n', classification_report(y_valid, y_valid_pred))

y_test_pred = lr_model.predict(X_test_bow)
print('Test Accuracy:', accuracy_score(y_test, y_test_pred))
print('Test Classification Report:\n', classification_report(y_test, y_test_pred))

Validation Accuracy: 0.8873015873015873
Validation Classification Report:
               precision    recall  f1-score   support

    negative       0.83      0.89      0.86       394
     neutral       0.86      0.64      0.73       131
    positive       0.92      0.93      0.93       735

    accuracy                           0.89      1260
   macro avg       0.87      0.82      0.84      1260
weighted avg       0.89      0.89      0.89      1260

Test Accuracy: 0.808
Test Classification Report:
               precision    recall  f1-score   support

    negative       0.78      0.95      0.86       204
     neutral       0.71      0.41      0.52        88
    positive       0.87      0.84      0.85       208

    accuracy                           0.81       500
   macro avg       0.78      0.73      0.74       500
weighted avg       0.80      0.81      0.80       500



# Naive Bayes

In [5]:
# Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_bow, y_train)

y_valid_pred = nb_model.predict(X_valid_bow)
print('Validation Accuracy:', accuracy_score(y_valid, y_valid_pred))
print('Validation Classification Report:\n', classification_report(y_valid, y_valid_pred))

y_test_pred = nb_model.predict(X_test_bow)
print('Test Accuracy:', accuracy_score(y_test, y_test_pred))
print('Test Classification Report:\n', classification_report(y_test, y_test_pred))

Validation Accuracy: 0.8507936507936508
Validation Classification Report:
               precision    recall  f1-score   support

    negative       0.79      0.81      0.80       394
     neutral       0.97      0.47      0.64       131
    positive       0.87      0.94      0.90       735

    accuracy                           0.85      1260
   macro avg       0.88      0.74      0.78      1260
weighted avg       0.86      0.85      0.84      1260

Test Accuracy: 0.698
Test Classification Report:
               precision    recall  f1-score   support

    negative       0.67      0.92      0.78       204
     neutral       0.83      0.11      0.20        88
    positive       0.72      0.73      0.73       208

    accuracy                           0.70       500
   macro avg       0.74      0.59      0.57       500
weighted avg       0.72      0.70      0.65       500



# SVM

In [6]:
# SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_bow, y_train)

y_valid_pred = svm_model.predict(X_valid_bow)
print('Validation Accuracy:', accuracy_score(y_valid, y_valid_pred))
print('Validation Classification Report:\n', classification_report(y_valid, y_valid_pred))

y_test_pred = svm_model.predict(X_test_bow)
print('Test Accuracy:', accuracy_score(y_test, y_test_pred))
print('Test Classification Report:\n', classification_report(y_test, y_test_pred))

Validation Accuracy: 0.8674603174603175
Validation Classification Report:
               precision    recall  f1-score   support

    negative       0.80      0.88      0.84       394
     neutral       0.78      0.58      0.66       131
    positive       0.92      0.91      0.92       735

    accuracy                           0.87      1260
   macro avg       0.83      0.79      0.81      1260
weighted avg       0.87      0.87      0.87      1260

Test Accuracy: 0.778
Test Classification Report:
               precision    recall  f1-score   support

    negative       0.75      0.90      0.82       204
     neutral       0.66      0.38      0.48        88
    positive       0.84      0.83      0.83       208

    accuracy                           0.78       500
   macro avg       0.75      0.70      0.71       500
weighted avg       0.77      0.78      0.77       500

