## Import Library and Dataset

In [1]:
import pandas as pd

COL_NAMES = ['text', 'label']

train_df = pd.read_csv("./data/train_preprocess.tsv", sep='\t', names=COL_NAMES, header=None)
valid_df = pd.read_csv("./data/valid_preprocess.tsv", sep='\t', names=COL_NAMES, header=None)
test_df = pd.read_csv("./data/test_preprocess.tsv", sep='\t', names=COL_NAMES, header=None)

## Preprocessing

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import spacy
from spacy.lang.id import Indonesian
import re

nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):

    stop_words = set(stopwords.words("indonesian"))
    stop_words = stop_words - {'tidak', 'bukan', 'kurang', 'tak'}
    text = ' '.join([word for word in text.split() if word not in stop_words])

    # nlp = Indonesian()
    # doc = nlp(text)
    # filtered_words = [token.text for token in doc if token.pos_ in {'ADJ', 'VERB'}]
    # text = ' '.join(filtered_words)

    # factory = StemmerFactory()
    # stemmer = factory.create_stemmer()
    # text = stemmer.stem(text)
    
    return text

x_train = train_df['text'].apply(preprocess_text)
x_valid = valid_df['text'].apply(preprocess_text)
x_test = test_df['text'].apply(preprocess_text)

y_train = train_df['label']
y_valid = valid_df['label']
y_test = test_df['label']

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Feature Extraction

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import RegexpTokenizer

token = RegexpTokenizer(r'[a-zA-Z0-9]+')

### Count Vectorizer

In [4]:
cv = CountVectorizer(lowercase=True, ngram_range=(1,2), tokenizer=token.tokenize)
x_train_cv = cv.fit_transform(x_train)
x_valid_cv = cv.transform(x_valid)
x_test_cv = cv.transform(x_test)



### TF-IDF Vectorizer

In [5]:
tf = TfidfVectorizer(lowercase=True, ngram_range=(1,2), tokenizer=token.tokenize)
x_train_tf = tf.fit_transform(x_train)
x_valid_tf = tf.transform(x_valid)
x_test_tf = tf.transform(x_test)

## Classification

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

### Naive Bayes + Count Vectorizer

In [7]:
nb_cv = MultinomialNB().fit(x_train_cv, y_train)
predicted = nb_cv.predict(x_valid_cv)
print("Akurasi validasi: " + str(accuracy_score(y_valid, predicted)))
print(classification_report(y_valid, predicted))

predicted = nb_cv.predict(x_test_cv)
print("Akurasi testing: " + str(accuracy_score(y_test, predicted)))
print(classification_report(y_test, predicted))

Akurasi validasi: 0.8476190476190476
              precision    recall  f1-score   support

    negative       0.79      0.80      0.80       394
     neutral       0.95      0.48      0.64       131
    positive       0.87      0.94      0.90       735

    accuracy                           0.85      1260
   macro avg       0.87      0.74      0.78      1260
weighted avg       0.85      0.85      0.84      1260

Akurasi testing: 0.696
              precision    recall  f1-score   support

    negative       0.67      0.93      0.78       204
     neutral       0.83      0.11      0.20        88
    positive       0.73      0.72      0.72       208

    accuracy                           0.70       500
   macro avg       0.74      0.59      0.57       500
weighted avg       0.72      0.70      0.65       500



### Logistic Regression + Count Vectorizer

In [8]:
lr_cv = LogisticRegression().fit(x_train_cv, y_train)
predicted = lr_cv.predict(x_valid_cv)
print("Akurasi validasi: " + str(accuracy_score(y_valid, predicted)))
print(classification_report(y_valid, predicted))

predicted = lr_cv.predict(x_test_cv)
print("Akurasi testing: " + str(accuracy_score(y_test, predicted)))
print(classification_report(y_test, predicted))

Akurasi validasi: 0.8857142857142857
              precision    recall  f1-score   support

    negative       0.83      0.88      0.85       394
     neutral       0.87      0.66      0.75       131
    positive       0.92      0.93      0.92       735

    accuracy                           0.89      1260
   macro avg       0.87      0.82      0.84      1260
weighted avg       0.89      0.89      0.88      1260

Akurasi testing: 0.802
              precision    recall  f1-score   support

    negative       0.78      0.92      0.84       204
     neutral       0.69      0.42      0.52        88
    positive       0.86      0.85      0.86       208

    accuracy                           0.80       500
   macro avg       0.77      0.73      0.74       500
weighted avg       0.80      0.80      0.79       500



### Random Forest + Count Vectorizer

In [9]:
forest_cv = RandomForestClassifier(n_estimators=100).fit(x_train_cv, y_train)
predicted = forest_cv.predict(x_valid_cv)
print("Akurasi validasi: " + str(accuracy_score(y_valid, predicted)))
print(classification_report(y_valid, predicted))

predicted = forest_cv.predict(x_test_cv)
print("Akurasi testing: " + str(accuracy_score(y_test, predicted)))
print(classification_report(y_test, predicted))

Akurasi validasi: 0.8571428571428571
              precision    recall  f1-score   support

    negative       0.82      0.80      0.81       394
     neutral       0.88      0.44      0.59       131
    positive       0.87      0.96      0.91       735

    accuracy                           0.86      1260
   macro avg       0.86      0.74      0.77      1260
weighted avg       0.86      0.86      0.85      1260

Akurasi testing: 0.714
              precision    recall  f1-score   support

    negative       0.65      0.90      0.76       204
     neutral       0.82      0.20      0.33        88
    positive       0.79      0.75      0.77       208

    accuracy                           0.71       500
   macro avg       0.75      0.62      0.62       500
weighted avg       0.74      0.71      0.69       500



### Naive Bayes + TF-IDF Vectorizer

In [10]:
nb_tf = MultinomialNB().fit(x_train_tf, y_train)
predicted = nb_tf.predict(x_valid_tf)
print("Akurasi validasi: " + str(accuracy_score(y_valid, predicted)))
print(classification_report(y_valid, predicted))

predicted = nb_tf.predict(x_test_cv)
print("Akurasi testing: " + str(accuracy_score(y_test, predicted)))
print(classification_report(y_test, predicted))

Akurasi validasi: 0.7825396825396825
              precision    recall  f1-score   support

    negative       0.82      0.63      0.71       394
     neutral       1.00      0.14      0.24       131
    positive       0.77      0.98      0.86       735

    accuracy                           0.78      1260
   macro avg       0.86      0.58      0.60      1260
weighted avg       0.81      0.78      0.75      1260

Akurasi testing: 0.652
              precision    recall  f1-score   support

    negative       0.80      0.66      0.72       204
     neutral       1.00      0.02      0.04        88
    positive       0.57      0.91      0.70       208

    accuracy                           0.65       500
   macro avg       0.79      0.53      0.49       500
weighted avg       0.74      0.65      0.60       500



### Logistic Regression + TF-IDF Vectorizer

In [11]:
lr_tf = LogisticRegression().fit(x_train_tf, y_train)
predicted = lr_tf.predict(x_valid_tf)
print("Akurasi validasi: " + str(accuracy_score(y_valid, predicted)))
print(classification_report(y_valid, predicted))

predicted = lr_tf.predict(x_test_tf)
print("Akurasi testing: " + str(accuracy_score(y_test, predicted)))
print(classification_report(y_test, predicted))

Akurasi validasi: 0.8753968253968254
              precision    recall  f1-score   support

    negative       0.81      0.86      0.83       394
     neutral       0.91      0.51      0.65       131
    positive       0.91      0.95      0.93       735

    accuracy                           0.88      1260
   macro avg       0.87      0.77      0.80      1260
weighted avg       0.88      0.88      0.87      1260

Akurasi testing: 0.762
              precision    recall  f1-score   support

    negative       0.71      0.94      0.81       204
     neutral       0.85      0.26      0.40        88
    positive       0.82      0.80      0.81       208

    accuracy                           0.76       500
   macro avg       0.79      0.67      0.67       500
weighted avg       0.78      0.76      0.74       500



### Random Forest + TF-IDF Vectorizer

In [12]:
forest_tf = RandomForestClassifier(n_estimators=100).fit(x_train_tf, y_train)
predicted = forest_tf.predict(x_valid_tf)
print("Akurasi validasi: " + str(accuracy_score(y_valid, predicted)))
print(classification_report(y_valid, predicted))

predicted = forest_tf.predict(x_test_tf)
print("Akurasi testing: " + str(accuracy_score(y_test, predicted)))
print(classification_report(y_test, predicted))

Akurasi validasi: 0.8626984126984127
              precision    recall  f1-score   support

    negative       0.81      0.82      0.82       394
     neutral       0.87      0.47      0.61       131
    positive       0.89      0.95      0.92       735

    accuracy                           0.86      1260
   macro avg       0.86      0.75      0.78      1260
weighted avg       0.86      0.86      0.86      1260

Akurasi testing: 0.682
              precision    recall  f1-score   support

    negative       0.62      0.89      0.73       204
     neutral       0.86      0.22      0.35        88
    positive       0.76      0.67      0.71       208

    accuracy                           0.68       500
   macro avg       0.75      0.59      0.60       500
weighted avg       0.72      0.68      0.66       500

