In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Fetch the NLTK resources this notebook relies on: tokenizer models,
# the stopword lists and the WordNet data.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Words removed from the stopword list so that they survive preprocessing.
# Presumably kept because they negate the phrase they belong to -- confirm.
_NEGATION_WORDS = frozenset({
    'tidak', 'kurang', 'bukan', 'tak', 'belum', 'enggak', 'enggaklah',
    'janganlah', 'jangan', 'jgn', 'jgnlah', 'nggak', 'nggaklah', 'ga', 'gak',
    'gaklah', 'tdk', 'tdklah', 'taklah', 'kuranglah', 'bukanlah', 'tidaklah',
})

# Filled on the first call to preprocess_text so the stopword corpus is read
# and the lemmatizer is built once, not once per row.
_PREPROCESS_CACHE = {}


def _get_preprocess_resources():
    """Return ``(stop_words, lemmatizer)``, building both on the first call only."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = (
            frozenset(stopwords.words('indonesian')) - _NEGATION_WORDS
        )
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


# Preprocessing function
def preprocess_text(text):
    """Clean one document and return it as a space-joined string of tokens.

    Steps, in order:
      1. drop every character that is not an ASCII letter or whitespace;
      2. lowercase, so stopword matching does not depend on casing;
      3. tokenize with ``word_tokenize``;
      4. remove Indonesian stopwords, except the words in ``_NEGATION_WORDS``;
      5. lemmatize each remaining token with ``WordNetLemmatizer``.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example a
            NaN cell coming from ``read_csv``) is treated as an empty document.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing is left.
    """
    # A missing cell would otherwise make re.sub raise TypeError.
    if not isinstance(text, str):
        return ''

    # Remove non-alphabetic characters, then normalise case: the stopword
    # entries are compared verbatim, so "Yang" would not match "yang".
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()

    stop_words, lemmatizer = _get_preprocess_resources()

    # Tokenize and remove stopwords
    tokens = [word for word in word_tokenize(text) if word not in stop_words]

    # Lemmatize
    # NOTE(review): WordNetLemmatizer looks words up in the English WordNet, so
    # it probably leaves most Indonesian tokens unchanged -- confirm whether an
    # Indonesian stemmer is wanted here instead.
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens)

In [2]:
# Load data: each split is read as a tab-separated file without a header row,
# and its two columns are named 'text' and 'label'.
tsv_options = dict(sep='\t', header=None, names=['text', 'label'])
train_data = pd.read_csv('data/train_preprocess.tsv', **tsv_options)
valid_data = pd.read_csv('data/valid_preprocess.tsv', **tsv_options)
test_data = pd.read_csv('data/test_preprocess.tsv', **tsv_options)

# Preprocess data: the text column goes through preprocess_text,
# the label column is taken as-is.
X_train, y_train = train_data['text'].apply(preprocess_text), train_data['label']
X_valid, y_valid = valid_data['text'].apply(preprocess_text), valid_data['label']
X_test, y_test = test_data['text'].apply(preprocess_text), test_data['label']

In [3]:
# Use Bag of Words: counts of lowercased unigrams and bigrams
vectorizer = CountVectorizer(ngram_range=(1, 2), lowercase=True)

# The vocabulary is fitted on the training split only; the validation and
# test splits are encoded with that same fitted vectorizer.
X_train_bow = vectorizer.fit_transform(X_train)
X_valid_bow, X_test_bow = map(vectorizer.transform, (X_valid, X_test))

# Logistic Regression

In [None]:
# Logistic Regression model, fitted on the bag-of-words training matrix
lr_model = LogisticRegression(max_iter=200).fit(X_train_bow, y_train)

# Validation split: predict, then report accuracy and per-class metrics
y_valid_pred = lr_model.predict(X_valid_bow)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_report = classification_report(y_valid, y_valid_pred)
print('Validation Accuracy:', valid_accuracy)
print('Validation Classification Report:\n', valid_report)

# Test split: same evaluation
y_test_pred = lr_model.predict(X_test_bow)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)
print('Test Accuracy:', test_accuracy)
print('Test Classification Report:\n', test_report)

# Naive Bayes

In [None]:
# Naive Bayes model, fitted on the bag-of-words training matrix
nb_model = MultinomialNB().fit(X_train_bow, y_train)

# Validation split: predict, then report accuracy and per-class metrics
y_valid_pred = nb_model.predict(X_valid_bow)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_report = classification_report(y_valid, y_valid_pred)
print('Validation Accuracy:', valid_accuracy)
print('Validation Classification Report:\n', valid_report)

# Test split: same evaluation
y_test_pred = nb_model.predict(X_test_bow)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)
print('Test Accuracy:', test_accuracy)
print('Test Classification Report:\n', test_report)

# SVM

In [None]:
# SVM model with a linear kernel, fitted on the bag-of-words training matrix
svm_model = SVC(kernel='linear').fit(X_train_bow, y_train)

# Validation split: predict, then report accuracy and per-class metrics
y_valid_pred = svm_model.predict(X_valid_bow)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_report = classification_report(y_valid, y_valid_pred)
print('Validation Accuracy:', valid_accuracy)
print('Validation Classification Report:\n', valid_report)

# Test split: same evaluation
y_test_pred = svm_model.predict(X_test_bow)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)
print('Test Accuracy:', test_accuracy)
print('Test Classification Report:\n', test_report)