# Persiapan Data 

In [44]:
import pandas as pd

In [45]:
!pip install PySastrawi



In [46]:
# Load the labelled expense descriptions and preview the first rows.
df = pd.read_csv("expense_dataset_10000.csv")

print("\nContoh data:", df.head(), sep="\n")


Contoh data:
    description               category
0           gym  Kesehatan & Kebugaran
1     beli baju      Belanja & Hiburan
2         iklan   Bisnis & Profesional
3      suplemen  Kesehatan & Kebugaran
4  taksi online           Transportasi


# EDA

In [47]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (doing so appended a stray "None" line).
print("\nInformasi dataset:")
df.info()


Informasi dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  10000 non-null  object
 1   category     10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB
None


In [48]:
# Count the missing entries in every column.
print("\nMissing values per kolom:", df.isnull().sum(), sep="\n")


Missing values per kolom:
description    0
category       0
dtype: int64


In [49]:
# Number of rows per expense category.
print("\nDistribusi kategori:", df["category"].value_counts(), sep="\n")


Distribusi kategori:
category
Travel & Liburan         887
Makanan & Minuman        860
Tabungan & Investasi     859
Transportasi             854
Bisnis & Profesional     839
Donasi & Hadiah          826
Belanja & Hiburan        824
Lain-lain                823
Kesehatan & Kebugaran    822
Perumahan & Akomodasi    822
Utang & Cicilan          799
Pendidikan               785
Name: count, dtype: int64


# Preprocessing Text

In [50]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import nltk

def download_nltk_resources():
    """Make sure the NLTK data packages used by this notebook are available.

    Each required package is looked up with ``nltk.data.find``; it is
    downloaded only when that lookup raises ``LookupError``. One status line
    is printed per package.
    """
    # Package name -> path under which NLTK looks it up.
    required = {
        "stopwords": "corpora/stopwords",
        "punkt": "tokenizers/punkt",
        "punkt_tab": "tokenizers/punkt_tab",
    }

    for resource, path in required.items():
        try:
            nltk.data.find(path)
        except LookupError:
            print(f"Downloading {resource}...")
            nltk.download(resource)
        else:
            print(f"Resource {resource} already downloaded.")

# Fetch any NLTK data that is still missing.
download_nltk_resources()

# Sanity check: the Indonesian stop-word list must be loadable.
print(
    "Stopwords tersedia:",
    len(stopwords.words("indonesian")),
    "kata",
)

Resource stopwords already downloaded.
Resource punkt already downloaded.
Resource punkt_tab already downloaded.
Stopwords tersedia: 758 kata


In [51]:
def preprocess_text(text):
    """Normalise a raw expense description before tokenisation.

    The text is lower-cased, every character other than ``a``-``z`` and
    whitespace is replaced by a space, and runs of whitespace are collapsed.

    Punctuation and digits are replaced rather than deleted so that the words
    on either side stay separate: ``"nasi-goreng"`` becomes ``"nasi goreng"``
    instead of the glued token ``"nasigoreng"``.

    Args:
        text: Raw description. Non-string values (e.g. ``None`` or the NaN
            pandas uses for an empty CSV field) are treated as an empty
            description instead of raising ``AttributeError``.

    Returns:
        The cleaned, lower-case, single-spaced string (possibly empty).
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Replace (not delete) everything that is not a lowercase letter or whitespace.
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Collapse the whitespace runs created above and trim both ends.
    return " ".join(text.split())

# Cleaned copy of every description (the raw column is kept unchanged).
df["description_clean"] = df["description"].map(preprocess_text)

In [52]:
# Tokenisation and stop-word removal.
# The Indonesian stop words are held in a set for fast membership tests.
stop_words_ind = {*stopwords.words("indonesian")}

def tokenize_and_remove_stopwords(text):
    """Tokenise ``text`` and drop every token contained in ``stop_words_ind``.

    Returns the remaining tokens joined by single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        if token in stop_words_ind:
            continue
        kept.append(token)
    return " ".join(kept)

# Descriptions with the stop words removed.
df["description_filtered"] = df["description_clean"].map(tokenize_and_remove_stopwords)

In [53]:
# Stemming: build the Sastrawi stemmer once through its factory; the
# `stemmer` object is reused for every row by `stem_text` below.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem_text(text):
    """Return ``text`` after passing it through the module-level ``stemmer``."""
    stemmed = stemmer.stem(text)
    return stemmed

# Stemmed descriptions; this column is the input to the TF-IDF vectorizer.
df["description_stemmed"] = df["description_filtered"].map(stem_text)

# Feature Engineering

In [54]:
# TF-IDF feature extraction; the vocabulary is capped at 1000 terms for efficiency.
# NOTE(review): the vectorizer is fitted on the full dataset, i.e. before the
# train/test split further down, so its vocabulary and IDF weights also see
# the test rows — consider fitting it on the training split only.
y = df["category"]
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X = tfidf_vectorizer.fit_transform(df["description_stemmed"])

print("Bentuk matriks TF-IDF:", X.shape)
print("Jumlah kategori unik:", y.nunique())

Bentuk matriks TF-IDF: (10000, 133)
Jumlah kategori unik: 12


# Pembagian Data & Pelatihan Model

In [55]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import pickle

In [56]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(
    "Ukuran data pelatihan (X_train, y_train):",
    X_train.shape,
    y_train.shape,
)
print(
    "Ukuran data pengujian (X_test, y_test):",
    X_test.shape,
    y_test.shape,
)

Ukuran data pelatihan (X_train, y_train): (8000, 133) (8000,)
Ukuran data pengujian (X_test, y_test): (2000, 133) (2000,)


In [57]:
# Train the Logistic Regression classifier on the training split
# (solver iteration cap set to 1000).
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train, y_train)

In [58]:
# Train the Multinomial Naive Bayes classifier on the same training split.
model_nb = MultinomialNB()
model_nb.fit(X_train, y_train)

In [59]:
# Persist the fitted vectorizer and both trained classifiers to disk.
# Pickle files can execute code when loaded — only unpickle files you trust.
artifacts = (
    ("tfidf_vectorizer.pkl", tfidf_vectorizer),
    ("logistic_regression_model.pkl", model_lr),
    ("naive_bayes_model.pkl", model_nb),
)
for filename, artifact in artifacts:
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)

print("Model Logistic Regression dan Naive Bayes berhasil dilatih dan disimpan.")

Model Logistic Regression dan Naive Bayes berhasil dilatih dan disimpan.


In [60]:
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

In [61]:
# Evaluate Logistic Regression on the held-out test split.
# NOTE(review): a perfect 1.00 on every class is suspicious. The TF-IDF matrix
# has only 133 terms for 10000 rows, so the test split presumably contains
# descriptions that also occur in the training split — verify by
# de-duplicating descriptions before splitting.
y_pred_lr = model_lr.predict(X_test)
print("\n--- Evaluasi Model Logistic Regression ---")
print(classification_report(y_test, y_pred_lr))


--- Evaluasi Model Logistic Regression ---
                       precision    recall  f1-score   support

    Belanja & Hiburan       1.00      1.00      1.00       157
 Bisnis & Profesional       1.00      1.00      1.00       179
      Donasi & Hadiah       1.00      1.00      1.00       166
Kesehatan & Kebugaran       1.00      1.00      1.00       159
            Lain-lain       1.00      1.00      1.00       153
    Makanan & Minuman       1.00      1.00      1.00       176
           Pendidikan       1.00      1.00      1.00       152
Perumahan & Akomodasi       1.00      1.00      1.00       172
 Tabungan & Investasi       1.00      1.00      1.00       199
         Transportasi       1.00      1.00      1.00       169
     Travel & Liburan       1.00      1.00      1.00       158
      Utang & Cicilan       1.00      1.00      1.00       160

             accuracy                           1.00      2000
            macro avg       1.00      1.00      1.00      2000
         

In [62]:
# Evaluate Naive Bayes on the held-out test split.
y_pred_nb = model_nb.predict(X_test)
print("\n--- Evaluasi Model Naive Bayes ---")
print(classification_report(y_test, y_pred_nb))


--- Evaluasi Model Naive Bayes ---
                       precision    recall  f1-score   support

    Belanja & Hiburan       1.00      0.89      0.94       157
 Bisnis & Profesional       1.00      1.00      1.00       179
      Donasi & Hadiah       1.00      1.00      1.00       166
Kesehatan & Kebugaran       1.00      1.00      1.00       159
            Lain-lain       1.00      1.00      1.00       153
    Makanan & Minuman       1.00      1.00      1.00       176
           Pendidikan       1.00      1.00      1.00       152
Perumahan & Akomodasi       1.00      1.00      1.00       172
 Tabungan & Investasi       1.00      1.00      1.00       199
         Transportasi       1.00      1.00      1.00       169
     Travel & Liburan       0.90      1.00      0.95       158
      Utang & Cicilan       1.00      1.00      1.00       160

             accuracy                           0.99      2000
            macro avg       0.99      0.99      0.99      2000
         weighted

In [63]:
# 5-fold cross-validation for Logistic Regression.
# The TF-IDF step is wrapped in a pipeline together with the classifier so it
# is re-fitted on the training part of every fold. Scoring the pre-computed
# matrix `X` would leak vocabulary and IDF statistics from the validation rows
# into training. cross_val_score clones the estimator it is given, so
# `tfidf_vectorizer` and `model_lr` themselves stay untouched.
from sklearn.pipeline import make_pipeline

print("\n--- Cross-validation Logistic Regression ---")
scores_lr = cross_val_score(
    make_pipeline(tfidf_vectorizer, model_lr),
    df["description_stemmed"],
    y,
    cv=5,
)
# Report the mean accuracy with a +/- 2 standard deviation band.
print("Accuracy (Logistic Regression): %0.2f (+/- %0.2f)" % (scores_lr.mean(), scores_lr.std() * 2))


--- Cross-validation Logistic Regression ---
Accuracy (Logistic Regression): 1.00 (+/- 0.00)


In [64]:
# 5-fold cross-validation for Naive Bayes.
# The TF-IDF step is wrapped in a pipeline together with the classifier so it
# is re-fitted on the training part of every fold. Scoring the pre-computed
# matrix `X` would leak vocabulary and IDF statistics from the validation rows
# into training. cross_val_score clones the estimator it is given, so
# `tfidf_vectorizer` and `model_nb` themselves stay untouched.
from sklearn.pipeline import make_pipeline

print("\n--- Cross-validation Naive Bayes ---")
scores_nb = cross_val_score(
    make_pipeline(tfidf_vectorizer, model_nb),
    df["description_stemmed"],
    y,
    cv=5,
)
# Report the mean accuracy with a +/- 2 standard deviation band.
print("Accuracy (Naive Bayes): %0.2f (+/- %0.2f)" % (scores_nb.mean(), scores_nb.std() * 2))


--- Cross-validation Naive Bayes ---
Accuracy (Naive Bayes): 0.98 (+/- 0.02)
