In [1]:
!pip install pandas sastrawi nltk

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import re

import pandas as pd
from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemover import StopWordRemover
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

In [3]:
# --- Read the augmented dataset ---
input_filename = '../Data/hasil_augmentasi_transjakarta.csv'
try:
    df = pd.read_csv(input_filename)
except FileNotFoundError:
    print(f"Error: File '{input_filename}' tidak ditemukan.")
    # Re-raise instead of calling exit(): inside a notebook, exit() ends the
    # kernel session, whereas a raised exception stops this cell with a
    # traceback and keeps the kernel (and its state) available.
    raise
else:
    # Only report success when the read actually succeeded.
    print(f"File '{input_filename}' berhasil dibaca. Jumlah data: {len(df)} baris.")

File '../Data/hasil_augmentasi_transjakarta.csv' berhasil dibaca. Jumlah data: 164 baris.


In [4]:
# --- Inisialisasi Ulang Fungsi Preprocessing ---
def case_folding(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

def clean_text(text):
    """Strip URLs, HTML-like tags and non-letter characters from ``text``.

    Every removed span is replaced by a single space (not an empty string) so
    that words separated only by punctuation, a tag or a URL are not glued
    together (e.g. ``"bagus,tapi"`` becomes ``"bagus tapi"``, not
    ``"bagustapi"``). Runs of whitespace are then collapsed and the result is
    stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string containing only ASCII letters separated by single
        spaces (possibly empty).
    """
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)  # URLs
    text = re.sub(r'<.*?>', ' ', text)                  # HTML-like tags
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)            # digits, punctuation, symbols
    # Collapse the spaces introduced above (and any original runs) into one.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Build the stop-word remover from an explicitly extended word list.
# StopWordRemoverFactory.create_stop_word_remover() builds its dictionary from
# a fresh get_stop_words() call, so extending a previously returned list does
# not affect the remover it creates. Constructing StopWordRemover directly
# from the extended list ensures the domain-specific words are applied.
stopword_factory = StopWordRemoverFactory()
stopword_list = stopword_factory.get_stop_words() + ['rp', 'dki', 'jakarta']
stopword_remover = StopWordRemover(ArrayDictionary(stopword_list))

def remove_stopwords(text):
    """Return ``text`` after passing it through the module-level ``stopword_remover``."""
    without_stopwords = stopword_remover.remove(text)
    return without_stopwords

# Create the Sastrawi stemmer once at module level so every call to
# stem_text() reuses the same instance.
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()


def stem_text(text):
    """Return ``text`` after passing it through the module-level ``stemmer``."""
    stemmed = stemmer.stem(text)
    return stemmed

In [5]:
# --- Re-run the preprocessing pipeline ---
# Progress message announcing that preprocessing of the augmented data is starting.
print("\nMemulai proses preprocessing ulang pada data yang sudah di-augmentasi...")

def preprocess_pipeline(text):
    """Run ``text`` through every preprocessing stage in order.

    The stages are case folding, cleaning, stop-word removal and stemming.
    Any value that is not a ``str`` yields an empty string.
    """
    if isinstance(text, str):
        for stage in (case_folding, clean_text, remove_stopwords, stem_text):
            text = stage(text)
        return text
    return ""

# Apply the full pipeline to every row of the raw text column.
df['Teks_Stemmed'] = df['Teks_Lengkap'].apply(preprocess_pipeline)
print("Preprocessing ulang selesai.")

# Drop rows whose text is missing or became empty after preprocessing.
# Non-mutating calls (no inplace=True) keep the cell safe to re-run, and the
# explicit .copy() makes the result an independent frame rather than a slice,
# so later column assignments cannot trigger SettingWithCopyWarning.
df = df.dropna(subset=['Teks_Stemmed'])
df = df[df['Teks_Stemmed'].str.strip() != ''].copy()


Memulai proses preprocessing ulang pada data yang sudah di-augmentasi...
Preprocessing ulang selesai.


In [6]:
# --- Save the final result ---
# Write the preprocessed frame to CSV without the index column.
output_filename = '../Data/hasil_preprocessed_augmented.csv'
df.to_csv(path_or_buf=output_filename, index=False, encoding='utf-8')

# Report where the data went, how many rows remain, and the label distribution.
print(f"\n✅ Proses selesai. Data yang bersih dan seimbang disimpan di file: {output_filename}")
print(f"Jumlah data final yang siap untuk model: {len(df)}")
print("\nDistribusi Sentimen Final:", df['Sentimen'].value_counts(), sep="\n")


✅ Proses selesai. Data yang bersih dan seimbang disimpan di file: ../Data/hasil_preprocessed_augmented.csv
Jumlah data final yang siap untuk model: 164

Distribusi Sentimen Final:
Sentimen
positif    84
negatif    52
netral     28
Name: count, dtype: int64
