Tokenizing, Text Filtering, dan Text Parsing

In [1]:
!pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m204.8/209.7 kB[0m [31m7.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# =====================================
# 0. Download resource NLTK
# =====================================
nltk.download('punkt')
nltk.download('stopwords')

# =====================================
# 1. Baca data dan ambil kolom penting
# =====================================
df = pd.read_csv("/content/Case Folding_Berita_CNBC_Tema_IKN.csv", sep=';', encoding='utf-8')
df = df[['Title_Original', 'Title_Casefolded', 'Content_Original', 'Content_Casefolded']]

# =====================================
# 2. Hapus pembuka khas CNBC
# =====================================
def remove_cnbc_opening(text):
    text = str(text).strip()

    # Bersihkan karakter aneh (Â, Â , \xa0, \u202f, dll)
    text = re.sub(r'[\xa0Â\u202f]+', ' ', text)

    # Hapus pembuka khas CNBC, misal:
    # jakarta, cnbc indonesia - ...
    # akarta, cnbc indonesia - ...
    # banjarmasin, cnbc indonesia — ...
    text = re.sub(
        r'([a-z\s]*)?,?\s*cnbc\s*indonesia\s*[-–—]\s*',
        '',
        text,
        flags=re.IGNORECASE
    )

    # Hapus bagian catatan opini
    text = re.sub(
        r'catatan\s*:\s*artikel\s*ini\s*merupakan\s*opini.*?(redaksi)?cnbcindonesia\.com',
        '',
        text,
        flags=re.IGNORECASE
    )

    # Hapus penanda dokumen/foto seperti (dok. …) atau cnbc indonesia/fotografer
    text = re.sub(r'\(.*?dok.*?\)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'cnbc\s*indonesia\s*/\s*\w+', '', text, flags=re.IGNORECASE)

    # Bersihkan sisa spasi atau tanda baca ganda
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\s*,\s*', ', ', text)
    text = re.sub(r'\s*\.\s*', '. ', text)

    return text.strip()

df['Content_Cleaned'] = df['Content_Casefolded'].apply(remove_cnbc_opening)

# =====================================
# 3. Tokenizing
# =====================================
def tokenize_text(text):
    return nltk.word_tokenize(str(text))

df['Tokens'] = df['Content_Cleaned'].apply(tokenize_text)

# =====================================
# 4. Text Filtering (hapus angka, tanda baca, stopwords)
# =====================================
stop_words = set(stopwords.words('indonesian'))

def filter_tokens(tokens):
    clean_tokens = [w for w in tokens if w.isalpha() and w.lower() not in stop_words]
    return clean_tokens

df['Filtered_Tokens'] = df['Tokens'].apply(filter_tokens)

# =====================================
# 5. Stemming (Text Parsing)
# =====================================
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem_tokens(tokens):
    return [stemmer.stem(word) for word in tokens]

df['Stemmed_Tokens'] = df['Filtered_Tokens'].apply(stem_tokens)

# =====================================
# 6. Simpan hasil preprocessing
# =====================================
# Simpan hasil
df.to_csv("Hasil_Preprocessing_Berita_CNBC_Tema_IKN.csv", sep=';', index=False, encoding='utf-8')

# Download langsung
from google.colab import files
files.download("Hasil_Preprocessing_Berita_CNBC_Tema_IKN.csv")


print("✅ Preprocessing selesai! Hasil tiap tahap disimpan di cnbc_ikn_preprocessed_full.csv")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Preprocessing selesai! Hasil tiap tahap disimpan di cnbc_ikn_preprocessed_full.csv
