In [2]:
import pandas as pd
import re
import regex
from indoNLP.preprocessing import replace_slang, replace_word_elongation

In [3]:
# Load the semicolon-separated dataset from the sibling dataset directory.
dataset = pd.read_csv(
    filepath_or_buffer=r'../dataset/dataset.csv',
    sep=';',
)

In [4]:
# Show the loaded frame as the cell output to inspect its columns and row count.
dataset

Unnamed: 0,text,label
0,Kunjungan Prabowo ini untuk meresmikan dan men...,Sumber Daya Alam
1,RT Anies dapat tepuk tangan meriah saat jadi R...,Politik
2,@CIqXqwGAT04tMtx4OCATxjoVq7vv/Y8HeYaIOgMFg8Y= ...,Demografi
3,RT @L3R8XFBw3WGbxRPSj0/0hHZTbqVGX7qtfwRg9zmhK7...,Politik
4,Anies Baswedan Harap ASN termasuk TNI dan Polr...,Politik
...,...,...
4995,RT @l1DsGGe2xObT3t72dHwqlT58X7jvEYtEnauZIZSYwS...,Politik
4996,Masyarakat yakin bahwa Prabowo-Gibran memiliki...,Politik
4997,imo both are irrational but yg satu jauh lebih...,Ekonomi
4998,@cIIGSdjaPrmAQY1E4gWnLpIZSdyQn8ZMhjJzgOsxfRM= ...,Pertahanan dan Keamanan


In [5]:
# Pre-compiled patterns used by the cleaning helpers in this cell.

# '#' followed by one or more word characters (letters, digits, underscore).
HASHTAG_PATTERN = re.compile(r'#\w+')
# '@' followed by a run of non-whitespace, plus any whitespace right after it.
MENTION_PATTERN = re.compile(r'@\S+\s*')
# 'http://' or 'https://' followed by everything up to the next whitespace.
URL_PATTERN = re.compile(r'https?://\S+')
# A bracketed segment that starts with '[RE' and runs to the next ']'.
RE_BRACKET_PATTERN = re.compile(r'\[RE[^\]]*\]')
# One or more consecutive characters outside the 0x00-0x7F (ASCII) range.
NON_ASCII_PATTERN = re.compile(r'[^\x00-\x7F]+')
# A dot with a digit on each side; the lookarounds keep the digits out of the match.
DOT_BETWEEN_NUMBERS_PATTERN = re.compile(r'(?<=\d)\.(?=\d)')
# Any single Unicode punctuation (\p{P}) or symbol (\p{S}) character. Compiled with
# the third-party `regex` module because the stdlib `re` has no \p{...} classes.
# NOTE(review): '%' is punctuation (category Po) and is therefore matched here too.
PUNCTUATION_PATTERN = regex.compile(r'[\p{P}\p{S}]')
# The upper-case token 'RT' delimited by word boundaries, anywhere in the text.
# NOTE(review): not anchored to the start, so it presumably also hits non-retweet
# uses of "RT" (e.g. "RT/RW") - confirm this is acceptable.
RT_PATTERN = re.compile(r'\bRT\b')
# A whole string made of one or more ASCII letters followed by a single '2'.
REPEAT2_PATTERN = re.compile(r'^[a-zA-Z]+2$')

def split_hashtag(match):
    """Turn a matched hashtag into space-separated words.

    The first character of the match (the '#') is dropped, then a space is
    inserted at every lower-to-upper case boundary and at every letter/digit
    boundary, in both directions.

    Args:
        match: Regex match object whose full match is the hashtag,
            including its leading '#'.

    Returns:
        str: The hashtag body with the boundary spaces inserted.
    """
    # Applied in this order; each rule is (pattern, replacement).
    boundary_rules = (
        (r'(?<=[a-z])([A-Z])', r' \1'),   # fooBar -> foo Bar
        (r'([A-Za-z])(\d)', r'\1 \2'),    # abc1   -> abc 1
        (r'(\d)([A-Za-z])', r'\1 \2'),    # 1abc   -> 1 abc
    )
    tag_body = match.group(0)[1:]
    for pattern, replacement in boundary_rules:
        tag_body = re.sub(pattern, replacement, tag_body)
    return tag_body

def repeat_word_if_endswith2(word):
    """Expand the "<letters>2" shorthand into the word written twice.

    Args:
        word: A single token.

    Returns:
        str: ``"<stem> <stem>"`` when ``word`` matches REPEAT2_PATTERN, where
        ``<stem>`` is ``word`` without its last character; otherwise ``word``
        unchanged.
    """
    if not REPEAT2_PATTERN.match(word):
        return word
    stem = word[:-1]
    return ' '.join((stem, stem))

def replace_number_suffixes(text):
    """Spell out the unit that follows a number: %, jt, m and t.

    Each rule matches a run of digits, optional whitespace, then the suffix.
    The letter suffixes are matched case-insensitively and must end at a word
    boundary. Rules are applied in the listed order.

    Args:
        text: Input string.

    Returns:
        str: ``text`` with '%' -> ' persen', 'jt' -> ' juta', 'm' -> ' milyar'
        and 't' -> ' triliun' after the number.
    """
    # (pattern, replacement, regex flags)
    suffix_rules = (
        (r'(\d+)\s*%', r'\1 persen', 0),
        (r'(\d+)\s*jt\b', r'\1 juta', re.IGNORECASE),
        (r'(\d+)\s*m\b', r'\1 milyar', re.IGNORECASE),
        (r'(\d+)\s*t\b', r'\1 triliun', re.IGNORECASE),
    )
    result = text
    for pattern, spelled_out, flags in suffix_rules:
        result = re.sub(pattern, spelled_out, result, flags=flags)
    return result

def clean_text(text):
    """Clean and normalize Indonesian tweet text.

    Pipeline, in order: drop the 'RT' token, @mentions, URLs and '[RE...]'
    brackets; split hashtags into words; replace non-ASCII runs with a space;
    remove dots between digits; spell out percentages; replace remaining
    punctuation/symbols with spaces; expand "<word>2" tokens; spell out the
    jt/m/t number suffixes; apply indoNLP's word-elongation and slang
    replacements; collapse whitespace and lower-case.

    Args:
        text: Raw tweet. Any object is accepted; it is coerced with ``str()``.

    Returns:
        str: The cleaned, whitespace-normalized, lower-cased text.
    """
    text = str(text)
    text = RT_PATTERN.sub('', text)
    text = MENTION_PATTERN.sub('', text)
    text = URL_PATTERN.sub('', text)
    text = RE_BRACKET_PATTERN.sub('', text)
    text = HASHTAG_PATTERN.sub(split_hashtag, text)
    text = NON_ASCII_PATTERN.sub(' ', text)
    text = DOT_BETWEEN_NUMBERS_PATTERN.sub('', text)
    # '%' is Unicode punctuation (category Po), so PUNCTUATION_PATTERN on the
    # next line erases it. Spell percentages out first; otherwise the '%' rule
    # in replace_number_suffixes never sees a '%' and "50%" degrades to "50".
    # The padding space is collapsed by the whitespace normalization below.
    text = re.sub(r'(\d+)\s*%', r'\1 persen ', text)
    text = PUNCTUATION_PATTERN.sub(' ', text)
    # Tokenize on whitespace so the "<word>2" check sees one token at a time.
    text = ' '.join(repeat_word_if_endswith2(w) for w in text.split())
    text = replace_number_suffixes(text)
    text = replace_word_elongation(text)
    text = replace_slang(text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

In [9]:
# Run every tweet through clean_text and store the result beside the raw text.
dataset['cleaned_text'] = dataset['text'].astype(str).apply(clean_text)

# Keep only the raw text, its cleaned form and the label, in that order.
dataset = dataset.loc[:, ['text', 'cleaned_text', 'label']]
dataset

Unnamed: 0,text,cleaned_text,label
0,Kunjungan Prabowo ini untuk meresmikan dan men...,kunjungan prabowo ini untuk meresmikan dan men...,Sumber Daya Alam
1,RT Anies dapat tepuk tangan meriah saat jadi R...,anies dapat tepuk tangan meriah saat jadi rekt...,Politik
2,@CIqXqwGAT04tMtx4OCATxjoVq7vv/Y8HeYaIOgMFg8Y= ...,memang benar sih pendukung 01 ada yang goblok ...,Demografi
3,RT @L3R8XFBw3WGbxRPSj0/0hHZTbqVGX7qtfwRg9zmhK7...,sewaktu anies bersikap kritis ke kinerja pak p...,Politik
4,Anies Baswedan Harap ASN termasuk TNI dan Polr...,anies baswedan harap asn termasuk tni dan polr...,Politik
...,...,...,...
4995,RT @l1DsGGe2xObT3t72dHwqlT58X7jvEYtEnauZIZSYwS...,melihat debat kemarin pas prabowo kicep kekira...,Politik
4996,Masyarakat yakin bahwa Prabowo-Gibran memiliki...,masyarakat yakin bahwa prabowo gibran memiliki...,Politik
4997,imo both are irrational but yg satu jauh lebih...,imo both are irrational but yang satu jauh leb...,Ekonomi
4998,@cIIGSdjaPrmAQY1E4gWnLpIZSdyQn8ZMhjJzgOsxfRM= ...,look at that pak ganjar anda sudah berkecimpun...,Pertahanan dan Keamanan
