In [1]:
import re
import nltk
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [2]:
# Fetch the NLTK resources this notebook relies on; the recorded output shows
# that packages already present are reported as up-to-date.
# punkt / punkt_tab: tokenizer data, presumably what word_tokenize needs; which of
# the two is actually read depends on the installed NLTK version (TODO confirm).
nltk.download('punkt')
nltk.download("punkt_tab")
# Stop-word corpora read later through stopwords.words(...).
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bugi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Bugi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bugi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Load the review data and rename the label column in one expression, so the
# cell yields the same `dataset` every time it is re-run.
dataset = pd.read_csv("./data/cleaned_dataset.csv").rename(
    columns={"sentiment": "sentiment_by_score"}
)
dataset.head(3)

Unnamed: 0,content,score,sentiment_by_score
0,Kurang update fiturnya.,3,Neutral
1,halte tidak lengkap. padahal ada penanda bus t...,2,Negative
2,Kedatangan lama padahal posisinya tinggal pute...,1,Negative


In [7]:
def clean_text(text:str):
    """Reduce raw text to lowercase ASCII letters separated by single spaces.

    Every run of characters that is not a lowercase ASCII letter (digits,
    punctuation, emoji, accented letters, any kind of whitespace) is treated as
    a word separator instead of being deleted. This keeps words that were only
    separated by punctuation apart: "bagus,mantap" becomes "bagus mantap" and
    "sehat-sehat" becomes "sehat sehat", rather than one glued token.
    Note that an apostrophe is a separator too ("don't" -> "don t").

    Args:
        text (str): the text to be cleaned. Anything that is not a string
            (e.g. the NaN pandas produces for an empty CSV cell) is treated
            as empty text instead of raising.

    Returns:
        str: the cleaned text, without leading or trailing whitespace
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # One pass covers digits, punctuation, non-ASCII characters and repeated
    # whitespace: each run collapses into exactly one space.
    text = re.sub(r'[^a-z]+', ' ', text)
    # Strip last, because removing leading/trailing symbols can leave a space behind.
    return text.strip()

In [8]:
# Merged stop-word sets keyed by language tuple, filled on first use so the
# NLTK corpus is not re-read for every row passed through `.apply`.
_STOP_WORDS_CACHE = {}


def _load_stop_words(languages=('indonesian', 'english')):
    """Return the merged NLTK stop-word set for ``languages``, building it only once."""
    key = tuple(languages)
    if key not in _STOP_WORDS_CACHE:
        merged = set()
        for language in key:
            merged.update(stopwords.words(language))
        _STOP_WORDS_CACHE[key] = frozenset(merged)
    return _STOP_WORDS_CACHE[key]


def rm_stopwords(text:str, keep=()):
    """remove Indonesian and English stopwords from the text

    The text is tokenized with ``word_tokenize`` and every token found in the
    combined NLTK 'indonesian' + 'english' stop-word lists is dropped.

    NOTE(review): the stop-word step appears to drop negations as well. In this
    notebook "halte tidak lengkap" ends up as "halte lengkap". For sentiment
    work, consider passing e.g. ``keep=("tidak",)``.

    Args:
        text (str): the text to be processed
        keep (iterable of str, optional): words that must NOT be treated as
            stopwords. Defaults to an empty tuple, which keeps the original
            behaviour.

    Returns:
        str: the remaining tokens joined by single spaces
    """
    stop_words = _load_stop_words()
    if keep:
        # A bare string would otherwise be split into single characters by set().
        protected = {keep} if isinstance(keep, str) else set(keep)
        stop_words = stop_words - protected
    words = word_tokenize(text)
    return ' '.join(word for word in words if word not in stop_words)

In [9]:
# Holds the single shared stemmer instance once it has been created.
_STEMMER_CACHE = []


def _get_stemmer():
    """Return one shared Sastrawi stemmer, creating it on first use."""
    if not _STEMMER_CACHE:
        _STEMMER_CACHE.append(StemmerFactory().create_stemmer())
    return _STEMMER_CACHE[0]


def stemming(text:str):
    """remove affixes from the words in the text

    The text is tokenized with ``word_tokenize`` and each token is passed
    through a Sastrawi stemmer. The stemmer is built once and reused across
    calls instead of constructing a new factory and stemmer for every row.

    Args:
        text (str): the text to be processed

    Returns:
        str: the stemmed tokens joined by single spaces
    """
    stemmer = _get_stemmer()
    return ' '.join(stemmer.stem(word) for word in word_tokenize(text))

In [24]:
# Token -> replacement table used by fix_word. Built once at definition time
# instead of on every call; the duplicated "ga" entry has been removed.
# NOTE(review): several keys (e.g. "moga", "saj") look like stemmer output, so
# the table presumably targets text that has already been stemmed; confirm
# before reordering the pipeline.
_WORD_CORRECTIONS = {
    "yg": "yang",
    "saj": "saja",
    "cmn": "cuman",
    "yh": "ya",
    "klo": "kalau",
    "ad": "ada",
    "g": "tidak",
    "tp": "tapi",
    "yah": "ya",
    "cmk": "cuma",
    "bgt": "banget",
    "knp": "kenapa",
    "kak": "kakak",
    "ga": "tidak",
    "moga": "semoga",
    "smpe": "sampai",
    "skrng": "sekarang",
    "belm": "belum",
    "knapa": "kenapa",
    "mo": "mau",
    "muantaaaabbbb": "mantap",
    "didownloadpdhal": "download padahal",
    "bgtu": "begitu",
    "udh": "sudah",
    "tidk": "tidak",
    "untk": "untuk",
    "eror": "error",
    "sayangat": "sangat",
    "telp": "telepon",
}


def fix_word(text:str):
    """correct some words in the text

    The text is tokenized with ``word_tokenize``; each token that exactly
    matches a key of the correction table is replaced by its mapped value,
    every other token is kept unchanged.

    Args:
        text (str): the text to be processed

    Returns:
        str: the tokens, with corrections applied, joined by single spaces
    """
    words = word_tokenize(text)
    return ' '.join(_WORD_CORRECTIONS.get(word, word) for word in words)

In [12]:
# Peek at the raw review text before any cleaning is applied.
dataset["content"].head(15)

0                               Kurang update fiturnya.
1     halte tidak lengkap. padahal ada penanda bus t...
2     Kedatangan lama padahal posisinya tinggal pute...
3     Hallo kapan Kota Manado (Sulawesi Utara) bisa ...
4     Tolong dishub, supaya di sediakan rute dan ako...
5     Aplikasinya mbantu banget. Gatau apa yang kura...
6                                        bintang 3 dulu
7     Tolong untuk bus yang sedang tidak beroperasi ...
8                               bagus sangat bermanfaat
9                                                  Good
10             sangat terbantu, terimakasih mitra darat
11    Mas Muchi semangat yaaaa, sehat sehat biar apl...
12                                         keren banget
13    ini gimana nih, bis nya gak ke tracking, saya ...
14    Teman bus Makassar Panakukang galesong pelayan...
Name: content, dtype: object

In [13]:
# Run the text-cleaning stages on a copy, so the loaded `dataset` stays untouched.
cleaned_dataset = dataset.copy()

# Stage 1: character-level cleaning.
print(f"Data shape before cleaning: {cleaned_dataset.shape}")
print("start cleaning the characters")
cleaned_dataset["content"] = cleaned_dataset["content"].apply(clean_text)

# Stage 2: stop-word removal.
print(f"Current data shape: {cleaned_dataset.shape}")
print("start cleaning the stopword")
cleaned_dataset["content"] = cleaned_dataset["content"].apply(rm_stopwords)

# Stage 3: stemming.
print(f"Current data shape: {cleaned_dataset.shape}")
print("start cleaning the stem word")
cleaned_dataset["content"] = cleaned_dataset["content"].apply(stemming)

print(f"Data shape after cleaning: {cleaned_dataset.shape}")
print("finished the cleaning process")

Data shape before cleaning: (1660, 3)
start cleaning the characters
Current data shape: (1660, 3)
start cleaning the stopword
Current data shape: (1660, 3)
start cleaning the stem word
Data shape after cleaning: (1660, 3)
finished the cleaning process


In [20]:
# Inspect the first rows after character cleaning, stop-word removal and stemming.
cleaned_dataset.head(n=30)

Unnamed: 0,content,score,sentiment_by_score
0,update fiturnya,3,Neutral
1,halte lengkap tanda bus trans metro deli kompl...,2,Negative
2,datang posisi tinggal puter cileunyi,1,Negative
3,hallo kota manado sulawesi utara teman bus mak...,5,Positive
4,tolong dishub sedia rute akomodasi teman bus j...,1,Negative
5,aplikasi mbantu banget gatau udah top lengkap ...,5,Positive
6,bintang,3,Neutral
7,tolong bus operasi tanda tampil aplikasi nungg...,3,Neutral
8,bagus manfaat,5,Positive
9,good,5,Positive


In [25]:
# Replace the known slang / misspelled tokens in the processed text.
cleaned_dataset["content"] = cleaned_dataset["content"].apply(fix_word)

In [26]:
# Inspect the last rows after the word corrections.
cleaned_dataset.tail(n=10)

Unnamed: 0,content,score,sentiment_by_score
1650,aplikasi bagus keren,5,Positive
1651,min fiturnya aktif aja barusan buka apknya sis...,5,Positive
1652,ok cuma loading lot,5,Positive
1653,icon mudik gratis muncul ya aplikasi min,5,Positive
1654,semoga mudik lancar,5,Positive
1655,dapet otp nya sampai sekarang belum dapet,2,Negative
1656,kenapa tidak login min kode nya tidak,4,Positive
1657,mantap tenan,5,Positive
1658,min salah masukin nomer mau rubah nomer telepo...,5,Positive
1659,mantap,5,Positive


In [27]:
# Persist the processed reviews; the DataFrame index is not written to the file.
cleaned_dataset.to_csv(
    path_or_buf="./data/processed_dataset.csv",
    index=False,
)