In [18]:
import pandas as pd 
import numpy as np

LOAD_DATA = pd.read_csv("comments.csv", encoding = "ISO-8859-1")
LOAD_DATA['comment'].str.encode('ascii', 'ignore')
LOAD_DATA.head()

Unnamed: 0,username,comment
0,kepek_mabur,Ini baguss sihh keren pemandangan nya ya gak s...
1,primahermanto,@vebeeandyany Ayoooo gasss..
2,happshapist,@kikideva_ Monggo sek kemarin tanya2
3,yuliana_iin,kii lhoo @siscaherning
4,afiq5922,@umami_resa


In [19]:
# ------ Case Folding --------
# gunakan fungsi Series.str.lower() pada Pandas
LOAD_DATA['comment'] = LOAD_DATA['comment'].str.lower()


print('Case Folding Result : \n')
print(LOAD_DATA['comment'].head(5))
print('\n\n\n')

Case Folding Result : 

0    ini baguss sihh keren pemandangan nya ya gak s...
1                         @vebeeandyany ayoooo gasss..
2                 @kikideva_ monggo sek kemarin tanya2
3                               kii lhoo @siscaherning
4                                          @umami_resa
Name: comment, dtype: object






In [20]:
import string 
import re #regex library

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

# ------ Tokenizing ---------

def remove_tweet_special(text):
    # remove tab, new line, ans back slice
    text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    # remove non ASCII (emoticon, chinese word, .etc)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention, link, hashtag
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    # remove incomplete URL
    return text.replace("http://", " ").replace("https://", " ")
                
LOAD_DATA['comment'] = LOAD_DATA['comment'].apply(remove_tweet_special)

#remove number
def remove_number(text):
    return  re.sub(r"\d+", "", text)

LOAD_DATA['comment'] = LOAD_DATA['comment'].apply(remove_number)

#remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans("","",string.punctuation))

LOAD_DATA['comment'] = LOAD_DATA['comment'].apply(remove_punctuation)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    return text.strip()

LOAD_DATA['comment'] = LOAD_DATA['comment'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    return re.sub('\s+',' ',text)

LOAD_DATA['comment'] = LOAD_DATA['comment'].apply(remove_whitespace_multiple)

# remove single char
def remove_singl_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

LOAD_DATA['comment'] = LOAD_DATA['comment'].apply(remove_singl_char)

# NLTK word rokenize 
def word_tokenize_wrapper(text):
    return word_tokenize(text)

LOAD_DATA['comment_tokens'] = LOAD_DATA['comment'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n') 
print(LOAD_DATA['comment_tokens'].head())
print('\n\n\n')

Tokenizing Result : 

0    [ini, baguss, sihh, keren, pemandangan, nya, y...
1                                      [ayoooo, gasss]
2                        [monggo, sek, kemarin, tanya]
3                                          [kii, lhoo]
4                                               [resa]
Name: comment_tokens, dtype: object






In [21]:
# NLTK calc frequency distribution
def freqDist_wrapper(text):
    return FreqDist(text)

LOAD_DATA['comment_tokens_fdist'] = LOAD_DATA['comment_tokens'].apply(freqDist_wrapper)

print('Frequency Tokens : \n') 
print(LOAD_DATA['comment_tokens_fdist'].head().apply(lambda x : x.most_common()))

Frequency Tokens : 

0    [(ini, 1), (baguss, 1), (sihh, 1), (keren, 1),...
1                            [(ayoooo, 1), (gasss, 1)]
2    [(monggo, 1), (sek, 1), (kemarin, 1), (tanya, 1)]
3                                [(kii, 1), (lhoo, 1)]
4                                          [(resa, 1)]
Name: comment_tokens_fdist, dtype: object


In [22]:
from nltk.corpus import stopwords

# ----------------------- get stopword from NLTK stopword -------------------------------
# get stopword indonesia
list_stopwords = stopwords.words('indonesian')
print(len(list_stopwords))

# ---------------------------- manualy add stopword  ------------------------------------
# append additional stopword
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah'])
len(list_stopwords)
# ----------------------- add stopword from txt file ------------------------------------
# read txt stopword using pandas
txt_stopword = pd.read_csv("stopwords.txt", names= ["stopwords"], header = None)

# convert stopword string to list & append additional stopword
list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))
len(list_stopwords)
# ---------------------------------------------------------------------------------------

# convert list to dictionary
list_stopwords = set(list_stopwords)


#remove stopword pada list token
def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]

LOAD_DATA['comment_tokens_WSW'] = LOAD_DATA['comment_tokens'].apply(stopwords_removal) 


print(LOAD_DATA['comment_tokens_WSW'].head())

758
0    [baguss, sihh, keren, pemandangan]
1                       [ayoooo, gasss]
2                     [monggo, kemarin]
3                           [kii, lhoo]
4                                [resa]
Name: comment_tokens_WSW, dtype: object


# Normalisasi

In [24]:
normalizad_word = pd.read_excel("Normalisasi.xlsx")

normalizad_word_dict = {}

for index, row in normalizad_word.iterrows():
    if row[0] not in normalizad_word_dict:
        normalizad_word_dict[row[0]] = row[1] 

def normalized_term(document):
    return [normalizad_word_dict[term] if term in normalizad_word_dict else term for term in document]

LOAD_DATA['comment_normalized'] = LOAD_DATA['comment_tokens_WSW'].apply(normalized_term)

LOAD_DATA['comment_normalized'].head(10)

0    [bagus, sihh, keren, pemandangan]
1                      [ayoooo, gasss]
2                    [monggo, kemarin]
3                          [kii, lhoo]
4                               [resa]
5                       [nama, tempat]
6                               [ehem]
7                               [zain]
8            [adminnya, kesana, belom]
9                                   []
Name: comment_normalized, dtype: object

In [25]:
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter


# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    return stemmer.stem(term)

term_dict = {}

for document in LOAD_DATA['comment_normalized']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
print(len(term_dict))
print("------------------------")

for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term,":" ,term_dict[term])
    
print(term_dict)
print("------------------------")


# apply stemmed term to dataframe
def get_stemmed_term(document):
    return [term_dict[term] for term in document]

LOAD_DATA['comment_tokens_stemmed'] = LOAD_DATA['comment_normalized'].swifter.apply(get_stemmed_term)
print(LOAD_DATA['comment_tokens_stemmed'])

74
------------------------
bagus : bagus
sihh : sihh
keren : keren
pemandangan : pandang
ayoooo : ayoooo
gasss : gasss
monggo : monggo
kemarin : kemarin
kii : kii
lhoo : lhoo
resa : resa
nama : nama
tempat : tempat
ehem : ehem
zain : zain
adminnya : adminnya
kesana : kesana
belom : bom
merapat : rapat
makasih : makasih
info : info
nongkrong : nongkrong
tanjung : tanjung
harjo : harjo
syahdu : syahdu
poll : poll
donggg : donggg
gaa : gaa
pan : pan
blenda : blenda
puncak : puncak
saka : saka
nanggulan : nanggulan
mung : mung
duwur : duwur
omah : omah
yk : yk
cuzz : cuzz
besok : besok
goleki : golek
wah : wah
kuat : kuat
ngonthel : ngonthel
apriyawan : apriyawan
feriawan : feriawan
putra : putra
mantep : mantep
ayok : ayok
spedaan : spedaan
tekan : tekan
keneh : keneh
waaah : waaah
dateng : dateng
nit : nit
cobain : cobain
kuy : kuy
adisusanta : adisusanta
ikiii : ikiii
oii : oii
banget : banget
viewnya : viewnya
mari : mari
berpetualang : tualang
seru : seru
merapi : rapi
semoga : moga


Pandas Apply:   0%|          | 0/39 [00:00<?, ?it/s]

0                         [bagus, sihh, keren, pandang]
1                                       [ayoooo, gasss]
2                                     [monggo, kemarin]
3                                           [kii, lhoo]
4                                                [resa]
5                                        [nama, tempat]
6                                                [ehem]
7                                                [zain]
8                               [adminnya, kesana, bom]
9                                                    []
10    [rapat, makasih, info, nongkrong, tanjung, harjo]
11                                                   []
12                                       [syahdu, poll]
13                                             [donggg]
14                                           [gaa, pan]
15                                             [blenda]
16                                                   []
17            [puncak, saka, tanjung, harjo, nan

In [26]:
LOAD_DATA.to_csv("Text_Preprocessing.csv")

In [27]:
LOAD_DATA.to_excel("Text_Preprocessing.xlsx")

In [28]:
LOAD_DATA.to_hdf("Text_Preprocessing.h5", 'comment', mode='w')

ImportError: Missing optional dependency 'tables'.  Use pip or conda to install tables.