# Load Data

In [1]:
import pandas as pd 
import numpy as np
import nltk

In [3]:
dataSB = pd.read_csv('PERCUMALAPORPOLISI1.csv') #lokasi file

dataSB.head()

Unnamed: 0,clean_text
0,jadi
1,kasus tppo mandeg dikepolisian simak di kepoli...
2,pantesan muncul tagar
3,untuk ini saya akui mental anda hebat bung say...
4,jangan sampai terjadi lagi


In [4]:
# ------ Case Folding --------
# gunakan fungsi Series.str.lower() pada Pandas

dataSB['data_bersih'] = dataSB['clean_text'].str.lower()

print('Case Folding Result : \n')
print(dataSB['data_bersih'].head(5))

Case Folding Result : 

0                                                 jadi
1    kasus tppo mandeg dikepolisian simak di kepoli...
2                                pantesan muncul tagar
3    untuk ini saya akui mental anda hebat bung say...
4                          jangan sampai  terjadi lagi
Name: data_bersih, dtype: object


# Tokenizing

Menghapus kata yang tidak penting

In [5]:
import string 
import re #regex library

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

In [6]:
nltk.download('punkt')

def remove_tweet_special(text):
    # remove tab, new line, ans back slice
    text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    # remove non ASCII (emoticon, chinese word, .etc)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention, link, hashtag
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    # remove incomplete URL
    return text.replace("http://", " ").replace("https://", " ")
                
dataSB['data_bersih'] = dataSB['data_bersih'].apply(remove_tweet_special)

#remove number
def remove_number(text):
    return  re.sub(r"\d+", "", text)

dataSB['data_bersih'] = dataSB['data_bersih'].apply(remove_number)

#remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans("","",string.punctuation))

dataSB['data_bersih'] = dataSB['data_bersih'].apply(remove_punctuation)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    return text.strip()

dataSB['data_bersih'] = dataSB['data_bersih'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    return re.sub('\s+',' ',text)

dataSB['data_bersih'] = dataSB['data_bersih'].apply(remove_whitespace_multiple)

# remove single char
def remove_singl_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

dataSB['data_bersih'] = dataSB['data_bersih'].apply(remove_singl_char)

# NLTK word tokenize 
def word_tokenize_wrapper(text):
    return word_tokenize(text)

dataSB['data_tokens'] = dataSB['data_bersih'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n') 
print(dataSB['data_tokens'].head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Tokenizing Result : 

0                                               [jadi]
1    [kasus, tppo, mandeg, dikepolisian, simak, di,...
2                            [pantesan, muncul, tagar]
3    [untuk, ini, saya, akui, mental, anda, hebat, ...
4                      [jangan, sampai, terjadi, lagi]
Name: data_tokens, dtype: object


In [7]:
# NLTK calc frequency distribution
def freqDist_wrapper(text):
    return FreqDist(text)

dataSB['data_tokens_fdist'] = dataSB['data_tokens'].apply(freqDist_wrapper)

print('Frequency Tokens : \n') 
print(dataSB['data_tokens_fdist'].head().apply(lambda x : x.most_common()))

Frequency Tokens : 

0                                          [(jadi, 1)]
1    [(kasus, 1), (tppo, 1), (mandeg, 1), (dikepoli...
2             [(pantesan, 1), (muncul, 1), (tagar, 1)]
3    [(saya, 2), (untuk, 1), (ini, 1), (akui, 1), (...
4    [(jangan, 1), (sampai, 1), (terjadi, 1), (lagi...
Name: data_tokens_fdist, dtype: object


# Stopwords

In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
from nltk.corpus import stopwords

# ----------------------- get stopword from NLTK stopword -------------------------------
# get stopword indonesia
list_stopwords = stopwords.words('indonesian')


# ---------------------------- manualy add stopword  ------------------------------------
# append additional stopword
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'hm',
                       '&amp', 'yah'])

# convert list to dictionary
list_stopwords = set(list_stopwords)

#remove stopword pada list token
def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]

dataSB['data_tokens_WSW'] = dataSB['data_tokens'].apply(stopwords_removal) 

print(dataSB['data_tokens_WSW'].head())

0                                                   []
1    [tppo, mandeg, dikepolisian, simak, kepolisian...
2                            [pantesan, muncul, tagar]
3    [akui, mental, hebat, berkomentar, kadang, get...
4                                                   []
Name: data_tokens_WSW, dtype: object


# Normalization

menganti kata-kata tertentu

In [11]:
normalizad_word = pd.read_excel('slang.xlsx') #lokasi file

normalizad_word_dict = {}

for index, row in normalizad_word.iterrows():
    if row[0] not in normalizad_word_dict:
        normalizad_word_dict[row[0]] = row[1] 

def normalized_term(document):
    return [normalizad_word_dict[term] if term in normalizad_word_dict else term for term in document]

dataSB['textdata_normalized'] = dataSB['data_tokens_WSW'].apply(normalized_term)

dataSB['textdata_normalized'].head(10)

0                                                   []
1    [tppo, mandeg, dikepolisian, simak, kepolisian...
2                            [pantesan, muncul, tagar]
3    [akui, mental, hebat, berkomentar, kadang, get...
4                                                   []
5                                                   []
6                                     [nasib, bharada]
7                        [dimakan, usia, kabar, orang]
8    [benarbenar, kecewa, aparat, trauma, korban, t...
9                                                 [cc]
Name: textdata_normalized, dtype: object

In [15]:
!pip install Sastrawi
!pip install swifter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting swifter
  Downloading swifter-1.3.4.tar.gz (830 kB)
[K     |████████████████████████████████| 830 kB 5.2 MB/s 
Collecting psutil>=5.6.6
  Downloading psutil-5.9.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (291 kB)
[K     |████████████████████████████████| 291 kB 74.9 MB/s 
Collecting jedi>=0.10
  Downloading jedi-0.18.1-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 53.4 MB/s 
Building wheels for collected packages: swifter
  Building wheel for swifter (setup.py) ... [?25l[?25hdone
  Created wheel for swifter: filename=swifter-1.3.4-py3-none-any.whl size=16322 sha256=4c410e1349347340dc5a4905793cca931ee1799deadf295591e71fa776fba1b7
  Stored in directory: /root/.cache/pip/wheels/29/a7/0e/

In [16]:
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter

# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    return stemmer.stem(term)

term_dict = {}

for document in dataSB['textdata_normalized']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
print(len(term_dict))

3460


In [17]:
for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    
    # untuk melihat hasilnya silahkan jalankan baris di bawah ini
    # print(term,":" ,term_dict[term])

In [18]:
# apply stemmed term to dataframe
def get_stemmed_term(document):
    return [term_dict[term] for term in document]

dataSB['textdata_tokens_stemmed'] = dataSB['textdata_normalized'].swifter.apply(get_stemmed_term)

print(dataSB['textdata_tokens_stemmed'])

Pandas Apply:   0%|          | 0/1065 [00:00<?, ?it/s]

0                                                      []
1            [tppo, mandeg, polisi, simak, polisi, patut]
2                               [pantesan, muncul, tagar]
3       [aku, mental, hebat, komentar, kadang, getar, ...
4                                                      []
                              ...                        
1060    [sumpahkami, kesini, neg, lihat, intitusi, dib...
1061    [sumpahkami, kesini, neg, lihat, intitusi, dib...
1062    [tindak, tidak, benar, aneh, badut, buzzrp, da...
1063                                     [tidak, percaya]
1064    [saya, lsporkan, hubung masyarakat, polri, cci...
Name: textdata_tokens_stemmed, Length: 1065, dtype: object


In [19]:
#stopwords #2

from nltk.corpus import stopwords

# ----------------------- get stopword from NLTK stopword -------------------------------
# get stopword indonesia
list_stopwords = stopwords.words('indonesian')


# ---------------------------- manualy add stopword  ------------------------------------
# append additional stopword
list_stopwords.extend(["ada", "tan", "ton", "pt", "komentar", "juta", "unit", "menang", "artikel", 
                       "smartphone", "tagar", "sedia", "kaskus", "seksi"])

# convert list to dictionary
list_stopwords = set(list_stopwords)

#remove stopword pada list token
def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]

dataSB['textdata_tokens_stemmed2'] = dataSB['textdata_tokens_stemmed'].apply(stopwords_removal) 

print(dataSB['textdata_tokens_stemmed2'].head())

0                                              []
1    [tppo, mandeg, polisi, simak, polisi, patut]
2                              [pantesan, muncul]
3    [mental, hebat, kadang, getar, getir, hahah]
4                                              []
Name: textdata_tokens_stemmed2, dtype: object


In [20]:
for i in range(len(dataSB)):
        a=dataSB.iloc[i][6]
        document.append(a)
        
document[0:5]

['saya', 'lsporkan', 'hubungan masyarakat', 'polri', 'ccip']

In [21]:
doc_clean = dataSB['textdata_tokens_stemmed2']
doc_clean

0                                                      []
1            [tppo, mandeg, polisi, simak, polisi, patut]
2                                      [pantesan, muncul]
3            [mental, hebat, kadang, getar, getir, hahah]
4                                                      []
                              ...                        
1060    [sumpahkami, kesini, neg, lihat, intitusi, dib...
1061    [sumpahkami, kesini, neg, lihat, intitusi, dib...
1062    [tindak, aneh, badut, buzzrp, dalih, doxxing, ...
1063                                            [percaya]
1064    [lsporkan, hubung masyarakat, polri, ccip, pol...
Name: textdata_tokens_stemmed2, Length: 1065, dtype: object

# Save CSV

In [22]:
doc_clean.to_csv("FERDYSMBO2.csv",index=False)