# Preprocessing Data - Erase Transform Load

## Import module / package

In [1]:
import pandas as pd
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
df = pd.read_csv("data/tweets.csv")
df.head(10)

Unnamed: 0,Datetime,Tweet Id,Tweet,Username
0,2022-04-24 03:11:13+00:00,1518064995912757250,Menangani Kekerasan Seksual Setelah RUU TPKS D...,SuluhSukabumi
1,2022-04-24 03:05:16+00:00,1518063494859718656,Menangani Kekerasan Seksual Setelah RUU TPKS D...,cacing_nagari
2,2022-04-24 02:53:08+00:00,1518060441787793408,Wakil Ketua MPR RI Minta Mahasiswa Kawal Imple...,redaksijakarta2
3,2022-04-24 02:45:40+00:00,1518058562286940160,Fadel Muhammad Minta Mahasiswa Kawal Implement...,kabargolkarnews
4,2022-04-24 02:17:01+00:00,1518051355419500544,"Lihat tanggal chatnya, kalau setelah April uda...",m_sael
5,2022-04-24 02:09:23+00:00,1518049431635263488,@convomf Kelakuan kayak gini bisa masuk ranah ...,leviosafa
6,2022-04-24 01:16:24+00:00,1518036099553144832,"@iyadehalpukat besar. Nah, UU TPKS ini membuka...",151172berylcand
7,2022-04-24 01:01:59+00:00,1518032470582898689,#Opini - Mochammad Abizar Yusro - Terang Sinar...,the_geotimes
8,2022-04-24 00:32:25+00:00,1518025030978146305,"Puan mengatakan, pengesahan UU TPKS menjadi un...",WendyHolye
9,2022-04-24 00:30:59+00:00,1518024671383662592,Ketua DPR RI Puan Maharani mendapat penghargaa...,WendyHolye


In [3]:
df.drop(["Datetime", "Tweet Id", "Username"], axis = 1, inplace = True)
df.columns = df.columns.str.strip().str.lower()
df.head(10)

Unnamed: 0,tweet
0,Menangani Kekerasan Seksual Setelah RUU TPKS D...
1,Menangani Kekerasan Seksual Setelah RUU TPKS D...
2,Wakil Ketua MPR RI Minta Mahasiswa Kawal Imple...
3,Fadel Muhammad Minta Mahasiswa Kawal Implement...
4,"Lihat tanggal chatnya, kalau setelah April uda..."
5,@convomf Kelakuan kayak gini bisa masuk ranah ...
6,"@iyadehalpukat besar. Nah, UU TPKS ini membuka..."
7,#Opini - Mochammad Abizar Yusro - Terang Sinar...
8,"Puan mengatakan, pengesahan UU TPKS menjadi un..."
9,Ketua DPR RI Puan Maharani mendapat penghargaa...


## Case Folding
Mengubah huruf kapital (uppercase) menjadi huruf kecil (lowercase) (library pandas)

In [4]:
df['tweet'] = df['tweet'].str.lower()
print('Hasil Case Folding : \n')
print(df['tweet'].head(10))
print('\n\n\n')

Hasil Case Folding : 

0    menangani kekerasan seksual setelah ruu tpks d...
1    menangani kekerasan seksual setelah ruu tpks d...
2    wakil ketua mpr ri minta mahasiswa kawal imple...
3    fadel muhammad minta mahasiswa kawal implement...
4    lihat tanggal chatnya, kalau setelah april uda...
5    @convomf kelakuan kayak gini bisa masuk ranah ...
6    @iyadehalpukat besar. nah, uu tpks ini membuka...
7    #opini - mochammad abizar yusro - terang sinar...
8    puan mengatakan, pengesahan uu tpks menjadi un...
9    ketua dpr ri puan maharani mendapat penghargaa...
Name: tweet, dtype: object






## Cleansing Data
Menghapus seluruh tanda baca, link, hashtag, mention

In [5]:
def remove_tweet_special(text):
    # remove tab, new line, ans back slice
    text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    # remove non ASCII (emoticon, chinese word, .etc)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention, link, hashtag
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    #remove url (menghapus link)
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)(?:(?:\/[^\s/]))*', '', text)
    # remove incomplete URL
    return text.replace("http://", " ").replace("https://", " ")
                
df['tweet'] = df['tweet'].apply(remove_tweet_special)

#remove number (menghapus angka)
def remove_number(text):
    return  re.sub(r"\d+", "", text)

df['tweet'] = df['tweet'].apply(remove_number)

#remove punctuation (menghapus tanda baca)
def remove_punctuation(text):
    return text.translate(str.maketrans("","",string.punctuation))

df['tweet'] = df['tweet'].apply(remove_punctuation)

# remove single char (menghapus 1 karakter)
def remove_singl_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

df['tweet'] = df['tweet'].apply(remove_singl_char)

#remove whitespace leading & trailing (menghapus spasi awal dan akhir)
def remove_whitespace_LT(text):
    return text.strip()

df['tweet'] = df['tweet'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    return re.sub('\s+',' ',text)

df['tweet'] = df['tweet'].apply(remove_whitespace_multiple)

df.head(10)

Unnamed: 0,tweet
0,menangani kekerasan seksual setelah ruu tpks d...
1,menangani kekerasan seksual setelah ruu tpks d...
2,wakil ketua mpr ri minta mahasiswa kawal imple...
3,fadel muhammad minta mahasiswa kawal implement...
4,lihat tanggal chatnya kalau setelah april udah...
5,kelakuan kayak gini bisa masuk ranah uu tpks g...
6,besar nah uu tpks ini membuka sistem peradilan...
7,mochammad abizar yusro terang sinar uu tpks da...
8,puan mengatakan pengesahan uu tpks menjadi und...
9,ketua dpr ri puan maharani mendapat penghargaa...


## Tokenizing the Data
Membagi kalimat menjadi per kata / bagian-bagian kata

In [6]:
nltk.download('punkt')
def word_tokenize_wrapper(text):
    return word_tokenize(text)

df['tweet'] = df['tweet'].apply(word_tokenize_wrapper)
df['tweet']

[nltk_data] Downloading package punkt to /home/anz007/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0        [menangani, kekerasan, seksual, setelah, ruu, ...
1        [menangani, kekerasan, seksual, setelah, ruu, ...
2        [wakil, ketua, mpr, ri, minta, mahasiswa, kawa...
3        [fadel, muhammad, minta, mahasiswa, kawal, imp...
4        [lihat, tanggal, chatnya, kalau, setelah, apri...
                               ...                        
15627    [koordinator, forum, perempuan, indonesia, ber...
15628    [puan, maharani, dinilai, penuhi, harapan, kau...
15629    [keberadaan, uu, tpks, kata, puan, maharani, a...
15630    [puan, menyebut, kehadiran, uu, tpks, nantinya...
15631    [ketua, dpr, ri, puan, maharani, menegaskan, r...
Name: tweet, Length: 15632, dtype: object

## Normalizing the Data
Mengubah kata yg tidak baku/kata yg disingkat menjadi kata baku

In [7]:
#normalizad_word = pd.read_excel("normalisasi.xlsx")
! pip install openpyxl
normalizad_word = pd.read_excel("data/normalisasi.xlsx"
                                ,engine='openpyxl')

normalizad_word_dict = {}

for index, row in normalizad_word.iterrows():
    if row[0] not in normalizad_word_dict:
        normalizad_word_dict[row[0]] = row[1] 

def normalized_term(document):
    return [normalizad_word_dict[term] if term 
            in normalizad_word_dict else term for term in document]

df['tweet'] = df['tweet'].apply(normalized_term)

df.head(10)

You should consider upgrading via the '/media/DATA/Apps-Files/Linux/Miniconda3/envs/data-mining/bin/python3.9 -m pip install --upgrade pip' command.[0m[33m
[0m

Unnamed: 0,tweet
0,"[menangani, kekerasan, seksual, setelah, ruu, ..."
1,"[menangani, kekerasan, seksual, setelah, ruu, ..."
2,"[wakil, ketua, mpr, ri, minta, mahasiswa, kawa..."
3,"[fadel, muhammad, minta, mahasiswa, kawal, imp..."
4,"[lihat, tanggal, chatnya, kalau, setelah, apri..."
5,"[kelakuan, kayak, gini, bisa, masuk, ranah, uu..."
6,"[besar, nah, uu, tpks, ini, membuka, sistem, p..."
7,"[mochammad, abizar, yusro, terang, sinar, uu, ..."
8,"[puan, mengatakan, pengesahan, uu, tpks, menja..."
9,"[ketua, dpr, ri, puan, maharani, mendapat, pen..."


In [8]:
df.to_csv('data/temp/tweets_bf_stpwrd.csv', index=False)

## Stopword Removal
Membuang kata yang tidak memiliki makna atau arti penting

In [9]:
nltk.download('stopwords')
list_stopwords = stopwords.words('indonesian')
list_stopwords.extend(['uu', 'tpks', 'uu tpks', 'ruu', 'uutpks', "puan", "maharani", "ketua", "dpr"])
txt_stopword = pd.read_csv("data/stopwords.txt", names= ["stopwords"], header = None)
list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))
list_stopwords = set(list_stopwords)
def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]
df['tweet'] = df['tweet'].apply(stopwords_removal) 
df

[nltk_data] Downloading package stopwords to /home/anz007/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,tweet
0,"[menangani, kekerasan, seksual, disahkan, enam..."
1,"[menangani, kekerasan, seksual, disahkan, enam..."
2,"[wakil, mpr, ri, mahasiswa, kawal, implementasi]"
3,"[fadel, muhammad, mahasiswa, kawal, implementasi]"
4,"[tanggal, chatnya, april, dijerat]"
...,...
15627,"[koordinator, forum, perempuan, indonesia, ber..."
15628,"[dinilai, penuhi, harapan, kaum, perempuan]"
15629,"[keberadaan, payung, hukum, merehabilitasi, pe..."
15630,"[menyebut, kehadiran, wujud, keberpihakan, neg..."


In [10]:
bruto = int(len(df_tweets))
df_tweets.drop_duplicates(subset=['Tweet'])
print("Dataset dibuang (Karena duplikat) : "+(str(bruto-int(len(df_tweets))))+" data")
print("Dataset masuk : "+str(len(df_tweets))+" data")
df.to_csv('data/tweets_clean.csv', index=False)
df = pd.read_csv('data/tweets_clean.csv', encoding='latin1')
df.head(10)

Unnamed: 0,tweet
0,"['menangani', 'kekerasan', 'seksual', 'disahka..."
1,"['menangani', 'kekerasan', 'seksual', 'disahka..."
2,"['wakil', 'mpr', 'ri', 'mahasiswa', 'kawal', '..."
3,"['fadel', 'muhammad', 'mahasiswa', 'kawal', 'i..."
4,"['tanggal', 'chatnya', 'april', 'dijerat']"
5,"['kelakuan', 'gini', 'ranah']"
6,"['membuka', 'sistem', 'peradilan', 'diharapkan..."
7,"['mochammad', 'abizar', 'yusro', 'terang', 'si..."
8,"['pengesahan', 'undang undang', 'bentuk', 'had..."
9,"['ri', 'penghargaan', 'elemen', 'perempuan', '..."
