### Import Modules

In [48]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re
from itertools import groupby, islice

Run the code below if the stopwords list haven't been downloaded

In [26]:
# import nltk
# nltk.download('stopwords')

### Load The Dataset
<a href="https://www.kaggle.com/datasets/bondanvitto/indonesia-twitter-comment-labeled-with-ite-law">Dataset Link</a>

In [27]:
df = pd.read_csv('data/Dataset Twitter Fix - Indonesian Sentiment Twitter Dataset Labeled.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12647 entries, 0 to 12646
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   sentimen  12646 non-null  float64
 1   Tweet     12647 non-null  object 
dtypes: float64(1), object(1)
memory usage: 197.7+ KB


### Checking for null values

In [28]:
df.isna().sum()

sentimen    1
Tweet       0
dtype: int64

### Penggantian nilai kosong the modus

In [29]:
# Import Module
from sklearn.impute import SimpleImputer

# Pembuatan Imputation dengan pengganti modus
impute_modes = SimpleImputer(strategy='most_frequent')

df['sentimen'] = impute_modes.fit_transform(df[['sentimen']])

In [30]:
# Checking for null values
df.isna().sum()

sentimen    0
Tweet       0
dtype: int64

### Mengambil data hanya yang berasal dari 2 kelas
2 Kelas tersebut adalah:
 - 1 = Sentimen Positif
 - 2 = Sentimen Negatif

In [31]:
df = df.loc[df['sentimen'].isin((1.0, 2.0))]

### Load The Supplementary Datasets (Nggak Jadi)
<a href="https://www.kaggle.com/datasets/dennisherdi/indonesian-twitter-emotion">Dataset Link</a>

In [32]:
# df_supplements = pd.read_csv('data/Twitter_Emotion_Dataset.csv')
# df_supplements.info()

In [33]:
# df_supplements.isna().sum()

In [34]:
# df_supplements.rename(columns={'label': 'sentimen', 'tweet': 'Tweet'}, inplace=True)
# df_supplements = df_supplements.loc[df_supplements['sentimen'].isin(('happy', 'love'))]
# df_supplements = df_supplements.assign(sentimen=1.0)
# df_supplements.head()

In [35]:
# print(f"Original Data Frame Length = {len(df)}")
# print(f"Supplements Data Frame Length = {len(df_supplements)}")

In [36]:
# df = pd.concat([df, df_supplements])

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6980 entries, 5327 to 12607
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   sentimen  6980 non-null   float64
 1   Tweet     6980 non-null   object 
dtypes: float64(1), object(1)
memory usage: 163.6+ KB


In [38]:
df.duplicated().sum()

316

In [39]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6664 entries, 5327 to 12607
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   sentimen  6664 non-null   float64
 1   Tweet     6664 non-null   object 
dtypes: float64(1), object(1)
memory usage: 156.2+ KB


In [41]:
tweets = df['Tweet']
tweets.head()

5327    doa rezeki tak putus inna haa zaa larizquna ma...
5328    makasih loh ntar kita bagi hasil aku 99 9 sisa...
5329    ya aku akan menjadi satu satunya bukan nomor s...
5330    i dont know why but these zikir sangat membant...
5331              aah kamu aja mas aku lebih suka diayomi
Name: Tweet, dtype: object

### Berbagai Pre-processing

In [42]:
def custom_standardisation(input_data: str):
    # remove tag
    input_data = re.sub(r"(URL)|(USER)", '', input_data)
    # remove unicode and newline
    input_data = re.sub(r"[\\]x\w{2}|'|\\n", '', input_data)
    # remove non alphanumeric character
    input_data = re.sub(r'[^a-zA-Z ]+', '', input_data)
    # remove single char
    input_data = re.sub(r'\b[a-zA-Z]\b', '', input_data)
    # remove repeated words
    input_data = " ".join([k for k,v in groupby(input_data.split())])
    # remove wkwkwkwkwkwk
    input_data =  re.sub(r'(k)?(wk){2,}(w)?', 'wk', input_data)
    # remove hehehehehehe
    input_data =  re.sub(r'(e)?(he){2,}(h)?', 'he', input_data)
    # fold case
    input_data = input_data.casefold()
    # remove whitespace in front and in the back
    input_data = input_data.strip()
    # remove multiple spaces
    input_data = re.sub(r' {2,}', ' ', input_data)
    return input_data

tweets = tweets.apply(custom_standardisation)
tweets.head()

5327    doa rezeki tak putus inna haa zaa larizquna ma...
5328    makasih loh ntar kita bagi hasil aku sisanya b...
5329    ya aku akan menjadi satu satunya bukan nomor s...
5330    dont know why but these zikir sangat membantu ...
5331              aah kamu aja mas aku lebih suka diayomi
Name: Tweet, dtype: object

In [43]:
custom_standardisation("Hello Hello my my my name name is gilang raditya")

'hello wkwkwkwkwkwk my name is gilang raditya'

### Ngembangkan Kata Dari Singkatan

In [44]:
singkatan_df = pd.read_csv('kamus_singkatan.csv', index_col='singkatan')
singkatan_dict = singkatan_df.iloc[:, 1].to_dict()

def expand_abbr(text: str):
    return ' '.join((singkatan_dict.get(word, word) for word in text.split()))

tweets = tweets.apply(expand_abbr)
tweets.head()

5327    doa rezeki tak putus inna haa zaa larizquna ma...
5328    terima kasih loh nanti kita bagi hasil saya si...
5329    ya saya akan menjadi satu satunya bukan nomor ...
5330    dont know why but these zikir sangat membantu ...
5331            aah kamu saja mas saya lebih suka diayomi
Name: Tweet, dtype: object

### Ngubah Kata Jadi Kata Dasar

In [45]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem_wrapper(text: str):
    return stemmer.stem(text)

for i, tweet in enumerate(tweets):
    tweets[i] = stem_wrapper(tweet)
    if i%50 == 0:
        print(i, end=', ')

tweets.head()

0, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1550, 1600, 1650, 1700, 1750, 1800, 1850, 1900, 1950, 2000, 2050, 2100, 2150, 2200, 2250, 2300, 2350, 2400, 2450, 2500, 2550, 2600, 2650, 2700, 2750, 2800, 2850, 2900, 2950, 3000, 3050, 3100, 3150, 3200, 3250, 3300, 3350, 3400, 3450, 3500, 3550, 3600, 3650, 3700, 3750, 3800, 3850, 3900, 3950, 4000, 4050, 4100, 4150, 4200, 4250, 4300, 4350, 4400, 4450, 4500, 4550, 4600, 4650, 4700, 4750, 4800, 4850, 4900, 4950, 5000, 5050, 5100, 5150, 5200, 5250, 5300, 5350, 5400, 5450, 5500, 5550, 5600, 5650, 5700, 5750, 5800, 5850, 5900, 5950, 6000, 6050, 6100, 6150, 6200, 6250, 6300, 6350, 6400, 6450, 6500, 6550, 6600, 6650, 

5327    titik air mata saya bayang betapa terseksa kuc...
5328    kalau bnda itu jadi lagi saya mals deh ingin rayu
5329                complicated betul saya usha dak ptptn
5330    harus unfollow semua olshop yang seliwer di ig...
5331                      saya pamit dan jangan cari saya
Name: Tweet, dtype: object

In [49]:
stopwords_list = stopwords.words('indonesian')

# convert list to dictionary
stopwords_set = set(stopwords_list)
for i in islice(stopwords_set, 10):
    print(i, end=", ")

berkali-kali, ibaratnya, menyeluruh, paling, kesampaian, misalkan, apatah, berapakah, malahan, akhir, 

In [50]:
#remove stopword pada list token
def stopwords_removal(words):
    return ''.join([word for word in words if word not in stopwords_set])


tweets = tweets.apply(stopwords_removal)
tweets.head()

5327    titik air mata saya bayang betapa terseksa kuc...
5328    kalau bnda itu jadi lagi saya mals deh ingin rayu
5329                complicated betul saya usha dak ptptn
5330    harus unfollow semua olshop yang seliwer di ig...
5331                      saya pamit dan jangan cari saya
Name: Tweet, dtype: object

In [51]:
df_backup = df.copy()
tweets_backup = tweets.copy().reset_index(drop=True)
df_backup = df_backup.assign(Tweet=tweets_backup )
df_backup.to_csv('data/cleaned_twitter.csv', index=False)