In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import re
import nltk
from nlp_id.lemmatizer import Lemmatizer
from langdetect import detect
from deep_translator import GoogleTranslator

In [2]:
# Import the twitter scrap file 'dapildki3.csv'
twitscrap = pd.read_csv('assets/data_folder/twitterscraped.csv', encoding='latin-1')

# Import dictionary file 'kamus_alay.csv' for non-formal words mapping
kamusalay = pd.read_csv('assets/dicts/kamus_alay.csv', encoding='latin-1', header=None)
# Rename the header for kamus_alay
kamusalay = kamusalay.rename(columns={0: 'original', 1: 'replacement'})
# Create a dictionary mapping non-formal words to their formal counterparts
kamusalay_map = dict(zip(kamusalay['original'], kamusalay['replacement']))

In [3]:
# Define a series of text pre-processing functions

def lowercase(text):
    return text.lower() # converts all text to lowercase

def remove_unnecessary_char(text):
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))',' ',text) # Remove URLs
    text = re.sub(r'@\w+', ' ', text) # Remove Twitter usernames
    text = re.sub(r'#\w+', ' ', text) # Remove Twitter hashtags
    text = re.sub('  +', ' ', text) # Remove multiple spaces
    return text
    
def remove_nonaplhanumeric(text):
    text = re.sub('[^0-9a-zA-Z]+', ' ', text)  # Remove any non-alphanumeric characters
    return text

def normalize_alay(text):
    return ' '.join([kamusalay_map[word] if word in kamusalay_map else word for word in text.split(' ')]) # Replace non-formal words

# Create an indonesian stopword object
id_stopword = nltk.corpus.stopwords.words('indonesian')

# Function to remove words from the texts
def remove_words(text):
    text = ' '.join(['' if word in id_stopword else word for word in text.split(' ')]) # Remove stopwords 
    # The 2 lines of code below is supposed to be a seperate function, but this works so dont bother
    text = re.sub('  +', ' ', text) # Remove multiple spaces
    text = text.strip()  # Remove trailing spaces
    return text

# Create a lemmatizer object
lemmatizer = Lemmatizer()
def stemming(text):
    return lemmatizer.lemmatize(text) # Lemmatizes texts

In [4]:
# Define the main preprocess function
def preprocess(text):
    text = remove_unnecessary_char(text)
    text = remove_nonaplhanumeric(text)
    text = stemming(text)
    text = lowercase(text)
    text = remove_words(text)
    text = normalize_alay(text)
    return text

In [5]:
# Apply the preprocess functions to the raw text files
twitscrap['Text_Processed'] = twitscrap['Text'].apply(preprocess)

# Additional cleanups after the preprocess
twitscrap = twitscrap.replace(r'^\s*$', np.nan, regex=True)
twitscrap = twitscrap.drop_duplicates(['Text_Processed'])
twitscrap = twitscrap.dropna()
twitscrap = twitscrap.reset_index(drop=True)

In [6]:
lang = []

# Detect the language of the processed texts
for i in twitscrap['Text_Processed']:
    try:
        lan = detect(i)
    except:
        lan = np.nan
    lang.append(lan)

In [7]:
# Insert a new column containing the language of each texts
twitscrap['Language'] = lang
twitscrap = twitscrap.dropna()

In [8]:
# Remove the columns that are not indonesian
twitscrap_id = twitscrap[twitscrap['Language']=='id']
twitscrap_id = twitscrap_id.reset_index(drop=True)

In [9]:
# Turn the 'Text_Processed' column into a list
texts = twitscrap_id['Text_Processed'].tolist()

# Define the batch size
batch_size = 500

# Break up the list into batches
batches = [texts[i:i+batch_size] for i in range(0, len(texts), batch_size)]

# Translate each batch separately (for performance)
translated = []
for batch in batches:
    translated_batch = GoogleTranslator('id', 'en').translate_batch(batch)
    translated.extend(translated_batch)

In [10]:
# Create a temporary dataframe for the translated texts
df = pd.DataFrame({'tl': translated})

# Insert a new column containing the translated texts
twitscrap_id['Text_Translated'] = df['tl']

In [11]:
twitscrap_id.to_csv('assets/data_folder/twittercleaned.csv', index=False)

In [12]:
twitscrap_id

Unnamed: 0,Datetime,Text,Keyword,Text_Processed,Language,Text_Translated
0,2015-12-28 18:00:42+00:00,Mencari perbandingan RUU KUHP &amp; KUHP!,RUU KUHP,cari banding rancangan undang undang kuhp amp ...,id,search for appeals of the draft law on the Cri...
1,2015-12-23 02:03:39+00:00,Stadium General: Perkembangan dan Masa Depan P...,RUU KUHP,stadium general kembang susun rancangan undang...,id,General Kembang Stadium drafts the Criminal Co...
2,2015-12-20 05:04:17+00:00,Rechtelijk Pardon di #RUUKUHP tidak tepat dite...,RUU KUHP,rechtelijk pardon terjemah maaf adil terjemah ...,id,"rechtelijk pardon translation of sorry, fair, ..."
3,2015-12-19 10:05:37+00:00,Kapan RUU KUHP kita di sahkan????,RUU KUHP,rancangan undang undang kuhp sahkan,id,the draft criminal code law is ratified
4,2015-12-19 09:48:27+00:00,"@SoundOfYogi udah ga ada mit, dicabut mk, tapi...",RUU KUHP,sudah tidak mit pergi mk rancangan undang unda...,id,I can't go anymore now that the draft Criminal...
...,...,...,...,...,...,...
6413,2023-02-13 08:26:22+00:00,@ediwanqu Betul...jangan sampai menunggu revis...,revisi RKUHP,tunggu revisi undang undang rkuhp laku 3 lagi,id,Wait for the revision of the RKUHP law to take...
6414,2023-02-06 17:45:13+00:00,Yang jelas akan melebihi apa yang terjadi dala...,revisi RKUHP,aksi tolak rkuhp 2019 revisi undang undang kom...,id,"action to reject the 2019 RKUHP, revision of t..."
6415,2023-01-29 03:00:30+00:00,Revisi RKUHP tidak mudah tapi tetap kita dukun...,revisi RKUHP,revisi rkuhp mudah dukung rkuhp adil rfbrsltbmo,id,Easy RKUHP revision supports fair RKUHP RFBRSL...
6416,2023-01-13 06:38:09+00:00,KUHP Baru Sangat Baik.. Revisi RKUHP tidak mud...,revisi RKUHP,kuhp revisi rkuhp mudah dukung,id,The revised Criminal Code is easy to support
