In [82]:
import torch
import numpy as np
import pandas as pd
import re
import os
from nltk.tokenize import WordPunctTokenizer
import itertools


In [83]:
# Report the installed PyTorch build and which CUDA devices it can see.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")

cuda_ready = torch.cuda.is_available()
print("CUDA is available:", cuda_ready)

if cuda_ready:
    device_total = torch.cuda.device_count()
    print(f"Number of CUDA devices: {device_total}")
    # One name line and one compute-capability line per visible device.
    for i in range(device_total):
        print(f"Device {i}: {torch.cuda.get_device_name(i)}")
        print(f"Compute capability: {torch.cuda.get_device_capability(i)}")

PyTorch version: 2.5.1
CUDA version: 12.4
CUDA is available: True
Number of CUDA devices: 1
Device 0: NVIDIA GeForce GTX 1650
Compute capability: (7, 5)


## Load Data

In [84]:
# Data directories, given relative to the notebook's working directory.
EXTERNAL_DATA_PATH = './data/external'    # auxiliary lexicons (emoticon.txt, slangword.csv)
RAW_DATA_PATH = './data/raw'              # train.csv / test.csv are read from here
PROCESSED_DATA_PATH = './data/processed'

In [85]:
# Load the train and test splits from the raw data directory.
train_data = pd.read_csv(f'{RAW_DATA_PATH}/train.csv')
test_data = pd.read_csv(f'{RAW_DATA_PATH}/test.csv')

In [86]:
# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,original_text,source,pornografi,sara,radikalisme,pencemaran_nama_baik
0,[QUOTE=jessepinkman16;5a50ac34d89b093f368b456e...,kaskus,0,0,0,1
1,"@verosvante kita2 aja nitizen yang pada kepo,t...",instagram,0,0,0,0
2,"""#SidangAhok smg sipenista agama n ateknya mat...",twitter,0,1,1,1
3,@bolususulembang.jkt barusan baca undang2 ini....,instagram,0,0,0,0
4,bikin anak mulu lu nof \nkaga mikir apa kasian...,kaskus,0,0,0,0


In [87]:
# Preview the first rows of the test split.
test_data.head()

Unnamed: 0,original_text,source,pornografi,sara,radikalisme,pencemaran_nama_baik
0,"1.BUKAN CM SPANDUK PROF,VIDEO2 ORASI MEREKA, B...",twitter,0,0,1,0
1,@memeqbeceq gy sange'gatel yh tetek'memekY drn...,twitter,1,0,0,0
2,Pertama kali denger lagunya enk bgt in dan png...,instagram,0,0,0,0
3,"astajim, ini pasti yg kasih penghargaan ke ibu...",kaskus,0,0,0,0
4,beda kalo disini kalo komplain lgs di bully am...,kaskus,0,0,0,0


## Preprocess data

### Translate emoticons into plain, human-readable words

In [88]:
# Read the tab-separated emoticon file (it has no header row) and build a
# lookup from its first column to its second column. translate_emoticon uses
# the keys as the text to find and the values as the replacement text.
EMOTICON_DATA_PATH = f'{EXTERNAL_DATA_PATH}/emoticon.txt'
emoticon_df = pd.read_csv(EMOTICON_DATA_PATH, sep='\t', header=None)
emoticon_dict = dict(zip(emoticon_df[0], emoticon_df[1]))

def translate_emoticon(t, mapping=None):
    """Replace every emoticon occurring in ``t`` with its word translation.

    Parameters
    ----------
    t : str
        Text that may contain emoticons.
    mapping : dict, optional
        Emoticon -> replacement text. When omitted, the module-level
        ``emoticon_dict`` is used, so existing one-argument calls are unchanged.

    Returns
    -------
    str
        ``t`` with each emoticon key substituted by its value. Keys are applied
        one after another in the mapping's iteration order.
    """
    if mapping is None:
        mapping = emoticon_dict
    for emoticon, meaning in mapping.items():
        # Literal substring replacement: unlike re.sub, the replacement text is
        # not parsed as a regex template, so a backslash in `meaning` is kept
        # verbatim instead of raising re.error or being rewritten. It also
        # avoids compiling one pattern per lexicon entry on every call.
        t = t.replace(emoticon, meaning)
    return t

In [89]:
# Quick check: translate two sample sentences that each end with an emoticon.
example_text = 'senang sekali berada disini :)'
example_text1 = 'saya sangat senang :@'

print(f'Before : {example_text}')
print(f'After : {translate_emoticon(example_text)}')

print(f'Before : {example_text1}')
print(f'After : {translate_emoticon(example_text1)}')

Before : senang sekali berada disini :)
After : senang sekali berada disini Senyum
Before : saya sangat senang :@
After : saya sangat senang Berteriak


### Remove excessive newline

In [90]:
def remove_newline(text):
    """Return ``text`` with every newline character turned into a single space."""
    flattened = text.replace('\n', ' ')
    return flattened


# Quick check on a string with a newline after almost every word.
sample_text = 'halo saya\nadalah\nmahasiswa\ndi\ntelkom university\n'

print(f'Before : {sample_text}')
print(f'After : {remove_newline(sample_text)}')

Before : halo saya
adalah
mahasiswa
di
telkom university

After : halo saya adalah mahasiswa di telkom university 


### Remove kaskus formatting

In [91]:
def remove_kaskus_formatting(text):
    """Strip Kaskus BBCode-style markup and double quotes from ``text``.

    The steps run in this order:

    1. Pad every ``[`` and ``]`` with a space so tags detach from neighbouring
       words.
    2. Replace each ``[quote...]...[/quote]`` block, quoted content included,
       with a space. Matching ignores case and may span newlines, so
       ``[QUOTE=user;id]...[/QUOTE]`` is removed as well.
    3. Replace any remaining bracketed tag that contains no space with a space.
    4. Replace double-quote characters with a space.

    Parameters
    ----------
    text : str
        Raw post text.

    Returns
    -------
    str
        The text without markup. Runs of spaces are left in place; collapsing
        them is a separate step.
    """
    # Raw string literals keep regex escapes such as `\[` intact instead of
    # relying on Python passing unknown string escapes through (which warns on
    # recent Python versions).
    text = re.sub(r'\[', ' [', text)
    text = re.sub(r'\]', '] ', text)
    text = re.sub(
        r'\[quote[^ ]*\].*?\[/quote\]',
        ' ',
        text,
        flags=re.IGNORECASE | re.DOTALL,
    )
    text = re.sub(r'\[[^ ]*\]', ' ', text)
    text = re.sub('"', ' ', text)
    return text

In [92]:
# Quick check: both [quote]...[/quote] blocks should disappear, content included.
sample_text = '[quote]saya sangat senang[/quote] halo saya adalah mahasiswa di telkom university [quote]saya sangat senang[/quote]'
print(f'Before : {sample_text}')
print(f'After : {remove_kaskus_formatting(sample_text)}')

Before : [quote]saya sangat senang[/quote] halo saya adalah mahasiswa di telkom university [quote]saya sangat senang[/quote]
After :     halo saya adalah mahasiswa di telkom university    


### Remove url

In [93]:
def remove_url(text):
    """Delete URLs from ``text``.

    A URL is any run of non-whitespace characters that starts with ``www.``,
    ``http://`` or ``https://`` (case-insensitive). Whitespace around the URL
    is left untouched.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The text with every matched URL removed.
    """
    # Raw string so `\.` and `\S` reach the regex engine as written. The old
    # third alternative `http?://` only duplicated `https?://` and is dropped.
    return re.sub(r'(?:www\.|https?://)\S+', '', text, flags=re.IGNORECASE)

# Quick check: the trailing URL should be removed.
sample_text = 'website telkom university ada di http://www.telkomuniversity.ac.id'
print(f'Before : {sample_text}')
print(f'After : {remove_url(sample_text)}')

Before : website telkom university ada di http://www.telkomuniversity.ac.id
After : website telkom university ada di 


### Remove excessive whitespace

In [94]:
def remove_excessive_whitespace(text):
    """Collapse each run of two or more space characters into a single space.

    Only the space character is affected; tabs and newlines are left as they are.
    """
    space_run = re.compile('  +')
    collapsed = space_run.sub(' ', text)
    return collapsed

In [95]:
# Quick check: runs of four spaces should shrink to one.
sample_text = 'saya    sangat    senang sekali'
print(f'Before : {sample_text}')
print(f'After : {remove_excessive_whitespace(sample_text)}')

Before : saya    sangat    senang sekali
After : saya sangat senang sekali


### Tokenize text for word punctuation

In [96]:
def tokenize_text(text, punct=False):
    """Tokenize ``text`` with NLTK's WordPunctTokenizer and re-join with spaces.

    Parameters
    ----------
    text : str
        Input text.
    punct : bool, optional
        When false (the default), only tokens for which ``str.isalnum()`` is
        true are kept; when true, every token is kept.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, stripped at both ends.
    """
    tokens = WordPunctTokenizer().tokenize(text)
    if not punct:
        tokens = [token for token in tokens if token.isalnum()]
    return ' '.join(tokens).strip()

In [97]:
# Quick check: commas and the exclamation mark should be dropped.
sample_text = 'kemarin, saya pergi ke mall, dan membeli baju baru!'

print(f'Before : {sample_text}')
print(f'After  : {tokenize_text(sample_text)}')

Before : kemarin, saya pergi ke mall, dan membeli baju baru!
After  : kemarin saya pergi ke mall dan membeli baju baru


### Transform slang words

In [98]:
# Load the slang lexicon and build a lookup from its 'original' column to its
# 'translated' column. transform_slang_words looks up both single words and
# two-word phrases in this dict.
slang_words = pd.read_csv(f'{EXTERNAL_DATA_PATH}/slangword.csv')
slang_dict = dict(zip(slang_words['original'], slang_words['translated']))

def transform_slang_words(text):
    """Replace slang in ``text`` using the module-level ``slang_dict``.

    The text is split on whitespace and scanned left to right. At each
    position the two-word phrase starting there is tried first; if it is a key
    of ``slang_dict`` both words are consumed and replaced by its value.
    Otherwise the single word is replaced by its dictionary value, or kept
    unchanged when it has no entry.

    Returns the resulting words joined by single spaces.
    """
    tokens = text.split()
    total = len(tokens)
    normalized = []
    position = 0
    while position < total:
        has_next = position + 1 < total
        bigram = ' '.join(tokens[position:position + 2])
        if has_next and bigram in slang_dict:
            # A two-word phrase wins over translating its words one by one.
            normalized.append(slang_dict[bigram])
            position += 2
        else:
            current = tokens[position]
            normalized.append(slang_dict.get(current, current))
            position += 1
    return ' '.join(normalized)

In [99]:
# Quick check on a sentence whose last token ('7an') is written as slang.
sample_text = 'siap mas sebentar lagi saya sampai 7an'

print(f'Before : {sample_text}')
print(f'After  : {transform_slang_words(sample_text)}')

Before : siap mas sebentar lagi saya sampai 7an
After  : siap mas sebentar lagi saya sampai tujuan


### Remove non alphabet

In [100]:
def remove_non_alphabet(text):
    """Delete every character that is neither an ASCII letter nor a space."""
    disallowed = re.compile('[^a-zA-Z ]+')
    return disallowed.sub('', text)

# Quick check: the digits glued to 'tu' should be removed.
sample_text = 'kemaren tu123 ada kelinci di kebun'

print(f'Before : {sample_text}')
print(f'After  : {remove_non_alphabet(sample_text)}')

Before : kemaren tu123 ada kelinci di kebun
After  : kemaren tu ada kelinci di kebun


### Remove twitter & instagram formatting

In [101]:
def remove_twitter_ig_formatting(text):
    """Remove ``@handle`` mentions and the standalone lowercase token ``rt``.

    A handle is ``@`` followed by word characters (letters, digits,
    underscore), optionally continued by dot-separated word segments, so
    ``@user_name`` and ``@user.name`` are removed in full. A dot that is not
    followed by a word character (for example sentence-ending punctuation) is
    not treated as part of the handle.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The text with mentions and ``rt`` tokens deleted; surrounding
        whitespace and punctuation are kept.
    """
    text = re.sub(r'@\w+(?:\.\w+)*', '', text)
    # Case-sensitive on purpose: only the lowercase retweet marker is dropped.
    text = re.sub(r'\brt\b', '', text)
    return text

# Quick check: both @fakhri17 mentions should be removed.
sample_text = '@fakhri17: halo saya adalah mahasiswa di telkom university @fakhri17'
print(f'Before : {sample_text}')
print(f'After  : {remove_twitter_ig_formatting(sample_text)}')

Before : @fakhri17: halo saya adalah mahasiswa di telkom university @fakhri17
After  : : halo saya adalah mahasiswa di telkom university 


### Remove repeating characters

In [102]:
def remove_repeating_characters(text):
    """Collapse every run of identical consecutive characters to one character.

    This applies to all characters, so legitimate double letters and repeated
    spaces are collapsed as well.
    """
    collapsed = []
    for char, _run in itertools.groupby(text):
        # groupby yields one (key, run) pair per run; the key is the character.
        collapsed.append(char)
    return ''.join(collapsed)

# Quick check: the stretched vowels and the doubled 'h' should collapse.
sample_text = 'haloooooooo saya adaaaaaaalahh mahasiswa di telkom university'
print(f'Before : {sample_text}')
print(f'After  : {remove_repeating_characters(sample_text)}')

Before : haloooooooo saya adaaaaaaalahh mahasiswa di telkom university
After  : halo saya adalah mahasiswa di telkom university


### Final preprocessing

In [None]:
def preprocess_text(text):
    """Run the full cleaning pipeline on one raw text and return the result.

    Steps, in order: lowercase; replace newlines with spaces; remove URLs;
    remove ``@mentions`` and ``rt``; remove Kaskus markup; translate emoticons;
    lowercase again (the emoticon translations may contain capitals);
    tokenize and drop non-alphanumeric tokens; translate slang; collapse
    repeated characters; translate slang once more (collapsing may have
    produced new dictionary hits); delete non-alphabetic characters; collapse
    runs of spaces; lowercase and strip.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    transformed_text = text.lower()
    # Feed the lowercased text forward. Previously the raw `text` was passed
    # here, which silently discarded the lowercasing above, so the lowercase
    # `[quote]` and `rt` patterns in later steps ran on mixed-case input.
    # NOTE(review): emoticon keys containing uppercase letters, if the lexicon
    # has any, no longer match after this lowercasing; confirm against
    # emoticon.txt.
    transformed_text = remove_newline(transformed_text)
    transformed_text = remove_url(transformed_text)
    transformed_text = remove_twitter_ig_formatting(transformed_text)
    transformed_text = remove_kaskus_formatting(transformed_text)
    transformed_text = translate_emoticon(transformed_text)
    transformed_text = transformed_text.lower()
    transformed_text = tokenize_text(transformed_text)
    transformed_text = transform_slang_words(transformed_text)
    transformed_text = remove_repeating_characters(transformed_text)
    transformed_text = transform_slang_words(transformed_text)
    transformed_text = remove_non_alphabet(transformed_text)
    transformed_text = remove_excessive_whitespace(transformed_text)
    transformed_text = transformed_text.lower().strip()
    return transformed_text


# End-to-end check of the whole pipeline on one sample.
# NOTE(review): the saved output of this cell shows a different "Before"
# string than the sample defined here; re-run the cell to refresh it.
sample_text = '@fakhri17: halooooooo saya adalah mahasiswa di telkom university surabaya :)'
print(f'Before : {sample_text}')
print(f'After  : {preprocess_text(sample_text)}')

Before : kemarin tu123 ada kelinci di kebun @fakhri17: halo saya adalah mahasiswa di telkom university surabaya :)
After  : kemarin tu ada kelinci di kebun halo saya adalah mahasiswa di telkom university surabaya senyum
