In [12]:
import torch
import numpy as np
import pandas as pd
import re
import os

In [4]:
# Summarise the PyTorch build and the CUDA devices visible to this kernel.
cuda_ready = torch.cuda.is_available()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print("CUDA is available:", cuda_ready)

if cuda_ready:
    device_total = torch.cuda.device_count()
    print(f"Number of CUDA devices: {device_total}")
    for device_index in range(device_total):
        # One name line and one compute-capability line per device.
        print(f"Device {device_index}: {torch.cuda.get_device_name(device_index)}")
        print(f"Compute capability: {torch.cuda.get_device_capability(device_index)}")

PyTorch version: 2.5.1
CUDA version: 12.4
CUDA is available: True
Number of CUDA devices: 1
Device 0: NVIDIA GeForce GTX 1650
Compute capability: (7, 5)


## Load Data

In [17]:
# Data folders, written relative to the current working directory.
RAW_DATA_PATH = './data/raw'              # train.csv / test.csv are read from here
EXTERNAL_DATA_PATH = './data/external'    # auxiliary resources such as emoticon.txt
PROCESSED_DATA_PATH = './data/processed'  # presumably for cleaned output; not used in the visible cells

In [18]:
# Read the train and test splits from the CSV files under RAW_DATA_PATH.
# NOTE(review): the previews of these frames show an "Unnamed: 0" column, which
# looks like a saved row index; consider index_col=0 if nothing downstream
# relies on that column.
train_data = pd.read_csv(f'{RAW_DATA_PATH}/train.csv')
test_data = pd.read_csv(f'{RAW_DATA_PATH}/test.csv')

In [19]:
# Preview the first rows of the training split (text, source, and label columns).
train_data.head()

Unnamed: 0,original_text,source,pornografi,sara,radikalisme,pencemaran_nama_baik
0,[QUOTE=jessepinkman16;5a50ac34d89b093f368b456e...,kaskus,0,0,0,1
1,"@verosvante kita2 aja nitizen yang pada kepo,t...",instagram,0,0,0,0
2,"""#SidangAhok smg sipenista agama n ateknya mat...",twitter,0,1,1,1
3,@bolususulembang.jkt barusan baca undang2 ini....,instagram,0,0,0,0
4,bikin anak mulu lu nof \nkaga mikir apa kasian...,kaskus,0,0,0,0


In [20]:
# Preview the first rows of the test split (text, source, and label columns).
test_data.head()

Unnamed: 0,original_text,source,pornografi,sara,radikalisme,pencemaran_nama_baik
0,"1.BUKAN CM SPANDUK PROF,VIDEO2 ORASI MEREKA, B...",twitter,0,0,1,0
1,@memeqbeceq gy sange'gatel yh tetek'memekY drn...,twitter,1,0,0,0
2,Pertama kali denger lagunya enk bgt in dan png...,instagram,0,0,0,0
3,"astajim, ini pasti yg kasih penghargaan ke ibu...",kaskus,0,0,0,0
4,beda kalo disini kalo komplain lgs di bully am...,kaskus,0,0,0,0


## Preprocess data

### Translate emoticons into human-readable words

In [27]:
# Load the emoticon lookup table from a headerless, tab-separated text file.
EMOTICON_DATA_PATH = f'{EXTERNAL_DATA_PATH}/emoticon.txt'
emoticon_df = pd.read_csv(EMOTICON_DATA_PATH, sep='\t', header=None)
# Column 0 becomes the key and column 1 the value; if a key repeats, the last
# row wins (plain dict construction). Presumably keys are emoticons such as
# ":)" and values are their word equivalents — verify against the file.
emoticon_dict = dict(zip(emoticon_df[0], emoticon_df[1]))

def translate_emoticon(t, mapping=None):
    """Replace every known emoticon in ``t`` with its textual replacement.

    Args:
        t: Input text.
        mapping: Optional ``{emoticon: replacement}`` dict. When omitted, the
            module-level ``emoticon_dict`` is used.

    Returns:
        The text with each occurrence of each emoticon substituted. Entries are
        applied one after another in the mapping's iteration order, so text
        produced by an earlier entry is visible to later entries.
    """
    if mapping is None:
        mapping = emoticon_dict
    for emoticon, meaning in mapping.items():
        # Literal substring replacement: unlike re.sub, the replacement text is
        # not parsed as a regex template, so backslashes in ``meaning`` are
        # kept verbatim instead of raising or being reinterpreted.
        if emoticon in t:
            t = t.replace(emoticon, meaning)
    return t

In [29]:
# Show the emoticon translation on two short example sentences.
example_text = 'senang sekali berada disini :)'
example_text1 = 'saya sangat senang :@'

for demo_text in (example_text, example_text1):
    print(f'Before : {demo_text}')
    print(f'After : {translate_emoticon(demo_text)}')

Before : senang sekali berada disini :)
After : senang sekali berada disini Senyum
Before : saya sangat senang :@
After : saya sangat senang Berteriak


### Replace newlines with spaces

In [31]:
def remove_newline(text):
    """Replace every line break in ``text`` with a single space.

    Recognises LF, CRLF and bare CR line endings, so Windows-style line breaks
    do not leave a stray carriage return behind. Consecutive line breaks yield
    consecutive spaces; no other whitespace is touched.

    Args:
        text: Input string.

    Returns:
        The string with each line break replaced by one space.
    """
    # "\r\n?" consumes a CRLF pair as a single break before falling back to "\n".
    return re.sub(r'\r\n?|\n', ' ', text)


# Demonstrate the newline cleanup on a short multi-line string.
sample_text = 'halo saya\nadalah\nmahasiswa\ndi\ntelkom university\n'
cleaned_sample = remove_newline(sample_text)

print(f'Before : {sample_text}')
print(f'After : {cleaned_sample}')

Before : halo saya
adalah
mahasiswa
di
telkom university

After : halo saya adalah mahasiswa di telkom university 


### Remove Kaskus formatting (bracketed forum tags)

In [32]:
def remove_kaskus_formatting(text):
    """Strip bracketed forum tags, quoted blocks and double quotes from ``text``.

    Steps, in order:
      1. Pad every ``[`` and ``]`` with a space so tags are separated from the
         words next to them.
      2. Replace each ``[quote...]...[/quote]`` block, including the quoted
         content, with a space. Matching ignores case (so ``[QUOTE=...]`` is
         handled) and may span several lines.
      3. Replace any remaining bracketed tag that contains no space with a
         space.
      4. Replace double-quote characters with spaces.

    Whitespace is not collapsed, so the result can contain runs of spaces.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    """
    text = text.replace('[', ' [').replace(']', '] ')
    # IGNORECASE: forum exports use upper-case tags such as "[QUOTE=user;id]".
    # DOTALL: let ".*?" cross line breaks inside a quoted block.
    text = re.sub(r'\[quote[^ ]*\].*?\[/quote\]', ' ', text,
                  flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'\[[^ ]*\]', ' ', text)
    return text.replace('"', ' ')

In [None]:
# Demo on a forum-style quote block. The closing tag needs its final "]" to be
# recognised as a tag; without it the "[/QUOTE" fragment is left in the output.
sample_text = '[QUOTE=jessepinkman16;5a50ac34d89b093f368b456e]yoiii cuy halo semuanya[/QUOTE]'
print(f'Before : {sample_text}')
print(f'After : {remove_kaskus_formatting(sample_text)}')

Before : [quote]saya sangat senang[/quote] halo saya adalah mahasiswa di telkom university [quote]saya sangat senang[/quote]
After :     halo saya adalah mahasiswa di telkom university    
