In [1]:
import pandas as pd
import re
import regex            
import unicodedata
import sys
import time
from tqdm.auto import tqdm
try:
    import emoji
except ImportError:
    raise ImportError("Library 'emoji' belum terpasang. Jalankan: pip install emoji")

# Load the dataset; the rest of the notebook reads the 'text' column of `df`.
df = pd.read_csv('DataFix.csv', encoding='utf-8')

# ASCII emoticon shape: one of : ; = 8, an optional - o * ', then one closing
# character from the last class.
ASCII_PATTERN = r'[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]'
ascii_regex   = re.compile(ASCII_PATTERN)

ascii_counts = {}
emoji_counts = {}


def _bump(tally, key):
    """Add one to the running count stored under `key` (starting from zero)."""
    tally[key] = tally.get(key, 0) + 1


def _has_other_symbol(cluster):
    """Return True if any code point of `cluster` is in a Unicode category starting with 'So'."""
    return any(unicodedata.category(cp).startswith('So') for cp in cluster)


def _ranked(tally, label):
    """Build a two-column frame (`label`, 'count') ordered by descending count."""
    rows = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    return pd.DataFrame(rows, columns=[label, 'count'])


for row_text in df['text'].astype(str):
    # ASCII emoticons: every non-overlapping regex hit is tallied.
    for hit in ascii_regex.finditer(row_text):
        _bump(ascii_counts, hit.group(0))
    # Emoji: walk extended grapheme clusters (\X) so modifier sequences stay
    # together as a single key.
    for cluster in regex.findall(r'\X', row_text):
        if not _has_other_symbol(cluster):
            continue
        if ascii_regex.fullmatch(cluster):
            continue
        _bump(emoji_counts, cluster)

ascii_df = _ranked(ascii_counts, 'emotikon')
emoji_df = _ranked(emoji_counts, 'emoji')

# Lift every pandas display limit so the full tables are printed.
for option_name in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(option_name, None)

print("Daftar Emotikon ASCII:")
print(ascii_df.to_string(index=False))
print("\nDaftar Emoji Unicode:")
print(emoji_df.to_string(index=False))


Daftar Emotikon ASCII:
emotikon  count
      :/    171
      8)     10
      :(      8
      :)      6
      8/      5
      8:      2
      =P      1
     8')      1

Daftar Emoji Unicode:
   emoji  count
       🔥   2172
       😂    532
       🙌    509
       😍    495
       👏    377
      ❤️    238
       🙏    157
       😢    151
       💛    149
       🤣    110
       😁    105
       😭     91
      🙏🏻     77
       💙     67
       🤩     45
       🥳     43
       😊     41
       👍     41
       🤗     41
       🥹     40
       💪     39
       😅     39
      🙌🏻     30
       🤲     29
       🥰     27
       😎     27
       🥲     26
      🙏🏼     25
       🫡     24
       ✨     23
      ⚡️     23
       😆     20
      🫶🏻     20
       😇     20
      🙌🏼     20
      💪🏻     19
       🤭     19
       ❤     19
       😄     16
       😮     16
       🫣     15
       😌     15
      👍🏻     13
       😀     13
       🎉     12
      💪🏼     12
      🤲🏻     11
       🙈     11
       💨     10
       🧡  

In [2]:
# Working copy of the raw text. Missing values become '' so every row is a
# plain `str` for the replacements below.
df['text_clean'] = df['text'].fillna('').astype(str)

# ASCII emoticon -> Indonesian word(s). The surrounding spaces keep the
# inserted word from fusing with neighbouring text.
emotikon_map = {
    ':/':   ' bingung ',
    '8)':   ' keren ',
    ':(':   ' sedih ',
    ':)':   ' senang ',
    '8/':   ' bingung ringan ',
    '8:':   ' kagum ',
    '=P':   ' julurkan lidah ',   # tongue out: "lidah" = tongue ("bahasa" means language)
    "8')":  ' terharu ',
}


def _emoticon_pattern(emoticon: str) -> str:
    """Return a regex for `emoticon` that skips look-alike text.

    Guards added on top of the escaped literal:
    * an emoticon starting with a digit (the "8" eyes) must not touch another
      digit on either side, so "18)", "8:30" or "8/10" are left alone;
    * an emoticon ending in "/" must not be followed by another "/", so the
      "://" of "http://" and "https://" is left alone.
    """
    pattern = re.escape(emoticon)
    if emoticon[0].isdigit():
        pattern = r'(?<!\d)' + pattern + r'(?!\d)'
    if emoticon.endswith('/'):
        pattern += r'(?!/)'
    return pattern


# Regex pattern -> replacement, in the shape Series.replace(regex=True) expects.
escaped_map = {_emoticon_pattern(k): v for k, v in emotikon_map.items()}
df['text_clean'] = df['text_clean'].replace(escaped_map, regex=True)

def emojify_to_words(s: str) -> str:
    """Replace the emoji in `s` with their English names written as plain words.

    The text is passed through `emoji.demojize`, then every ':' and '_' is
    turned into a space and any skin-tone qualifier is dropped, so all
    skin-tone variants of an emoji collapse to the same words.

    Args:
        s: Input text.

    Returns:
        The text with emoji spelled out. Note that ':' and '_' characters that
        were already present in `s` are replaced by spaces as well.
    """
    w = emoji.demojize(s, language='en')
    w = w.replace(":", " ").replace("_", " ")
    # All five skin-tone modifiers are covered. The hyphenated
    # "medium-light" / "medium-dark" names have no space before "light" /
    # "dark", so they need their own alternatives (listed first).
    w = re.sub(r" (?:medium-light|medium-dark|light|medium|dark) skin tone", "", w)
    return w

# Spell out the emoji of every row, then preview the first 20 rows of the raw
# text next to its cleaned version.
text_with_emoji_words = df['text_clean'].apply(emojify_to_words)
df['text_clean'] = text_with_emoji_words

preview = df.loc[:, ['text', 'text_clean']].head(20)
print(preview)

                                                                                                                                                                                 text  \
0                                                                                                                                                                       wtb HM male 🙏   
1                                                                                                                                                                                  🔥🔥   
2                                                                                                                                                                                  🔥🔥   
3                                                                                                                                                                                  🔥🔥   
4                                                                          

In [3]:
from transformers import MarianMTModel, MarianTokenizer

# Checkpoint identifier shared by the model and its tokenizer.
model_name = 'Helsinki-NLP/opus-mt-en-id'

# Both objects are loaded from the same checkpoint name; the two loads are
# independent of each other.
model = MarianMTModel.from_pretrained(model_name)
tok   = MarianTokenizer.from_pretrained(model_name)



In [4]:
# Translate each distinct string only once; duplicate rows are filled in
# through the lookup table built at the end of the cell.
uniques = df['text_clean'].unique().tolist()

# Blank / whitespace-only strings have nothing to translate, and a generative
# model may still produce text for them, so they bypass the model and are
# kept unchanged.
to_translate = [s for s in uniques if s.strip()]

batch_size = 16
total_batches = (len(to_translate) + batch_size - 1) // batch_size  # ceiling division

translated = []
start_time = time.time()
for batch_start in tqdm(
    range(0, len(to_translate), batch_size),
    desc='Translating',
    unit='batch',
    total=total_batches
):
    chunk = to_translate[batch_start:batch_start + batch_size]
    # Tokenize and generate. Inputs longer than 128 tokens are truncated, so
    # very long texts are only partially translated.
    inputs = tok(chunk, return_tensors='pt', padding=True, truncation=True, max_length=128)
    # Keep the inputs on the same device as the model (a no-op while the model
    # stays on CPU; required if it has been moved to an accelerator).
    inputs = inputs.to(model.device)
    outputs = model.generate(**inputs, max_length=128)
    translated.extend(tok.batch_decode(outputs, skip_special_tokens=True))

elapsed = time.time() - start_time
print(f"\nSelesai menerjemahkan {len(to_translate)} kalimat dalam {elapsed:.1f} detik.")

# zip() would silently drop entries on a length mismatch, so check first.
assert len(translated) == len(to_translate), "translation count does not match input count"

model_output = dict(zip(to_translate, translated))
# `translations` stays aligned with `uniques`; strings that skipped the model
# map to themselves.
translations = [model_output.get(s, s) for s in uniques]
trans_map = dict(zip(uniques, translations))
df['text_id'] = df['text_clean'].map(trans_map)


Translating:   0%|          | 0/302 [00:00<?, ?batch/s]


Selesai menerjemahkan 4827 kalimat dalam 1232.4 detik.


In [8]:
# Side-by-side preview: cleaned source text and its translation (first 20 rows).
preview_columns = ['text_clean', 'text_id']
display(df[preview_columns].head(20))

Unnamed: 0,text_clean,text_id
0,wtb HM male folded hands,wtb HM laki-laki dilipat tangan
1,fire fire,Tembak!
2,fire fire,Tembak!
3,fire fire,Tembak!
4,fire fire,Tembak!
5,fire fire,Tembak!
6,yellow heart,Hati kuning
7,fire fire,Tembak!
8,wtb 10K atau HM,wtb 10K atau HM
9,let's run fire clapping hands,mari kita jalankan api bertepuk tangan


In [9]:
# Frequency table of the translated strings, most common first; the 20 most
# frequent are printed.
freq = (
    df['text_id']
    .value_counts()
    .rename_axis('text_id')
    .reset_index(name='count')
)
top_rows = freq.head(20)
print(top_rows.to_string(index=False))

                                                                                                                           text_id  count
                                                                                                                    Tembakkan api.    106
                                                                                                                           Tembak!     44
                                                                         Smiling face with heart-eyes smiling face with heart-eyes     24
                                                                                                                        hati merah     15
                                                                                                  Angkat tangan mengangkat tangan.     11
                                                                                                     red heart red heart red heart     10
                                  

In [19]:
# Hand-written corrections for translations listed in the frequency table.
# Keys must equal the model output exactly (whole-cell match), so they are
# copied verbatim, including any odd spelling.
_fire_fixes = {
    'Tembakkan api.': 'api api api',
    'Tembak!': 'api api',
}

_heart_eyes_fixes = {
    'Smiling face with heart-eyes':
        'Saya suka',
    'Smiling face with heart-eyes smiling face with heart-eyes':
        'Saya suka saya suka',
    'Smiling face with heart-eyes smiling face with heart-eyes smiling face with heart-eyes':
        'Saya suka saya suka saya suka',
    'Smiling face with heart-eyes smiling face with heart-eyes smiling face with heart-eyes smiling face with heart-eyes':
        'Saya suka saya suka saya suka saya suka',
    'Smiling face with heart-eyes smiling face with heart-eyes smiling face with heart-eyes smiling face with heart-eyes with heart-eys':
        'Saya suka saya suka saya suka saya suka saya suka',
}

manual_map = {**_fire_fixes, **_heart_eyes_fixes}

df['text_id'] = df['text_id'].replace(to_replace=manual_map)

In [20]:
# Recount the translated strings (most common first) and print the top 20.
text_id_counts = df['text_id'].value_counts()
freq = text_id_counts.rename_axis('text_id').reset_index(name='count')
print(freq.head(20).to_string(index=False))

                                                            text_id  count
                                                        api api api    106
                                                            api api     44
                                                Saya suka saya suka     24
                                                         hati merah     15
                                   Angkat tangan mengangkat tangan.     11
                                      red heart red heart red heart     10
                                       Hati kuning hati kuning hati      9
                                      Saya suka saya suka saya suka      9
                       Tepuk tangan bertepuk tangan bertepuk tangan      7
                                                           WTB HM..      6
                  Saya suka saya suka saya suka saya suka saya suka      5
                                                      Angkat tangan      5
                         

In [21]:
# Write the full frame (original columns plus text_clean / text_id) to disk
# without the index column.
output_path = 'DataFix_Translate.csv'
df.to_csv(
    path_or_buf=output_path,
    encoding='utf-8',
    index=False,
)

print(f"File tersimpan sebagai {output_path}")


File tersimpan sebagai DataFix_Translate.csv
