# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from preprocessing import cleaningText, casefoldingText, tokenizingText, filteringText, lemmatizerText, fix_slangwords, remove_emoji, toSentence, decontracted, is_latin
pd.options.mode.chained_assignment = None  # Menonaktifkan peringatan chaining
from tqdm import tqdm
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


# Data Loading

In [2]:
# Load the YouTube comments CSV (one row per comment, with a `language` column)
# and display the resulting frame.
df = pd.read_csv('yt_comments_with_language.csv')
df

Unnamed: 0,song_id,song_name,video_id,author,message,published_at,like_count,language
0,734953,,sV2H712ldOI,@SAWTOWNE,‚ú®STREAMING‚ú®\nSpotify: https://open.spotify.com...,2025-01-21T23:53:50Z,6790,en
1,734953,,sV2H712ldOI,@STARGAZER_NEW55,1:38 my hypers3xuality frfr\n\n(Btw I‚Äôm Christ...,2025-06-03T20:10:18Z,0,en
2,734953,,sV2H712ldOI,@Pjskcrkandforsakenfan,"‚ÄúKIERAN IS YOUR AVERAGE TEENAGE VAMPIRE, BUT O...",2025-06-03T20:01:31Z,0,en
3,734953,,sV2H712ldOI,@Al333.w,"2:24 HIYAMA, MY BOY, GET YOUR 4SS OUT OF THERE...",2025-06-03T16:12:22Z,0,en
4,734953,,sV2H712ldOI,@MILOAIS14-w7e,Kuy: help!!! My mine oly is in the üò≠üò≠üò≠üò≠üò≠üò≠üò≠,2025-06-03T13:58:45Z,0,en
...,...,...,...,...,...,...,...,...
12437,741720,,3d_kACCKFRc,@jimakiad,P1 Teto!,2025-02-20T22:59:24Z,0,pt
12438,741720,,3d_kACCKFRc,@„ÇÇ„ÇÇ„ÅÑ„Çç„Åø„Åã„Çì,„É¶ÔºÅ„Åå„ÅØ„ÇÑ„ÇÅ„ÅßÂ¨â„Åó„ÅÑÔºÅ,2025-02-20T22:47:19Z,0,ja
12439,741720,,3d_kACCKFRc,@G_taren,È†≠„ÅÆ‰∏≠„Å´„ÅÇ„ÇãÂêçÂâç„ÇíÂøò„Çå„Åü„Ç≥„É≥„ÉÜ„É≥„ÉÑ„ÇíÈõÜ„ÇÅ„Å¶Âàá„ÇäË≤º„Çä„Åï„Çå„Å¶„Çã„Çà„ÅÜ„Å™Ê∞óÂàÜ,2025-02-20T22:45:19Z,1,ja
12440,741720,,3d_kACCKFRc,@U„ÅÇ„Çì„Åì„Éã„Ç≠,„Éà„É≥„Éà„É≥„Åó„Å¶„Çã„É°„É≠„Éá„Ç£„ÅßÂ•Ω„Åç,2025-02-20T22:34:01Z,0,ja


In [3]:
# Show column dtypes and non-null counts to spot empty or partially missing columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12442 entries, 0 to 12441
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   song_id       12442 non-null  int64  
 1   song_name     0 non-null      float64
 2   video_id      12442 non-null  object 
 3   author        12441 non-null  object 
 4   message       12438 non-null  object 
 5   published_at  12442 non-null  object 
 6   like_count    12442 non-null  int64  
 7   language      12442 non-null  object 
dtypes: float64(1), int64(2), object(5)
memory usage: 777.8+ KB


In [4]:
# Split the comments by language code. Each subset is an explicit copy so that
# the in-place edits made to `df_en` / `df_ja` in later cells never operate on a
# slice of `df` (which is what triggers pandas' chained-assignment warning).
df_en = df[df['language'] == 'en'].copy()
df_ja = df[df['language'] == 'ja'].copy()

print(df_en)
print(df_ja)

       song_id  song_name     video_id                  author  \
0       734953        NaN  sV2H712ldOI               @SAWTOWNE   
1       734953        NaN  sV2H712ldOI        @STARGAZER_NEW55   
2       734953        NaN  sV2H712ldOI  @Pjskcrkandforsakenfan   
3       734953        NaN  sV2H712ldOI                @Al333.w   
4       734953        NaN  sV2H712ldOI          @MILOAIS14-w7e   
...        ...        ...          ...                     ...   
12423   741720        NaN  3d_kACCKFRc            @TrueCrouton   
12424   741720        NaN  3d_kACCKFRc       @Cringemoment4045   
12427   741720        NaN  3d_kACCKFRc               @MacNZ-_-   
12433   741720        NaN  3d_kACCKFRc             @A_Stella_R   
12436   741720        NaN  3d_kACCKFRc          @scarletdevils   

                                                 message  \
0      ‚ú®STREAMING‚ú®\nSpotify: https://open.spotify.com...   
1      1:38 my hypers3xuality frfr\n\n(Btw I‚Äôm Christ...   
2      ‚ÄúKIERAN IS Y

# Data Cleaning

In [5]:
# Keep only the comment text: every column listed here is removed from both subsets.
drop_column = ['song_id', 'song_name', 'video_id', 'author', 'published_at', 'like_count', 'language']

# Use the non-inplace form and rebind the name. `df_en` / `df_ja` were created by
# boolean indexing, and dropping in place on such a slice is exactly the pattern
# the chained-assignment warning exists for.
df_en = df_en.drop(columns=drop_column)
print(df_en.head())
df_ja = df_ja.drop(columns=drop_column)
print(df_ja.head())

                                             message
0  ‚ú®STREAMING‚ú®\nSpotify: https://open.spotify.com...
1  1:38 my hypers3xuality frfr\n\n(Btw I‚Äôm Christ...
2  ‚ÄúKIERAN IS YOUR AVERAGE TEENAGE VAMPIRE, BUT O...
3  2:24 HIYAMA, MY BOY, GET YOUR 4SS OUT OF THERE...
4         Kuy: help!!! My mine oly is in the üò≠üò≠üò≠üò≠üò≠üò≠üò≠
                               message
6     2:00 „Åì„Åì„Åã„Çâ„ÅÆ„Ç§„É≥„Çø„Éº„Éç„ÉÉ„Éà„ÇÑ„ÇÅ„Çç(„Éã„Éá„Ç£„Ç¨)ÊÑü„Åæ„Åò„Åß„Åô„Åç
10                    ÁßÅ„ÇÇËÖêÂ•≥Â≠ê„Å†„Åã„ÇâÂÖ±ÊÑü(?)„Åô„ÇãüòÖ
40    ËÖêÂ•≥Â≠ê„Å®„ÅãBL„ÅÆÊñáÂåñ„Å£„Å¶Êó•Êú¨Áã¨Ëá™„Åã„Å®ÊÄù„Å£„Å¶„Åü„Åë„Å©‰∏ñÁïåÂÖ±ÈÄö„Å™„Çì„Å†„Å™
80  „Éû„Ç∏„ÅßÈÅï„Å£„Å¶„Åü„ÇâÂúü‰∏ãÂ∫ß„Åó„Åæ„Åô„Åô„Åø„Åæ„Åõ„Çì\nÊù±Êñπ„ÅÆÊõ≤„Å´ËÅû„Åì„Åà„Å°„Åæ„ÅÜ„ÄÅ„ÄÅ
83               GTA 6„ÅÆÂâç„Å´ÂàùÈü≥„Éü„ÇØ„Åå„ÇÑ„Åä„ÅÑ„Å´„Éè„Éû„Å£„Åü


In [6]:
# Remove duplicate rows from the English and the Japanese subsets.
df_en, df_ja = df_en.drop_duplicates(), df_ja.drop_duplicates()

# Record the row and column counts of each subset after de-duplication.
(jumlah_ulasan_setelah_hapus_duplikat_en,
 jumlah_kolom_setelah_hapus_duplikat_en) = df_en.shape
(jumlah_ulasan_setelah_hapus_duplikat_ja,
 jumlah_kolom_setelah_hapus_duplikat_ja) = df_ja.shape

print(df_en.shape, df_ja.shape, sep="\n")

(4227, 1)
(5305, 1)


# Translating Japanese to English

Uses JParaCrawl V3 (machine translation) to translate the Japanese comments into English.

In [7]:
import ctranslate2
import sentencepiece as spm
import pandas as pd
from tqdm import tqdm
from text_cleaning_ja import clean_text

import os

# Directory of the model snapshot (as produced by snapshot_download).
# The location can be overridden with the JPARACRAWL_MODEL_DIR environment
# variable so the notebook is not tied to one user's machine; when the variable
# is unset, the original local cache path is used.
model_dir = os.environ.get(
    "JPARACRAWL_MODEL_DIR",
    "C:/Users/azelf/.cache/huggingface/hub/models--JustFrederik--jparacrawl-v3-big-ct2/snapshots/d33cf208314c7fd57eeaaa49d8ab6ee26e2ec565",
)

# Load the translation model (Japanese -> English direction) on CPU with int8 computation.
translator = ctranslate2.Translator(f"{model_dir}/ja-en", device="cpu", compute_type="int8")

# Load the SentencePiece models: one for the Japanese source, one for the English target.
sp_source = spm.SentencePieceProcessor(model_file=f"{model_dir}/spm.ja.nopretok.model")
sp_target = spm.SentencePieceProcessor(model_file=f"{model_dir}/spm.en.nopretok.model")

def translate_text(text):
    """Translate a single Japanese text into English.

    The text is split into SentencePiece pieces with ``sp_source``, translated
    as a one-item batch by ``translator``, and the first hypothesis is decoded
    back to a string with ``sp_target``.

    Returns:
        The translated string, or ``None`` if any step raised an exception
        (the error is printed and otherwise swallowed so a long batch run is
        not aborted by one bad row).
    """
    try:
        source_pieces = sp_source.encode(text, out_type=str)
        best_hypothesis = translator.translate_batch([source_pieces])[0].hypotheses[0]
        return sp_target.decode(best_hypothesis)
    except Exception as err:
        print(f"Error translating: {err}")
        return None

# Register tqdm's `progress_apply` on pandas objects to get progress bars.
tqdm.pandas()

# Clean the Japanese text before translating it.
# Work on explicit copies: assigning new columns to a frame obtained by
# filtering is the chained-assignment pattern pandas warns about.
df_ja = df_ja.copy()
df_ja["text_cleaned"] = df_ja["message"].progress_apply(lambda x: clean_text(str(x), twitter=True, han2zen=True))
# Keep only comments whose cleaned text is shorter than 500 characters.
df_ja = df_ja[df_ja["text_cleaned"].str.len() < 500].copy()

# Run the translation row by row (translate_text returns None for rows that fail).
df_ja["text_en"] = df_ja["text_cleaned"].progress_apply(translate_text)

# Save the result.
df_ja.to_csv("translated_ja_to_en.csv", index=False)
print("‚úÖ Translasi selesai! File disimpan sebagai translated_ja_to_en.csv")


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5305/5305 [00:00<00:00, 8864.66it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5286/5286 [20:42<00:00,  4.25it/s]  

‚úÖ Translasi selesai! File disimpan sebagai translated_ja_to_en.csv





# Processing Text

In [None]:
import re

def is_latin(text):
    """Tell whether ``text`` is a string made only of ASCII characters and whitespace.

    Non-string values give ``False``. The text is stripped first, so an empty
    or whitespace-only string also gives ``False`` (the pattern needs at least
    one character).

    Note: this definition replaces the ``is_latin`` imported from
    ``preprocessing`` at the top of the notebook.
    """
    if isinstance(text, str):
        stripped = text.strip()
        return re.match(r'^[\x00-\x7F\s]+$', stripped) is not None
    return False

def check_and_return(text):
    """Return ``text`` unchanged if it is non-empty and passes ``is_latin``; otherwise ``None``."""
    return text if text and is_latin(text) else None

def full_preprocessing_pipeline(df, text_column='message'):
    """Run every text-preprocessing step and return a new DataFrame.

    Each step stores its result in its own column, so the returned frame keeps
    the whole lineage: ``text_clean`` -> ``text_noemoji`` ->
    ``text_casefoldingText`` -> ``text_decontracted`` -> ``text_slangwords`` ->
    ``text_lemmatized`` -> ``text_tokenizingText`` -> ``text_stopword`` ->
    ``text_akhir`` (the final sentence).

    The individual step functions are imported from the ``preprocessing``
    module; their exact behaviour is defined there.

    Args:
        df: DataFrame holding the raw text. It is not modified.
        text_column: Name of the column with the raw text (string values).

    Returns:
        A copy of the surviving rows with the intermediate and final columns
        added. Rows are removed when the raw text is longer than 500
        characters (rows with missing text fail that comparison and are
        removed too), when a step yields a missing value, or when no tokens
        remain after stopword filtering.
    """
    # Explicit copies after every row filter: new columns are then always
    # assigned to a frame that owns its data, not to a slice of another frame.
    df = df[df[text_column].str.len() <= 500].copy()

    # 1. Cleaning
    df['text_clean'] = df[text_column].astype(str).apply(cleaningText)

    # 2. Remove emoji, then drop rows for which the result is missing
    df['text_noemoji'] = df['text_clean'].apply(remove_emoji)
    df = df.dropna(subset=['text_noemoji']).copy()

    # 3. Case folding
    df['text_casefoldingText'] = df['text_noemoji'].apply(casefoldingText)

    # 4. Expand contractions
    df['text_decontracted'] = df['text_casefoldingText'].apply(decontracted)

    # 5. Fix slang words, then drop rows for which the result is missing
    df['text_slangwords'] = df['text_decontracted'].apply(fix_slangwords)
    df = df.dropna(subset=['text_slangwords']).copy()

    # 6. Lemmatization
    df['text_lemmatized'] = df['text_slangwords'].apply(lemmatizerText)

    # 7. Tokenizing
    df['text_tokenizingText'] = df['text_lemmatized'].apply(tokenizingText)

    # 8. Filtering stopwords
    df['text_stopword'] = df['text_tokenizingText'].apply(filteringText)

    # 9. Drop rows whose token list is empty
    df = df[df['text_stopword'].apply(lambda x: len(x) > 0)].copy()

    # 10. Final sentence
    df['text_akhir'] = df['text_stopword'].apply(toSentence)

    return df

In [8]:
# Reload the translated Japanese comments from the CSV written by the translation cell.
df_ja = pd.read_csv("translated_ja_to_en.csv")

In [9]:
# Stack the original English comments and the English translations of the
# Japanese comments into a single 'message' column with a fresh 0..n-1 index.
df_ja_en = pd.concat(
    [df_en['message'], df_ja['text_en']],
    ignore_index=True,
).to_frame(name='message')
df_ja_en.head()

Unnamed: 0,message
0,‚ú®STREAMING‚ú®\nSpotify: https://open.spotify.com...
1,1:38 my hypers3xuality frfr\n\n(Btw I‚Äôm Christ...
2,"‚ÄúKIERAN IS YOUR AVERAGE TEENAGE VAMPIRE, BUT O..."
3,"2:24 HIYAMA, MY BOY, GET YOUR 4SS OUT OF THERE..."
4,Kuy: help!!! My mine oly is in the üò≠üò≠üò≠üò≠üò≠üò≠üò≠


In [10]:
# Display the combined frame (English comments followed by the translations).
df_ja_en

Unnamed: 0,message
0,‚ú®STREAMING‚ú®\nSpotify: https://open.spotify.com...
1,1:38 my hypers3xuality frfr\n\n(Btw I‚Äôm Christ...
2,"‚ÄúKIERAN IS YOUR AVERAGE TEENAGE VAMPIRE, BUT O..."
3,"2:24 HIYAMA, MY BOY, GET YOUR 4SS OUT OF THERE..."
4,Kuy: help!!! My mine oly is in the üò≠üò≠üò≠üò≠üò≠üò≠üò≠
...,...
9508,I did it!
9509,I'm happy because I don't see it.
9510,I'm glad that Yu! stopped!
9511,I feel like I'm collecting and cutting and pas...


In [11]:
# Run the full preprocessing pipeline on the combined 'message' column.
df_clean = full_preprocessing_pipeline(df_ja_en, text_column='message')

In [12]:
# Display the preprocessed frame, including every intermediate column.
df_clean

Unnamed: 0,message,text_clean,text_noemoji,text_casefoldingText,text_decontracted,text_slangwords,text_lemmatized,text_tokenizingText,text_stopword,text_akhir
0,‚ú®STREAMING‚ú®\nSpotify: https://open.spotify.com...,STREAMING Spotify \r Apple Music,STREAMING Spotify \r Apple Music,streaming spotify \r apple music,streaming spotify \r apple music,streaming spotify apple music,streaming spotify apple music,"[streaming, spotify, apple, music]","[streaming, spotify, apple, music]",streaming spotify apple music
1,1:38 my hypers3xuality frfr\n\n(Btw I‚Äôm Christ...,my hypersxuality frfr Btw Im Christian so thi...,my hypersxuality frfr Btw Im Christian so thi...,my hypersxuality frfr btw im christian so thi...,my hypersxuality frfr by the way i am christi...,my hypersxuality frfr by the way i am christia...,my hypersxuality frfr by the way i am christia...,"[my, hypersxuality, frfr, by, the, way, i, am,...","[hypersxuality, frfr, way, christian, signific...",hypersxuality frfr way christian significant s...
2,"‚ÄúKIERAN IS YOUR AVERAGE TEENAGE VAMPIRE, BUT O...",KIERAN IS YOUR AVERAGE TEENAGE VAMPIRE BUT ONE...,KIERAN IS YOUR AVERAGE TEENAGE VAMPIRE BUT ONE...,kieran is your average teenage vampire but one...,kieran is your average teenage vampire but one...,kieran is your average teenage vampire but one...,kieran is your average teenage vampire but one...,"[kieran, is, your, average, teenage, vampire, ...","[kieran, average, teenage, vampire, one, day, ...",kieran average teenage vampire one day get stu...
3,"2:24 HIYAMA, MY BOY, GET YOUR 4SS OUT OF THERE...",HIYAMA MY BOY GET YOUR SS OUT OF THERE,HIYAMA MY BOY GET YOUR SS OUT OF THERE,hiyama my boy get your ss out of there,hiyama my boy get your ss out of there,hiyama my boy get your so sorry out of there,hiyama my boy get your so sorry out of there,"[hiyama, my, boy, get, your, so, sorry, out, o...","[hiyama, boy, get, sorry]",hiyama boy get sorry
4,Kuy: help!!! My mine oly is in the üò≠üò≠üò≠üò≠üò≠üò≠üò≠,Kuy help My mine oly is in the,Kuy help My mine oly is in the,kuy help my mine oly is in the,kuy help my mine oly is in the,kuy help my mine oly is in the,kuy help my mine oly is in the,"[kuy, help, my, mine, oly, is, in, the]","[kuy, help, mine, oly]",kuy help mine oly
...,...,...,...,...,...,...,...,...,...,...
9507,"In the song and music video, it feels like it'...",In the song and music video it feels like its ...,In the song and music video it feels like its ...,in the song and music video it feels like its ...,in the song and music video it feels like its ...,in the song and music video it feels like its ...,in the song and music video it feel like it hard,"[in, the, song, and, music, video, it, feel, l...","[song, music, video, feel, like, hard]",song music video feel like hard
9509,I'm happy because I don't see it.,Im happy because I dont see it,Im happy because I dont see it,im happy because i dont see it,i am happy because i do not see it,i am happy because i do not see it,i am happy because i do not see it,"[i, am, happy, because, i, do, not, see, it]","[happy, see]",happy see
9510,I'm glad that Yu! stopped!,Im glad that Yu stopped,Im glad that Yu stopped,im glad that yu stopped,i am glad that yu stopped,i am glad that yu stopped,i am glad that yu stopped,"[i, am, glad, that, yu, stopped]","[glad, yu, stopped]",glad yu stopped
9511,I feel like I'm collecting and cutting and pas...,I feel like Im collecting and cutting and past...,I feel like Im collecting and cutting and past...,i feel like im collecting and cutting and past...,i feel like i am collecting and cutting and pa...,i feel like i am collecting and cutting and pa...,i feel like i am collecting and cutting and pa...,"[i, feel, like, i, am, collecting, and, cuttin...","[feel, like, collecting, cutting, pasting, con...",feel like collecting cutting pasting content f...


In [13]:
# Save the preprocessed comments (without the index) to processed.csv.
df_clean.to_csv('processed.csv', index=False)

# Data Labelling

In [14]:
import requests


def _load_lexicon(url, label, timeout=30):
    """Download a one-word-per-line lexicon and return its words as a set.

    Lines starting with ';' (comments) and blank lines are skipped; every
    other line is stripped of surrounding whitespace.

    Args:
        url: Address of the raw text file.
        label: Word used in the failure message ('positive' / 'negative').
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Raises:
        RuntimeError: If the server does not answer with HTTP 200. Failing
            here is deliberate: without the lexicon the labelling cells would
            otherwise stop later with a NameError that hides the real cause.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to fetch English {label} lexicon")
        raise RuntimeError(
            f"Could not download the {label} lexicon (HTTP {response.status_code}) from {url}"
        )
    return {
        line.strip()
        for line in response.text.splitlines()
        if not line.startswith(";") and line.strip()
    }


# Load English positive words
lexicon_positive = _load_lexicon(
    'https://gist.githubusercontent.com/mkulakowski2/4289437/raw/positive-words.txt',
    'positive',
)

# Load English negative words
lexicon_negative = _load_lexicon(
    'https://gist.githubusercontent.com/mkulakowski2/4289441/raw/negative-words.txt',
    'negative',
)

In [15]:
def sentiment_analysis_lexicon(text, lexicon_pos=None, lexicon_neg=None, neutral_label=None):
    """Score a tokenised text against a positive and a negative word list.

    Every token found in the positive lexicon adds 1 to the score and every
    token found in the negative lexicon subtracts 1 (repeated tokens count
    each time; a token present in both lexicons cancels out).

    Args:
        text: Iterable of tokens (e.g. the list stored in ``text_stopword``).
        lexicon_pos: Container of positive words. Defaults to the notebook's
            global ``lexicon_positive``.
        lexicon_neg: Container of negative words. Defaults to the notebook's
            global ``lexicon_negative``.
        neutral_label: Label to use when the score is exactly 0. With the
            default ``None`` a zero score is labelled ``'positive'``, which
            keeps the original two-class behaviour; pass e.g. ``'neutral'``
            to obtain a three-class labelling.

    Returns:
        tuple: ``(score, polarity)`` where ``score`` is an int and
        ``polarity`` is ``'positive'``, ``'negative'`` or ``neutral_label``.
    """
    # Fall back to the module-level lexicons only when none are supplied, so
    # the function can also be used (and tested) without the download cell.
    if lexicon_pos is None:
        lexicon_pos = lexicon_positive
    if lexicon_neg is None:
        lexicon_neg = lexicon_negative

    score = 0
    for word in text:
        if word in lexicon_pos:
            score += 1  # +1 for each positive word
        if word in lexicon_neg:
            score -= 1  # -1 for each negative word

    # The three cases are mutually exclusive; the original `>= 0` / `<= 0`
    # pair overlapped at 0 and silently resolved the tie to 'positive'.
    if score > 0:
        polarity = 'positive'
    elif score < 0:
        polarity = 'negative'
    else:
        polarity = 'positive' if neutral_label is None else neutral_label

    return score, polarity

In [16]:
# Score every token list, then transpose the (score, polarity) pairs into two
# parallel tuples: results[0] holds the scores, results[1] the labels.
results = list(zip(*df_clean['text_stopword'].apply(sentiment_analysis_lexicon)))
df_clean['polarity_score'], df_clean['polarity'] = results[0], results[1]
print(df_clean['polarity'].value_counts())

polarity
positive    7732
negative    1617
Name: count, dtype: int64


In [17]:
# Count each (score, label) combination to see how every score value was labelled.
print(df_clean[['polarity_score','polarity']].value_counts())

polarity_score  polarity
 0              positive    3402
 1              positive    2558
-1              negative    1143
 2              positive    1136
 3              positive     373
-2              negative     314
 4              positive     150
-3              negative     105
 5              positive      63
-4              negative      38
 6              positive      32
-5              negative       9
 7              positive       8
-6              negative       4
 8              positive       3
 9              positive       3
-9              negative       2
 11             positive       1
 15             positive       1
-30             negative       1
 10             positive       1
-10             negative       1
 21             positive       1
Name: count, dtype: int64


In [18]:
# Save the labelled result as a CSV file (without the index).
df_clean.to_csv("data labeled.csv", index=False)