In [1]:
"""
Notebook 1: Data Preparation
Persiapan dan preprocessing data komentar TikTok
"""

import os
import re
import unicodedata
import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')


# ===== LOAD DATA =====
print("\n=== LOADING DATA ===")
# The raw file is semicolon-delimited; rows the parser cannot handle are
# skipped instead of raising.
df = pd.read_csv(
    '../data/raw/Dataset_komentar_Tiktok.csv',
    sep=';',
    encoding='utf-8',
    engine='python',
    on_bad_lines='skip',
)
row_total = df.shape[0]
column_names = list(df.columns)
print(f"Total data: {row_total} baris")
print(f"\nKolom: {column_names}")
print("\nContoh data:")
print(df.head(10))


=== LOADING DATA ===
Total data: 2109 baris

Kolom: ['No', 'komentar', 'label', 'e']

Contoh data:
     No                                           komentar  label  \
0   1.0                               Jule L nya apa Lntee      0   
1   2.0                 disuruh milib oppa malh milih opet      0   
2   3.0                                        jule anj...      0   
3   4.0  hancur banget hatiku ketika merasakan hatinya ...      1   
4   5.0                oon bener kurang apa cobak suami lu      0   
5   6.0  ð¬ð¢ð¥ðšð§ðšð§ð  ð›ðžðð...      0   
6   7.0  dari dulu aja hobi selingkuh yaa sekarang masi...      1   
7   8.0  emng kalau org lagi selingkuh itu buta segala2...      0   
8   9.0     di kasi diamond malah milih batu bata ðŸ˜‚ðŸ˜‚      0   
9  10.0                         jule ga kasian sm suami ya      1   

                                  e  
0  https://vt.tiktok.com/ZSyftmuPr/  
1                               NaN  
2                         

In [2]:
# Keep only the comment text and label columns.
print("\n=== FILTERING COLUMNS ===")

df_filtered = df[['komentar', 'label']]
print("\nData setelah difilter (kolom komentar dan label):")
print(df_filtered.head(10))

# ===== SAVE FILTERED DATA =====
output_dir = '../data/raw'
# exist_ok=True creates the folder only when it is missing, without a
# separate existence check that could race with another process.
os.makedirs(output_dir, exist_ok=True)
df_filtered.to_csv(os.path.join(output_dir, 'filtered_data_comments.csv'), index=False)


=== FILTERING COLUMNS ===

Data setelah difilter (kolom komentar dan label):
                                            komentar  label
0                               Jule L nya apa Lntee      0
1                 disuruh milib oppa malh milih opet      0
2                                        jule anj...      0
3  hancur banget hatiku ketika merasakan hatinya ...      1
4                oon bener kurang apa cobak suami lu      0
5  ð¬ð¢ð¥ðšð§ðšð§ð  ð›ðžðð...      0
6  dari dulu aja hobi selingkuh yaa sekarang masi...      1
7  emng kalau org lagi selingkuh itu buta segala2...      0
8     di kasi diamond malah milih batu bata ðŸ˜‚ðŸ˜‚      0
9                         jule ga kasian sm suami ya      1


In [3]:
# Rename the columns: 'komentar' -> 'comment', 'label' -> 'sentiment'.
df_filtered = df_filtered.rename(columns={'komentar': 'comment', 'label': 'sentiment'})

# Recode the sentiment values: 0 becomes -1, 1 stays 1.
df_filtered['sentiment'] = df_filtered['sentiment'].replace({0: -1, 1: 1})

print("Dataset dengan Comment dan Sentiment:\n")
print(df_filtered)
print("\n" + "="*50)
print(f"Total data: {len(df_filtered)}")

# ===== SAVE FILTERED DATA =====
output_dir = '../data/raw'
# exist_ok=True creates the folder only when it is missing, without a
# separate existence check that could race with another process.
os.makedirs(output_dir, exist_ok=True)
df_filtered.to_csv(os.path.join(output_dir, 'filtered_data_comments_sentiment.csv'), index=False)

Dataset dengan Comment dan Sentiment:

                                                comment  sentiment
0                                  Jule L nya apa Lntee         -1
1                    disuruh milib oppa malh milih opet         -1
2                                           jule anj...         -1
3     hancur banget hatiku ketika merasakan hatinya ...          1
4                   oon bener kurang apa cobak suami lu         -1
...                                                 ...        ...
2104  "aku memutuskan untuk menjadi fans isyana. gil...          1
2105  "AMZING ISYANAA!! Jujur aku amazed banget deng...          1
2106  "paling ngiri liat orang keren maen alat musik...          1
2107  "Sampe ga bisa berkata2 lagi buat isyana, sang...          1
2108  "Inimah bukan main alat musik lagi. Olahraga j...          1

[2109 rows x 2 columns]

Total data: 2109


In [4]:


# ===== DATA EXPLORATION =====
print("\n=== EXPLORASI DATA ===")

# Class balance, first as raw counts and then as percentages.
sentiment_col = df_filtered['sentiment']
print("\nDistribusi Label:")
print(sentiment_col.value_counts())
print("\nPersentase:")
print(sentiment_col.value_counts(normalize=True) * 100)

# Character length of each raw comment, stored as a new column.
print("\nStatistik Panjang Komentar:")
df_filtered['comment_length'] = df_filtered['comment'].str.len()
print(df_filtered['comment_length'].describe())

# Number of null entries per column.
print("\nCek missing values:")
print(df_filtered.isna().sum())


=== EXPLORASI DATA ===

Distribusi Label:
sentiment
-1    1058
 1    1051
Name: count, dtype: int64

Persentase:
sentiment
-1    50.165955
 1    49.834045
Name: proportion, dtype: float64

Statistik Panjang Komentar:
count    2109.000000
mean       61.277383
std        48.448826
min         3.000000
25%        33.000000
50%        48.000000
75%        75.000000
max       581.000000
Name: comment_length, dtype: float64

Cek missing values:
comment           0
sentiment         0
comment_length    0
dtype: int64


In [5]:
# ===== TEXT PREPROCESSING =====
print("\n=== PREPROCESSING TEXT ===")

def clean_text(text):
    """Normalize a raw comment into lowercase ASCII words separated by single spaces.

    Steps, in order: Unicode NFKC folding, lowercasing, URL removal,
    mention/hashtag removal, replacement of every remaining character that
    is not an ASCII letter, digit or whitespace with a space, and whitespace
    collapsing.

    Args:
        text: The raw comment. Missing values (None/NaN) are accepted, and
            non-string values are converted with ``str()``.

    Returns:
        The cleaned string; ``""`` when the input is missing or nothing
        survives the cleaning.
    """
    if pd.isna(text):
        return ""

    # A cell that parsed as a number would otherwise crash on .lower().
    text = str(text)

    # NFKC maps compatibility characters (stylized "font" letters such as
    # mathematical bold, full-width forms, ...) to their plain equivalents.
    # Without this the ASCII-only filter below erases such comments entirely.
    text = unicodedata.normalize('NFKC', text)

    # Lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove mentions and hashtags (optional - they may carry information)
    text = re.sub(r'@\w+|#\w+', '', text)

    # Replace everything except ASCII letters, digits and whitespace with a
    # space (this also drops emoji and punctuation).
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

    # Collapse repeated whitespace and trim both ends
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Clean every comment and keep the result next to the original text.
df_filtered['cleaned_comment'] = df_filtered['comment'].apply(clean_text)

print("Contoh hasil cleaning:")
# Show up to five before/after pairs; head(5) keeps this from raising
# IndexError when the frame has fewer than five rows.
preview = df_filtered.head(5)
for raw_comment, cleaned_comment in zip(preview['comment'], preview['cleaned_comment']):
    print(f"\nOriginal: {raw_comment}")
    print(f"Cleaned:  {cleaned_comment}")


=== PREPROCESSING TEXT ===
Contoh hasil cleaning:

Original: Jule L nya apa Lntee
Cleaned:  jule l nya apa lntee

Original: disuruh milib oppa malh milih opet
Cleaned:  disuruh milib oppa malh milih opet

Original: jule anj...
Cleaned:  jule anj

Original: hancur banget hatiku ketika merasakan hatinya daehoon ia merawat anak sendirian tanpa seorang istriðŸ¥ºðŸ˜©ðŸ˜¢
Cleaned:  hancur banget hatiku ketika merasakan hatinya daehoon ia merawat anak sendirian tanpa seorang istri

Original: oon bener kurang apa cobak suami lu
Cleaned:  oon bener kurang apa cobak suami lu


In [6]:
# ===== LABEL ENCODING =====
print("\n=== ENCODING LABEL ===")
# Binary target: sentiment -1 (cyberbullying) maps to 1, every other value
# (here 1, non-cyberbullying) maps to 0.
df_filtered['label'] = df_filtered['sentiment'].eq(-1).astype('int64')
print("\nLabel mapping:")
print("- Label 0: Non-Cyberbullying (Original: 1)")
print("- Label 1: Cyberbullying (Original: -1)")
print("\nDistribusi label baru:")
print(df_filtered['label'].value_counts())

# ===== SPLIT DATA =====
print("\n=== SPLITTING DATA ===")
# First hold out 30% of the rows, then halve that hold-out into validation
# and test. Both splits are stratified on the label and use the same seed.
train_df, temp_df = train_test_split(
    df_filtered,
    test_size=0.3,
    stratify=df_filtered['label'],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['label'],
    random_state=42,
)

# Size of each split, absolute and relative to the full frame.
total_rows = len(df_filtered)
for split_name, split_frame in (
    ("Train set:", train_df),
    ("Val set:  ", val_df),
    ("Test set: ", test_df),
):
    print(f"{split_name} {len(split_frame)} samples ({len(split_frame)/total_rows*100:.1f}%)")

# Label counts inside each split.
print("\nDistribusi label di setiap split:")
for split_tag, split_frame in (
    ("Train:", train_df),
    ("Val:  ", val_df),
    ("Test: ", test_df),
):
    print(split_tag, split_frame['label'].value_counts().to_dict())


=== ENCODING LABEL ===

Label mapping:
- Label 0: Non-Cyberbullying (Original: 1)
- Label 1: Cyberbullying (Original: -1)

Distribusi label baru:
label
1    1058
0    1051
Name: count, dtype: int64

=== SPLITTING DATA ===
Train set: 1476 samples (70.0%)
Val set:   316 samples (15.0%)
Test set:  317 samples (15.0%)

Distribusi label di setiap split:
Train: {1: 740, 0: 736}
Val:   {1: 159, 0: 157}
Test:  {1: 159, 0: 158}


In [7]:
# Show the first ten rows of the working frame as a final check.
print("\nData preparation selesai.")
preview_rows = df_filtered.head(10)
print(preview_rows)


Data preparation selesai.
                                             comment  sentiment  \
0                               Jule L nya apa Lntee         -1   
1                 disuruh milib oppa malh milih opet         -1   
2                                        jule anj...         -1   
3  hancur banget hatiku ketika merasakan hatinya ...          1   
4                oon bener kurang apa cobak suami lu         -1   
5  ð¬ð¢ð¥ðšð§ðšð§ð  ð›ðžðð...         -1   
6  dari dulu aja hobi selingkuh yaa sekarang masi...          1   
7  emng kalau org lagi selingkuh itu buta segala2...         -1   
8     di kasi diamond malah milih batu bata ðŸ˜‚ðŸ˜‚         -1   
9                         jule ga kasian sm suami ya          1   

   comment_length                                    cleaned_comment  label  
0              20                               jule l nya apa lntee      1  
1              34                 disuruh milib oppa malh milih opet      1  
2

In [8]:
# Keep only the cleaned text and the binary label.
data_filter = df_filtered.loc[:, ['cleaned_comment', 'label']]
print("\nData setelah difilter (kolom komentar dan label):")
print(data_filter.head(10))


Data setelah difilter (kolom komentar dan label):
                                     cleaned_comment  label
0                               jule l nya apa lntee      1
1                 disuruh milib oppa malh milih opet      1
2                                           jule anj      1
3  hancur banget hatiku ketika merasakan hatinya ...      0
4                oon bener kurang apa cobak suami lu      1
5                                                         1
6  dari dulu aja hobi selingkuh yaa sekarang masi...      0
7  emng kalau org lagi selingkuh itu buta segala2...      1
8              di kasi diamond malah milih batu bata      1
9                         jule ga kasian sm suami ya      0


In [9]:
# ===== SAVE PROCESSED DATA =====
print("\n=== MENYIMPAN DATA ===")
processed_dir = '../data/processed'
# to_csv does not create missing folders, so make sure the target exists;
# otherwise this cell fails on a checkout where the folder was never created.
os.makedirs(processed_dir, exist_ok=True)

# The three splits plus the full cleaned dataset, all without the index column.
train_df.to_csv(os.path.join(processed_dir, 'train.csv'), index=False)
val_df.to_csv(os.path.join(processed_dir, 'val.csv'), index=False)
test_df.to_csv(os.path.join(processed_dir, 'test.csv'), index=False)
data_filter.to_csv(os.path.join(processed_dir, 'full_data.csv'), index=False)

print("✓ Data berhasil disimpan di folder ../data/processed/")
print("\n=== PERSIAPAN DATA SELESAI ===")
print("Lanjutkan ke notebook 02_exploratory_analysis.ipynb")


=== MENYIMPAN DATA ===
✓ Data berhasil disimpan di folder ../data/processed/

=== PERSIAPAN DATA SELESAI ===
Lanjutkan ke notebook 02_exploratory_analysis.ipynb
