# 1. Configuration


## 1.1 Import Library

In [2]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import re
import html
import string
import unicodedata
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split
from typing import Union, Tuple, List, Dict, Any

## 1.2 Define Functions

### 1.2.1 Load Data

In [3]:
def load_csv_data(file_path: str, text_column: str, label_column: str,
                  encoding: str = 'utf-8') -> pd.DataFrame:
    """
    Read a CSV file and keep only rows with both a text and a label value.

    A short summary (row count and per-class counts) is printed on success.

    Args:
        file_path: Path to the CSV file.
        text_column: Name of the column holding the text.
        label_column: Name of the column holding the label/class.
        encoding: File encoding passed to ``pd.read_csv`` (default: utf-8).

    Returns:
        The loaded DataFrame with rows missing ``text_column`` or
        ``label_column`` dropped. On any failure (unreadable file, missing
        column, ...) the error is printed and ``None`` is returned instead of
        raising.
    """
    try:
        frame = pd.read_csv(file_path, encoding=encoding)

        # Both columns must exist; the text column is checked first.
        for required in (text_column, label_column):
            if required not in frame.columns:
                raise ValueError(f"Kolom '{required}' tidak ditemukan")

        # Discard rows where either the text or the label is missing.
        frame = frame.dropna(subset=[text_column, label_column])

        print(f"Data berhasil diload: {len(frame)} sampel")
        print("Distribusi kelas:")
        print(frame[label_column].value_counts())
    except Exception as err:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error saat load CSV: {str(err)}")
        return None

    return frame

### 1.2.2 Preview Data

In [4]:
def preview_data(data: pd.DataFrame, n_samples: int = 5, include_stats: bool = True) -> Dict[str, Any]:
    """
    Collect a summary of a DataFrame into a dictionary.

    Args:
        data: DataFrame to summarise.
        n_samples: Number of rows taken for the head and tail samples.
        include_stats: When True, also add ``describe()`` output for numeric
            columns and unique-count / top-value info for object and
            category columns.

    Returns:
        Dict with keys ``dataset_shape``, ``column_names``, ``data_types``,
        ``memory_usage`` (formatted string in MB), ``head_samples``,
        ``tail_samples``, ``missing_values`` and ``missing_percentage``;
        plus ``numeric_statistics`` and/or ``categorical_info`` when
        ``include_stats`` is True and matching columns exist.
    """
    total_bytes = data.memory_usage(deep=True).sum()
    null_counts = data.isnull().sum()

    preview_info = {
        'dataset_shape': data.shape,
        'column_names': list(data.columns),
        'data_types': data.dtypes.to_dict(),
        'memory_usage': f"{total_bytes / 1024**2:.2f} MB",
        'head_samples': data.head(n_samples),
        'tail_samples': data.tail(n_samples),
        'missing_values': null_counts.to_dict(),
        'missing_percentage': (null_counts / len(data) * 100).round(2).to_dict(),
    }

    if not include_stats:
        return preview_info

    numeric_cols = data.select_dtypes(include=[np.number]).columns
    categorical_cols = data.select_dtypes(include=['object', 'category']).columns

    if len(numeric_cols) > 0:
        preview_info['numeric_statistics'] = data[numeric_cols].describe()

    if len(categorical_cols) > 0:
        # One entry per categorical column: cardinality and the most
        # frequent values (value_counts().head() -> top 5).
        preview_info['categorical_info'] = {
            col: {
                'unique_count': data[col].nunique(),
                'top_values': data[col].value_counts().head().to_dict(),
            }
            for col in categorical_cols
        }

    return preview_info

In [5]:
def display_data_overview(data: pd.DataFrame, target_column: str = None):
    """
    Print a sectioned plain-text overview of a DataFrame.

    The figures come from ``preview_data(data)``. Sections printed, in order:
    dataset overview, data types, missing values, target variable (only when
    ``target_column`` is given and present in ``data``), the head sample and,
    when available, a summary of the categorical columns.

    Args:
        data: DataFrame to describe.
        target_column: Optional name of the label column; when it exists in
            ``data`` its value counts and percentage distribution are printed.
    """
    summary = preview_data(data)
    rule = "=" * 60

    def section(title, first=False):
        # Every section after the first is preceded by a blank line.
        print(rule if first else "\n" + rule)
        print(title)
        print(rule)

    n_rows, n_cols = summary['dataset_shape'][0], summary['dataset_shape'][1]
    section("DATASET OVERVIEW", first=True)
    print(f"Shape: {n_rows} rows × {n_cols} columns")
    print(f"Memory Usage: {summary['memory_usage']}")
    print(f"Columns: {', '.join(summary['column_names'])}")

    section("DATA TYPES")
    for name, dtype in summary['data_types'].items():
        print(f"{name}: {dtype}")

    section("MISSING VALUES")
    for name, n_missing in summary['missing_values'].items():
        share = summary['missing_percentage'][name]
        print(f"{name}: {n_missing} ({share}%)")

    if target_column and target_column in data.columns:
        section(f"TARGET VARIABLE: {target_column}")
        print(data[target_column].value_counts())
        print("\nClass Distribution:")
        print((data[target_column].value_counts() / len(data) * 100).round(2))

    section("SAMPLE DATA (First 5 rows)")
    print(summary['head_samples'])

    if 'categorical_info' in summary:
        section("CATEGORICAL COLUMNS SUMMARY")
        for name, details in summary['categorical_info'].items():
            print(f"\n{name}:")
            print(f"  Unique values: {details['unique_count']}")
            print(f"  Top values: {details['top_values']}")

In [6]:
# Load the YouTube spam dataset; CONTENT holds the comment text and CLASS the label.
# load_csv_data returns None on failure, so `df` may be None if the file is missing.
# NOTE(review): the CONTENT preview further down shows 'ï»¿', which is what a UTF-8
# byte-order mark looks like when decoded as latin-1 — confirm the file's real encoding.
df = load_csv_data('../data/Youtube-Spam-Dataset.csv',
                   text_column='CONTENT',
                   label_column='CLASS',
                   encoding='latin-1')

Data berhasil diload: 1956 sampel
Distribusi kelas:
CLASS
1    1005
0     951
Name: count, dtype: int64


In [7]:
# Print the overview report, including the class balance of the CLASS label column.
display_data_overview(df, target_column='CLASS')

DATASET OVERVIEW
Shape: 1956 rows × 6 columns
Memory Usage: 0.87 MB
Columns: COMMENT_ID, AUTHOR, DATE, CONTENT, VIDEO_NAME, CLASS

DATA TYPES
COMMENT_ID: object
AUTHOR: object
DATE: object
CONTENT: object
VIDEO_NAME: object
CLASS: int64

MISSING VALUES
COMMENT_ID: 0 (0.0%)
AUTHOR: 0 (0.0%)
DATE: 245 (12.53%)
CONTENT: 0 (0.0%)
VIDEO_NAME: 0 (0.0%)
CLASS: 0 (0.0%)

TARGET VARIABLE: CLASS
CLASS
1    1005
0     951
Name: count, dtype: int64

Class Distribution:
CLASS
1    51.38
0    48.62
Name: count, dtype: float64

SAMPLE DATA (First 5 rows)
                                    COMMENT_ID            AUTHOR  \
0  LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU         Julius NM   
1  LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A       adam riyati   
2  LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8  Evgeny Murashkin   
3          z13jhp0bxqncu512g22wvzkasxmvvzjaz04   ElNino Melendez   
4          z13fwbwp1oujthgqj04chlngpvzmtt3r3dw            GsMega   

                  DATE                       

### 1.2.3 Convert to Lowercase

In [8]:
def convert_to_lowercase(data: Union[str, pd.DataFrame, pd.Series, List[str], np.ndarray],
                         column: str = None) -> Union[str, pd.DataFrame, pd.Series, List[str], np.ndarray]:
    """
    Lowercase text held in a string, DataFrame column, Series, list or array.

    Args:
        data: The text container to convert.
        column: Name of the column to lowercase; required when ``data`` is a
            DataFrame and ignored otherwise.

    Returns:
        An object of the same kind as ``data`` with the text lowercased.
        DataFrames are copied, so the input frame is left untouched.
        Non-string values are converted with ``str()`` first. Missing values
        (NaN/None) in a Series or DataFrame column stay missing instead of
        being turned into the literal text ``'nan'``/``'none'``. List and
        array elements are always stringified.

    Raises:
        ValueError: If ``data`` is a DataFrame and ``column`` is not given.
        TypeError: If ``data`` is of an unsupported type.
    """
    def _lower_series(series: pd.Series) -> pd.Series:
        # astype(str) would turn NaN/None into the words 'nan'/'none', which
        # later look like real tokens; mask those positions back to missing.
        lowered = series.astype(str).str.lower()
        return lowered.where(series.notna())

    if isinstance(data, str):
        return data.lower()

    if isinstance(data, pd.DataFrame):
        if column is None:
            raise ValueError("Parameter 'column' harus diisi untuk DataFrame")
        result = data.copy()
        result[column] = _lower_series(result[column])
        return result

    if isinstance(data, pd.Series):
        return _lower_series(data)

    if isinstance(data, list):
        return [str(text).lower() for text in data]

    if isinstance(data, np.ndarray):
        return np.array([str(text).lower() for text in data])

    raise TypeError(f"Tipe data {type(data)} tidak didukung")


In [9]:
# Lowercase the comment text into a new frame (convert_to_lowercase copies, so `df` is unchanged)
# and show the first few converted comments.
converted_df = convert_to_lowercase(df, column='CONTENT')
converted_df['CONTENT'].head()

0    huh, anyway check out this you[tube] channel: ...
1    hey guys check out my new channel and our firs...
2               just for test i have to say murdev.com
3    me shaking my sexy ass on my channel enjoy ^_^...
4            watch?v=vtarggvgtwq   check this out .ï»¿
Name: CONTENT, dtype: object

### 1.2.4 Remove Noise

In [10]:
def remove_noise(text):
    """
    Strip noise from a comment while keeping spam-relevant signals as tokens.

    Processing order:
      1. Decode HTML entities and drop HTML tags.
      2. Replace e-mail addresses with ``EMAILADDRESS`` and alphanumeric runs
         of 25+ characters with ``CRYPTOADDRESS``.
      3. Re-join domains written with spaces around the dot
         ("kidsmediausa .com"), then replace URLs / domains with ``URL``.
      4. Replace ordinals and numbers (optionally followed by letters) with
         ``NUM``.
      5. Collapse repeated symbols, drop emoticons, join stammered
         ("d-d-d-d") and spaced-out ("p e a c e") letters.
      6. Remove non-ASCII and remaining non-alphanumeric characters, shorten
         letters repeated 4+ times to two, merge consecutive ``NUM`` tokens
         and normalise whitespace.

    Args:
        text: Raw comment text. Falsy or missing values (None, '', NaN) are
            accepted.

    Returns:
        The cleaned string; ``''`` for falsy or missing input.
    """
    if not text or pd.isna(text):
        return ''

    text = str(text)

    # Decode HTML entities like &amp;, &lt;, &gt;
    text = html.unescape(text)

    # Remove HTML tags
    text = re.sub(r'<[^>]*>', ' ', text)

    # Replace e-mail addresses with a token
    text = re.sub(r'\b[\w.-]+@[\w.-]+\.\w+\b', ' EMAILADDRESS ', text)

    # Crypto/wallet addresses (long random alphanumeric strings)
    text = re.sub(r'\b[a-zA-Z0-9]{25,}\b', ' CRYPTOADDRESS ', text)

    # Re-join spaced domains BEFORE URL detection:
    # "kidsmediausa .com /x" -> "kidsmediausa.com/x".
    # The domain is only normalised here (not replaced) so that the
    # scheme-aware rules below consume a leading "http://" or "www." together
    # with it. The \b after the TLD stops "it. come" or "check.comment" from
    # being cut in the middle of a word, and whitespace is only consumed when
    # a path actually follows, so the next word is never glued on.
    text = re.sub(
        r'([a-zA-Z0-9_-]+)\s*\.\s*(com|net|org|id|co|uk|info|biz)\b(?:\s*(/\S*))?',
        lambda m: f"{m.group(1)}.{m.group(2)}{m.group(3) or ''}",
        text)

    # General URL / dotted-name detection (optional scheme or www prefix)
    text = re.sub(
        r'\b(?:https?://|www\.)?[\w.-]+\.[a-z]{2,}(?:/[^\s]*)*',
        ' URL ',
        text,
        flags=re.IGNORECASE
    )

    # Full URLs, including sloppy schemes such as "http:/site.com"
    text = re.sub(
        r'''(?ix)
        \b(?:https?:?/?/?|http:?/?/?|www\.)
        [\w\-]+(\.[\w\-.]+)+
        (/[^\s]*)?
        ''',
        ' URL ',
        text
    )

    # Shortened URLs
    text = re.sub(
        r'\b(?:bit|adf|tinyurl|rebrand|shorturl|is\.gd|shorte|cutt|t\.co|lnkd)\.(?:ly|me|com|co|gd|io)(?:/\S*)?',
        ' URL ',
        text
    )

    # YouTube video links
    text = re.sub(r'watch\?v=\S+', ' URL ', text)

    # Domain-only patterns (common in spam)
    text = re.sub(r'\b[a-zA-Z0-9_-]+\.(?:com|net|org|id|co|uk)\b', ' URL ', text)

    # Replace ordinal numbers (1st, 2nd, ...)
    text = re.sub(r'\b\d+(st|nd|rd|th)\b', 'NUM', text)

    # Replace every number, and number + unit letters, with NUM
    text = re.sub(
        r'\b\d+(\.\d+)?([eE][+-]?\d+)?([a-zA-Z]+)?\b',
        'NUM',
        text)

    # Collapse runs of the same non-word character into a single one
    text = re.sub(r'(\W)\1+', r'\1', text)

    # Remove emoticons such as :) :p :( :D :-P xD.
    # An emoticon that starts with a letter/digit (x, X, 8) must not be
    # preceded by a word character, and none may be followed by one; otherwise
    # ordinary words lose letters ("explain" -> "e lain", "note:part" ->
    # "note art").
    text = re.sub(r'(?:[:;=]|(?<!\w)[xX8])[-~^]?[)(dDPpOo3|/\\](?!\w)', ' ', text)

    # Join stammered letters: "d-d-d-d" -> "dddd"
    text = re.sub(r'\b([a-zA-Z])-([a-zA-Z])-([a-zA-Z]+(?:-[a-zA-Z])*)\b',
                  lambda m: m.group(0).replace('-', ''), text)

    # Join spaced-out letters: "p e a c e" -> "peace"
    text = re.sub(r'\b([a-zA-Z])\s+([a-zA-Z])\s+([a-zA-Z])\s+([a-zA-Z]+(?:\s+[a-zA-Z])*)\b',
                  lambda m: m.group(0).replace(' ', ''), text)

    # Remove BOM remnants, emojis and every other non-ASCII character
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # Drop everything except letters, digits and whitespace; the placeholder
    # tokens are purely alphabetic, so they survive this step
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

    # Shorten letters repeated 4+ times to two: "hellooooo" -> "helloo"
    text = re.sub(r'([a-zA-Z])\1{3,}', r'\1\1', text)

    # Merge consecutive NUM tokens into one
    text = re.sub(r'\b(NUM)(\s+\1)+\b', r'\1', text)

    # Final whitespace cleanup
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# 2. Data Cleaning