# 1. Configuration


## 1.1 Import Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import re
import html
import string
import unicodedata
import nltk
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer   
from sklearn.model_selection import train_test_split
from typing import Union, Tuple, List, Dict, Any
import sys
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('omw-1.4') 
import emoji

pd.set_option('display.max_columns', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## 1.2 Define Functions

### 1.2.1 Load Data

In [2]:
def load_csv_data(file_path: str, text_column: str, label_column: str, 
                  encoding: str = 'utf-8') -> pd.DataFrame:
    """
    Import data dari file CSV
    
    Args:
        file_path: Path ke file CSV
        text_column: Nama kolom yang berisi teks
        label_column: Nama kolom yang berisi label/kelas
        encoding: Encoding file (default: utf-8)
    
    Returns:
        DataFrame dengan data yang sudah diload
    """
    try:
        data = pd.read_csv(file_path, encoding=encoding)
        
        # Validasi kolom yang diperlukan ada
        if text_column not in data.columns:
            raise ValueError(f"Kolom '{text_column}' tidak ditemukan")
        if label_column not in data.columns:
            raise ValueError(f"Kolom '{label_column}' tidak ditemukan")
        
        # Bersihkan data dari nilai kosong
        data = data.dropna(subset=[text_column, label_column])
        
        print(f"Data berhasil diload: {len(data)} sampel")
        print(f"Distribusi kelas:")
        print(data[label_column].value_counts())
        
        return data
        
    except Exception as e:
        print(f"Error saat load CSV: {str(e)}")
        return None

### 1.2.2 Preview Data

In [3]:
def preview_data(data: pd.DataFrame, n_samples: int = 5, include_stats: bool = True) -> Dict[str, Any]:
   preview_info = {
       'dataset_shape': data.shape,
       'column_names': list(data.columns),
       'data_types': data.dtypes.to_dict(),
       'memory_usage': f"{data.memory_usage(deep=True).sum() / 1024**2:.2f} MB",
       'head_samples': data.head(n_samples),
       'tail_samples': data.tail(n_samples),
       'missing_values': data.isnull().sum().to_dict(),
       'missing_percentage': (data.isnull().sum() / len(data) * 100).round(2).to_dict()
   }
   
   if include_stats:
       numeric_cols = data.select_dtypes(include=[np.number]).columns
       categorical_cols = data.select_dtypes(include=['object', 'category']).columns
       
       if len(numeric_cols) > 0:
           preview_info['numeric_statistics'] = data[numeric_cols].describe()
       
       if len(categorical_cols) > 0:
           preview_info['categorical_info'] = {}
           for col in categorical_cols:
               preview_info['categorical_info'][col] = {
                   'unique_count': data[col].nunique(),
                   'top_values': data[col].value_counts().head().to_dict()
               }
   
   return preview_info

In [4]:
def display_data_overview(data: pd.DataFrame, target_column: str = None):
   info = preview_data(data)
   
   print("=" * 60)
   print("DATASET OVERVIEW")
   print("=" * 60)
   print(f"Shape: {info['dataset_shape'][0]} rows × {info['dataset_shape'][1]} columns")
   print(f"Memory Usage: {info['memory_usage']}")
   print(f"Columns: {', '.join(info['column_names'])}")
   
   print("\n" + "=" * 60)
   print("DATA TYPES")
   print("=" * 60)
   for col, dtype in info['data_types'].items():
       print(f"{col}: {dtype}")
   
   print("\n" + "=" * 60)
   print("MISSING VALUES")
   print("=" * 60)
   for col, missing in info['missing_values'].items():
       percentage = info['missing_percentage'][col]
       print(f"{col}: {missing} ({percentage}%)")
   
   if target_column and target_column in data.columns:
       print(f"\n" + "=" * 60)
       print(f"TARGET VARIABLE: {target_column}")
       print("=" * 60)
       print(data[target_column].value_counts())
       print(f"\nClass Distribution:")
       print((data[target_column].value_counts() / len(data) * 100).round(2))
   
   print(f"\n" + "=" * 60)
   print("SAMPLE DATA (First 5 rows)")
   print("=" * 60)
   print(info['head_samples'])
   
   if 'categorical_info' in info:
       print(f"\n" + "=" * 60)
       print("CATEGORICAL COLUMNS SUMMARY")
       print("=" * 60)
       for col, cat_info in info['categorical_info'].items():
           print(f"\n{col}:")
           print(f"  Unique values: {cat_info['unique_count']}")
           print(f"  Top values: {cat_info['top_values']}")

### 1.2.2 Convert Lower Case

In [5]:
def convert_to_lowercase(data: Union[str, pd.DataFrame, pd.Series, List[str], np.ndarray], 
                       column: str = None) -> Union[str, pd.DataFrame, pd.Series, List[str], np.ndarray]:
   if isinstance(data, str):
       return data.lower()
   
   elif isinstance(data, pd.DataFrame):
       if column is None:
           raise ValueError("Parameter 'column' harus diisi untuk DataFrame")
       result = data.copy()
       result[column] = result[column].astype(str).str.lower()
       return result
   
   elif isinstance(data, pd.Series):
       return data.astype(str).str.lower()
   
   elif isinstance(data, list):
       return [str(text).lower() for text in data]
   
   elif isinstance(data, np.ndarray):
       return np.array([str(text).lower() for text in data])
   
   else:
       raise TypeError(f"Tipe data {type(data)} tidak didukung")


### 1.2.4 remove_noise

In [50]:
def combine_emojis(text):
    """Process emoji dengan benar"""
    if not isinstance(text, str) or pd.isna(text):
        return ''
    
    # Deteksi dan pisahkan emoji
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"  # Enclosed characters
        "]+", 
        flags=re.UNICODE
    )
    
    # Tambahkan spasi di sekitar emoji
    text = emoji_pattern.sub(r' \g<0> ', text)
    
    # Konversi emoji ke nama
    text = emoji.demojize(text, language='en')
    
    # Perbaiki format :emoji_name: yang berurutan
    text = re.sub(r'(:([a-zA-Z0-9_]+):)(?=:[a-zA-Z0-9_]+:)', r'\1 ', text)
    
    # Ubah :emoji_name: menjadi emoji_name
    text = re.sub(r':([a-zA-Z0-9_]+):', r' \1 ', text)
    
    # Clean multiple spaces
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()


In [115]:
def remove_noise1(text):
    """Remove unwanted characters while preserving important features for classification"""

    if not isinstance(text, str) or pd.isna(text):
        return ''
    
    # 1. Decode HTML entities
    text = html.unescape(text)

    # 2. Remove HTML tags
    text = re.sub(r'<[^>]*>', ' ', text)

    # 3. Capture emoticons
    emoticons = re.findall(r'(\^_\^|:D|:P|<3|;-\)|;P|T_T|:\)|:\(|;D|xD)', text)

    # 4. Process emoji early so they don't get deleted
    text = combine_emojis(text)

    # 5. Clean Unicode/BOM characters (jaga spasi biar kata tidak nempel)
    text = text.replace('\ufeff', ' ')
    text = re.sub(r'[\u200b\u200c\u200d\x00-\x1f\x7f-\x9f]', ' ', text)

    # 6. Normalize width
    text = unicodedata.normalize('NFKC', text)

    # 7. Replace emails with token
    text = re.sub(r'\b[\w.-]+@[\w.-]+\.\w+\b', ' EMAILADDRESS ', text)

    # 8. Fix spaced URLs
    text = re.sub(r'\b(?:h\s*t\s*t\s*p(?:\s*s)?)\s*[:]\s*/\s*/(?:\s*\w\s*)+', 
                  lambda m: m.group(0).replace(' ', ''), text)

    # 9. Remove spaces around dots
    text = re.sub(r'\s*\.\s*', '.', text)

    # 10. Replace URLs with token
    url_regex = re.compile(r'(?i)\b((?:https?|ftp):\/\/|www\.)[^\s\'"<>]+')
    text = url_regex.sub(' url ', text)
    text = re.sub(r'\bwatch\?v=[\w-]+(?:&[\w=]*)*', 'url', text)
    domain_regex = re.compile(
        r'\b([a-zA-Z0-9-]{2,})\s*\.\s*(com|net|org|ly|id|co|uk|info|biz|io|me|tv|cc|asia|dev|app|xyz)\b',
        re.IGNORECASE
    )
    text = domain_regex.sub('url', text)
    obfuscated_url_pattern = re.compile(
        r'\b(?:h\s*t\s*t\s*p|h\s*t\s*t\s*p\s*s)\s*[:]\s*/\s*/(?:\s*\w\s*)+',
        re.IGNORECASE
    )
    text = obfuscated_url_pattern.sub('url', text)


    # 11. Remove hash-like strings
    text = re.sub(r'\b[a-fA-F0-9]{32,}\b', ' ', text)

    # 12. Numbers handling
    text = re.sub(r'\b\d+(?:gb|mb|kb|tb|billion|million|k|m|yr|year|min|sec|dollar|usd|bit|hz|mp|mm|cm|inch|in)\b',
                  'numeric', text, flags=re.IGNORECASE)
    text = re.sub(r'\b\d+(st|nd|rd|th)\b', 'numeric', text)
    text = re.sub(r'\b\d{4,}\b', 'numeric', text)
    text = re.sub(r'\b\d{1,3}\b', 'numeric', text)
    text = re.sub(r'\b\d+x10\^\d+\b', 'numeric', text)

    # 13. Remove brackets but keep contents
    text = re.sub(r'\[(.*?)\]', r'\1', text)

    # 14. Clean apostrophes in wrong places
    text = re.sub(r"(?<![a-zA-Z])'|'(?![a-zA-Z])", ' ', text)

    # 15. Remove unwanted non-ASCII chars but keep letters, numbers, space, apostrophes
    text = re.sub(r"[^a-zA-Z0-9\s']", ' ', text)

    # 16. Merge repeated symbols
    text = re.sub(r'(\W)\1+', r'\1', text)

    # 17. Remove standalone underscores
    text = re.sub(r'(?<!\w)_+(?!\w)', '', text)

    # 18. Merge separated letters with hyphen or space
    text = re.sub(r'\b([a-zA-Z])-([a-zA-Z])-([a-zA-Z]+(?:-[a-zA-Z])*)\b',
                  lambda m: m.group(0).replace('-', ''), text)
    text = re.sub(r'\b(?:[a-zA-Z]\s+){2,}[a-zA-Z]\b',
                  lambda m: m.group(0).replace(' ', ''), text)

    # 19. Normalize character repetitions → langsung jadi 1 huruf
    text = re.sub(r'([a-zA-Z])\1{1,}', r'\1', text)

    # 20. Normalize "haaa" → "ha"
    text = re.sub(r'\b(ha){2,}\b', 'ha', text)
    
    # 21. Consolidate multiple tokens
    text = re.sub(r'\b(numeric|url|EMAILADDRESS)(\s+\1)+\b', r'\1', text)
    
    # 22. Add back emoticons
    text = text + ' ' + ' '.join(emoticons).replace('-', '')
    
    # 23. Final cleanup
    text = re.sub(r'\s+', ' ', text).strip()

    return text


In [64]:
def remove_noise(text):
    '''Remove unwanted characters while preserving important features for spam/ham classification'''

    if not isinstance(text, str) or pd.isna(text):
        return ''
    
    # 1. Decode HTML entities like &amp;, &lt;, &gt;
    text = html.unescape(text)

    # 2. Remove HTML tags
    text = re.sub(r'<[^>]*>', ' ', text)

     # 3. Tangkap emotikon berbasis teks
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P|d)', text)

    # 3. Clean Unicode/BOM characters
    text = text.replace('\ufeff', '')  # Hapus BOM secara spesifik
    text = re.sub(r'[\u200b\u200c\u200d\x00-\x1f\x7f-\x9f]', '', text)

    # 4. Normalisasi full-width ke half-width (ASCII)
    text = unicodedata.normalize('NFKC', text)
    
    # 5. Replace emails with token
    text = re.sub(r'\b[\w.-]+@[\w.-]+\.\w+\b', ' EMAILADDRESS ', text)

    # 6. Gabungkan huruf-huruf terpisah jika mereka terlihat seperti URL
    text = re.sub(r'\b(?:h\s*t\s*t\s*p(?:\s*s)?)\s*[:]\s*/\s*/(?:\s*\w\s*)+', 
              lambda m: m.group(0).replace(' ', ''), text)

    # 7. Hapus spasi di sekitar titik (untuk domain)
    text = re.sub(r'\s*\.\s*', '.', text)

    # 8. Handling URL
    # Ganti semua jenis URL dengan 'url'
    url_regex = re.compile(
      r"""(?i)\b((?:https?|ftp):\/\/|www\.)[^\s'"<>]+""",
      re.VERBOSE
    )
    text = url_regex.sub(' url ', text)
    text = re.sub(r'\bwatch\?v=[\w-]+(?:&[\w=]*)*', 'url', text)  #  Ganti juga pola YouTube jika ada
    domain_regex = re.compile(
       r'\b([a-zA-Z0-9-]{2,})\s*\.\s*(com|net|org|id|co|uk|info|biz|io|me|tv|cc|asia|dev|app|xyz)\b',
        re.IGNORECASE
    )  #Ganti pola domain terpisah
    text = domain_regex.sub('url', text)
    obfuscated_url_pattern = re.compile(
        r'\b(?:h\s*t\s*t\s*p|h\s*t\s*t\s*p\s*s)\s*[:]\s*/\s*/(?:\s*\w\s*)+',
        re.IGNORECASE
    )
    text = obfuscated_url_pattern.sub('url', text)

    # 9. Process emoji first
    text = combine_emojis(text)
    
    # 10. Hapus string mirip hash (MD5/SHA, hex 32+ karakter)
    text = re.sub(r'\b[a-fA-F0-9]{32,}\b', ' ', text)

    # 11. Hapus alfanumerik acak minimal 15 karakter yang mengandung huruf + angka
   #text = re.sub(r'\b(?=.*[a-zA-Z])(?=.*\d)[a-zA-Z0-9]{15,}\b', ' ', text)

    text = re.sub(r'\b[A-Za-z0-9_]{15,}\b', ' ', text)

    # 12. Handling numbers:
    # --- Lindungi angka dengan satuan penting ---
    text = re.sub(r'\b\d+(?:gb|mb|kb|tb|billion|million|k|m|yr|year|min|sec|dollar|usd|bit|hz|mp|mm|cm|inch|in)\b',
              'PROTECTED_NUMERIC', text, flags=re.IGNORECASE)
    # Replace ordinal numbers (1st, 2nd, 3rd, etc.)
    text = re.sub(r'\b\d+(st|nd|rd|th)\b', 'numeric', text)
    # Ganti angka 4+ digit (tahun, kode, dll)
    text = re.sub(r'\b\d{4,}\b', 'numeric', text)
    # Ganti angka kecil 1–3 digit
    text = re.sub(r'\b\d{1,3}\b', 'numeric', text)
    # Ganti notasi scientific (3x10^8)
    text = re.sub(r'\b\d+x10\^\d+\b', 'numeric', text)
    # Pulihkan angka yang sebelumnya dilindungi
    text = text.replace('PROTECTED_NUMERIC', 'numeric')

    # Buang kurung siku tapi pertahankan isinya
    text = re.sub(r'\[(.*?)\]', r'\1', text)

    # --- 13. Clean special characters but keep apostrophes in words ---
    # Buang apostrof jika di awal/akhir kata (misal `' hello` → ` hello`)
    text = re.sub(r"(?<![a-zA-Z])'|'(?![a-zA-Z])", ' ', text)

    # Buang semua karakter selain huruf, angka, spasi, underscore, dan apostrof dalam kata
   #text = re.sub(r'[^\w\s\']', ' ', text)
    # Buang semua karakter non-ASCII kecuali huruf, angka, spasi, dan apostrof
    text = re.sub(r"[^a-zA-Z0-9\s']", ' ', text)

    # Gabungkan simbol berulang (misal `!!!` → `!`)
    text = re.sub(r'(\W)\1+', r'\1', text)

    # Buang underscore yang berdiri sendiri
    text = re.sub(r'(?<!\w)_+(?!\w)', '', text)

    # --- 11. Fix spam-like separated characters ---
    # Gabungkan huruf-huruf dipisah tanda hubung: "d-d-d" → "ddd"
    text = re.sub(r'\b([a-zA-Z])-([a-zA-Z])-([a-zA-Z]+(?:-[a-zA-Z])*)\b',
              lambda m: m.group(0).replace('-', ''), text)

    # Gabungkan huruf-huruf terpisah spasi: "s p a m" → "spam"
    text = re.sub(r'\b(?:[a-zA-Z]\s+){2,}[a-zA-Z]\b',
              lambda m: m.group(0).replace(' ', ''), text)

    # --- 12. Normalize character repetitions ---
    # Huruf berulang 3+ kali jadi 2 saja (e.g., cooool → cool)
    text = re.sub(r'([a-zA-Z])\1{2,}', r'\1\1', text)

    # Hahaha → haha
    text = re.sub(r'\b(ha){2,}\b', 'haha', text)
    
    # 14. Consolidate multiple instances of tokens
    text = re.sub(r'\b(numeric|url|EMAILADDRESS)(\s+\1)+\b', r'\1', text)
    
    # 7. Tambahkan kembali emotikon teks (hilangkan tanda '-')
    text = text + ' ' + ' '.join(emoticons).replace('-', '')
    
    # 15. Final whitespace cleanup
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [118]:
print(remove_noise1('I am now going to voyage to the first comment... Tell my family I loved them. 😢'))

I am now going to voyage to the first coment Tel my family I loved them crying face


In [9]:
def filter_tokens(tokens):
    """Post-processing untuk filter token hasil"""
    # Kata pendek yang valid
    valid_short = {
        'i', 'a', 'am', 'an', 'as', 'at', 'be', 'by', 'do', 'go', 
        'he', 'hi', 'if', 'in', 'is', 'it', 'me', 'my', 'no', 'of', 
        'ok', 'on', 'or', 'so', 'to', 'up', 'us', 'we', 'oh', 'id'
    }
    
    filtered = []
    for token in tokens:
        # Skip token kosong
        if not token or token.isspace():
            continue
            
        # Skip token terlalu pendek kecuali valid
        if len(token) <= 2 and token not in valid_short:
            continue
            
        # Skip jika hanya underscore atau angka
        if token.replace('_', '').replace('-', '').isdigit():
            continue
            
        # Skip token yang mencurigakan (no vowel, terlalu random)
        if len(token) > 5 and not any(c in 'aeiou' for c in token):
            continue
            
        filtered.append(token)
    
    return filtered

### 1.2.5 Tokenisasi

In [10]:
def word_tokenization(text):
  return word_tokenize(text)

### 1.2.6 Stopwords Removal

In [11]:
def get_selective_stopwords():
    """
    Ambil NLTK stopwords, lalu remove kata-kata penting untuk spam detection
    """
    # Ambil semua NLTK stopwords
    nltk_stops = set(stopwords.words("english"))
    
    # Kata-kata yang HARUS DIPERTAHANKAN untuk spam detection
    keep_for_spam = {
        # Negation (SANGAT PENTING)
        "not", "no", "don't", "won't", "can't", "haven't", "hasn't", 
        "hadn't", "isn't", "aren't", "wasn't", "weren't", "wouldn't", 
        "couldn't", "shouldn't", "mustn't", "needn't", "mightn't", "ain't",
        
        # Personal targeting
        "you", "your", "yours", "yourself", "yourselves", "me", "my", 
        "mine", "myself", "i", "i'm", "i've", "i'll", "i'd",
        
        # Urgency & Persuasion
        "now", "only", "just", "will", "would", "should", "must", 
        "more", "most", "all", "any", "some", "few", "many", "much",
        
        # Modality
        "can", "could", "may", "might", "shall",
        
        # Intensifiers
        "very", "too", "so", "quite", "rather",
        
        # Frequency
        "always", "never", "often", "sometimes", "usually", "again", "once"
    }
    
    # Selective stopwords = NLTK stopwords - kata penting spam
    selective_stops = nltk_stops - keep_for_spam
    
    return selective_stops

def remove_stopwords3(text):
    """
    Selective stopword removal menggunakan modified NLTK stopwords
    """
    selective_stops = get_selective_stopwords()
    
    if isinstance(text, list):
        filtered_text = [w for w in text if w.lower() not in selective_stops]
        return filtered_text
    
    elif isinstance(text, str):
        tokens = text.split()
        filtered_tokens = [w for w in tokens if w.lower() not in selective_stops]
        return ' '.join(filtered_tokens)
    
    else:
        return text

In [12]:
def remove_stopwords(text): 
    """Menghapus stopword dari list kata yang diberikan."""
    stops = set(stopwords.words("english"))
    
    if isinstance(text, list):
        text = [w for w in text if w.lower() not in stops]
        
    return text

In [None]:
stops = set(stopwords.words("english"))
print(stops)

### 1.2.7 Stemming

In [13]:
# Inisialisasi stemmer untuk bahasa Inggris
stemmer = SnowballStemmer("english")

# Fungsi untuk stemming setiap kata dalam list
def stemmed_wrapper(document): 
    return [stemmer.stem(term) for term in document]

### 1.2.9 Normalization

In [85]:
token_normalization_dict = {
    # 1. CONTRACTIONS EXPANSION
    "n't": 'not',
    "'ll": 'will',
    "'re": 'are',
    "'ve": 'have',
    "'m": 'am',
    "dont": 'do not',
    "doesnt": 'does not',
    "didnt": 'did not',
    "wont": 'will not',
    "cant": 'cannot',
    "isnt": 'is not',
    "wouldnt": 'would not',
    "thats": 'that is',
    "whats": 'what is',
    "ive": 'i have',
    "hes": 'he is',
    "shes": 'she is',
    "its": 'it is',
    "ill": 'i will',
    "im": 'i am',
    "youre": 'you are',
    "theyre": 'they are',
    "weve": 'we have',
    "youve": 'you have',
    "theres": 'there is',
    "heres": 'here is',
    "itz": 'it is',
    "y'all": 'you all',
    "yall": 'you all',
    "i'am": 'i am',
    
    # 2. LAUGH VARIATIONS
    'laugh': 'laugh',
    'haha': 'laugh',
    'hahah': 'laugh',
    'hahaa': 'laugh',
    'hahahahah': 'laugh',
    'lol': 'laugh',
    'lool': 'laugh',
    'roar': 'laugh',
    'roaarr': 'laugh',
    'roarr': 'laugh',
    'lmfao': 'laugh',
    'lmfaoi': 'laugh',
    'lmfaois': 'laugh',
    'lmfaovevo': 'laugh',
    'lmfao': 'laugh',
    'rofl': 'laugh',

    # Slang & Internet Speak
    "imo": "in my opinion",
    "imho": "in my humble opinion",
    "idk": "i don't know",
    "idc": "i don't care",
    "brb": "be right back",
    "btw": "by the way",
    "omg": "oh my god",
    "omfg": "oh my freaking god",
    "fyi": "for your information",
    "smh": "shaking my head",
    "tbh": "to be honest",
    "jk": "just kidding",
    "nvm": "never mind",
    "np": "no problem",
    "afaik": "as far as i know",
    "asap": "as soon as possible",
    "atm": "at the moment",
    "rn": "right now",
    "wbu": "what about you",
    "hbu": "how about you",
    "ikr": "i know right",
    "omw": "on my way",
    "tmi": "too much information",
    "bff": "best friend forever",
    "bf": "boyfriend",
    "gf": "girlfriend",
    "ily": "i love you",
    "ilu": "i love you",
    "ilu2": "i love you too",
    "ilysm": "i love you so much",
    "xoxo": "hugs and kisses",
    "wtf": "what the fuck",
    "wth": "what the hell",
    "idgaf": "i do not give a fuck",
    "ffs": "for fuck's sake",
    "fml": "fuck my life",


    # 3. LOVE VARIATIONS
    'love': 'love',
    'lovee': 'love',
    'loveee': 'love',
    'loove': 'love',
    'loovee': 'love',
    'loovve': 'love',
    'loovvee': 'love',
    'lova': 'love',
    'lovet': 'love',
    'luv': 'love',
    'ilove': 'love',
    'ilovethissong': 'love',
    'lovethewayyoulie': 'love',
    
    # 4. LIKE VARIATIONS
    'like': 'like',
    'likee': 'like',
    'llike': 'like',
    'llikee': 'like',
    'lik': 'like',
    'likkee': 'like',
    
    # 5. SUBSCRIBE VARIATIONS
    'subscribe': 'subscribe',
    'sub': 'subscribe',
    'subscribee': 'subscribe',
    'subscrib': 'subscribe',
    'subcribe': 'subscribe',
    'suscribe': 'subscribe',
    'suscriba': 'subscribe',
    'suscríbase': 'subscribe',
    'subscribirse': 'subscribe',
    'wilsubscribe': 'subscribe',
    'subscribl': 'subscribe',
    'subscrible': 'subscribe',
    'sucscribe': 'subscribe',
    'suscribite': 'subscribe',
    'subscriber': 'subscribe',
    'subscribers': 'subscribe',
    'subscription': 'subscribe',
    
    # 6. PLEASE VARIATIONS
    'please': 'please',
    'pleas': 'please',
    'pleaase': 'please',
    'plese': 'please',
    'plz': 'please',
    'pls': 'please',
    'plzz': 'please',
    'plizz': 'please',
    
    # 7. THANKS VARIATIONS
    'thanks': 'thanks',
    'thankss': 'thanks',
    'thx': 'thanks',
    'thnx': 'thanks',
    'thankful': 'thanks',
    'ty': 'thanks',
    'tyvm': 'thanks',
    'tqvm': 'thanks',
    
    # 8. VIDEO VARIATIONS
    'video': 'video',
    'videos': 'video',
    'vidio': 'video',
    'vidios': 'video',
    'videoes': 'video',
    'vid': 'video',
    'vidz': 'video',
    'vídeo': 'video',
    "video'sdi": 'video',
    'vids': 'video',

    # 9. MUSIC VARIATIONS
    'music': 'music',
    'muzik': 'music',
    'miusic': 'music',
    'musique': 'music',
    
    # 10. AWESOME VARIATIONS
    'awesome': 'awesome',
    'awsome': 'awesome',
    'awesom': 'awesome',
    'awesomee': 'awesome',
    'awesoome': 'awesome',
    'awesum': 'awesome',
    'awesomeness': 'awesome',
    
    # 11. GOOD VARIATIONS
    'good': 'good',
    'goo': 'good',
    'goot': 'good',
    'goodlooking': 'good looking',
    
    # 12. NICE VARIATIONS
    'nice': 'nice',
    'nicee': 'nice',
    'nicei': 'nice',
    'nicer': 'nice',
    
    # 13. BEAUTIFUL VARIATIONS
    'beautiful': 'beautiful',
    'beutiful': 'beautiful',
    'beautifull': 'beautiful',
    'beauty': 'beautiful',
    'beaties': 'beautiful',
    
    # 14. GREAT VARIATIONS
    'great': 'great',
    'greatest': 'great',
    
    # 15. MONEY VARIATIONS
    'money': 'money',
    'moneyz': 'money',
    
    # 16. YOUTUBE VARIATIONS
    'youtube': 'youtube',
    'youtu': 'youtube',
    'youtub': 'youtube',
    'yt': 'youtube',
    "watchin": "watching",
    "goin": "going",
    "doin": "doing",
    "runnin": "running",
    "comin": "coming",
    "uploading": "upload",
    "remixes": "remix",
    "singer": "sing",
    "songs": "song",
    "soundsofsunday": "sound",
    "spamming": "spam",
    "reading": "read",
    "reads": "read",
    "readed": "read",


    # 17. CHANNEL VARIATIONS
    'channel': 'channel',
    'channels': 'channel',
    'channell': 'channel',
    'chhanel': 'channel',
    
    # 18. FRIEND VARIATIONS
    'friend': 'friend',
    'friends': 'friend',
    'freind': 'friend',
    'freinds': 'friend',
    
    # 19. GIRL VARIATIONS
    'girl': 'girl',
    'girls': 'girl',
    'girly': 'girl',
    
    # 20. PEOPLE VARIATIONS
    'people': 'people',
    'peoples': 'people',
    'poeple': 'people',
    
    # 21. REALLY VARIATIONS
    'really': 'really',
    'realy': 'really',
    
    # 22. DEFINITELY VARIATIONS
    'definitely': 'definitely',
    'definitley': 'definitely',
    'definitily': 'definitely',
    
    # 23. CONGRATULATIONS VARIATIONS
    'congratulations': 'congratulations',
    'congrats': 'congratulations',
    'congrasulation': 'congratulations',
    
    # 24. FUNNY VARIATIONS
    'funny': 'funny',
    'funnier': 'funny',
    
    # 25. SORRY VARIATIONS
    'sorry': 'sorry',
    'sore': 'sorry',
    
    # 26. SOON VARIATIONS
    'soon': 'soon',
    'soo': 'soon',
    'soong': 'soon',
    
    # 27. JUST VARIATIONS
    'just': 'just',
    'jus': 'just',
    'juss': 'just',
    'justing': 'just',
    
    # 28. WEIRD VARIATIONS
    'weird': 'weird',
    'wierd': 'weird',
    
    # 29. BILLION/MILLION VARIATIONS
    'billion': 'billion',
    'billions': 'billion',
    'bilion': 'billion',
    'billon': 'billion',
    'million': 'million',
    'millions': 'million',
    'milion': 'million',
    'milions': 'million',
    'millioon': 'million',
    'millon': 'million',
    
    # 30. YEAR VARIATIONS
    'year': 'year',
    'years': 'year',
    'yeat': 'year',
    "yr": "year",

    # 31. WELCOME VARIATIONS
    'welcome': 'welcome',
    'wellcome': 'welcome',
    'wellcomemd': 'welcome',
    
    # 32. OFFICIAL VARIATIONS
    'official': 'official',
    'offical': 'official',
    'offıcal': 'official',
    
    # 33. EMOJI NORMALIZATION
    'red_heart': 'love_emoji',
    'heart_suit': 'love_emoji',
    'purple_heart': 'love_emoji',
    'growing_heart': 'love_emoji',
    'beating_heart': 'love_emoji',
    'sparkling_heart': 'love_emoji',
    'smiling_face_with_heart': 'love_emoji',
    'smiling_cat_with_heart': 'love_emoji',
    'face_blowing_a_kiss': 'love_emoji',
    'kissing_face': 'love_emoji',
    'kissing_face_with_closed_eyes': 'love_emoji',
    'kissing_face_with_smiling_eyes': 'love_emoji',
    
    'smiling_face': 'happy_emoji',
    'smiling_face_with_smiling_eyes': 'happy_emoji',
    'grinning_face': 'happy_emoji',
    'grinning_face_with_smiling_eyes': 'happy_emoji',
    'grinning_face_with_big_eyes': 'happy_emoji',
    'beaming_face_with_smiling_eyes': 'happy_emoji',
    'grinning_squinting_face': 'happy_emoji',
    'grinning_face_with_sweat': 'happy_emoji',
    'face_with_tears_of_joy': 'happy_emoji',
    'smiling_face_with_sunglasses': 'happy_emoji',
    'smiling_face_with_halo': 'happy_emoji',
    
    'crying_face': 'sad_emoji',
    'loudly_crying_face': 'sad_emoji',
    'sad_but_relieved_face': 'sad_emoji',
    'disappointed_face': 'sad_emoji',
    'downcast_face_with_sweat': 'sad_emoji',
    'weary_face': 'sad_emoji',
    'tired_face': 'sad_emoji',
    'sleepy_face': 'sad_emoji',
    'pensive_face': 'sad_emoji',
    
    'angry_face': 'angry_emoji',
    'angry_face_with_horns': 'angry_emoji',
    'enraged_face': 'angry_emoji',
    
    # 34. OTHER COMMON VARIATIONS
    'amaze': 'amazing',
    'amazing': 'amazing',
    'amazement': 'amazing',
    'amazed': 'amazing',
    'amazes': 'amazing',
    
    'incredible': 'incredible',
    'increible': 'incredible',
    'increidebl': 'incredible',
    
    'picture': 'picture',
    'pictures': 'picture',
    'pic': 'picture',
    
    'ok': 'okay',
    'okay': 'okay',
    
    'cool': 'cool',
    'col': 'cool',
    
    'stupid': 'stupid',
    'sttuupid': 'stupid',
    
    'damn': 'damn',
    'damnn': 'damn',
    
    'shit': 'shit',
    'shiit': 'shit',
    
    'fuck': 'fuck',
    'fucken': 'fuck',
    'fack': 'fuck',
    
    'wow': 'wow',
    'woww': 'wow',
    
    'hi': 'hello',
    'hello': 'hello',
    'hey': 'hello',
    'hii': 'hello',
    'hiya': 'hello',
    
    'bye': 'goodbye',
    'goodbye': 'goodbye',
    
    'ya': 'you',
    'u': 'you',
    'ur': 'your',
    
    'yea': 'yeah',
    'yeah': 'yeah',
    'yep': 'yeah',
    'yes': 'yes',
    
    'ahh': 'ah',
    'ahhh': 'ah',
    
    'oh': 'oh',
    'ohh': 'oh',
    'ohhh': 'oh',
    
    'umm': 'um',
    'ummm': 'um',
    
    "app": "app",
    "apps": "app",
    "application": "app",
    "applications": "app",
    "fb": "facebook",
    "ig": "instagram",
    "faq": "frequently asked questions",
    "ftw": "for the win",
    "gg": "good game",
    "b4": "before",
    "bc": "because",
    "bcoz": "because",
    "bcuz": "because",
    "bcause": "because",
    "bcz": "because",
    "cuz": "because",
    "coz": "because",
    "txt": "text",
    "msg": "message",
    "convo": "conversation",
    "sec": "second",

    # 35. ACTIVITY EMOJI NORMALIZATION
    'person_surfing': 'activity_emoji',
    'person_swimming': 'activity_emoji',
    'snowboarder': 'activity_emoji',
    'person_wearing_turban': 'person_emoji',
    
    # 36. FOOD EMOJI NORMALIZATION
    'fork_and_knife': 'food_emoji',
    'pizza': 'food_emoji',
    'hamburger': 'food_emoji',
    'french_fries': 'food_emoji',
    'poultry_leg': 'food_emoji',
    'hot_beverage': 'food_emoji',
    'cocktail_glass': 'food_emoji',
    'tropical_drink': 'food_emoji',
    'teacup_without_handle': 'food_emoji',
    
    # 37. OBJECT EMOJI NORMALIZATION
    'police_car_light': 'object_emoji',
    'fishing_pole': 'object_emoji',
    'trophy': 'object_emoji',
    'clinking_beer_mugs': 'object_emoji',
    'gem_stone': 'object_emoji',
    'lipstick': 'object_emoji',
    'musical_notes': 'music_emoji',
    'musical_note': 'music_emoji',
    'musical_score': 'music_emoji',
    
    # 38. NATURE EMOJI NORMALIZATION
    'rainbow': 'nature_emoji',
    'tiger': 'tiger',
    'tiger_face': 'tiger',
    
    # 39. SYMBOL EMOJI NORMALIZATION
    'thumbs_up': 'positive_emoji',
    'victory_hand': 'positive_emoji',
    'clapping_hands': 'positive_emoji',
    'open_hands': 'positive_emoji',
    'raising_hands': 'positive_emoji',
    'kiss_mark': 'love_emoji',
    'right_arrow_curving_up': 'arrow_emoji',

    # Contractions / Spoken Forms
    "gonna": "going to",
    "wanna": "want to",
    "gotta": "got to",
    "hafta": "have to",
    "lemme": "let me",
    "gimme": "give me",
    "kinda": "kind of",
    "sorta": "sort of",
    "outta": "out of",
    "lotta": "a lot of",
    "dunno": "do not know",
    "ain't": "is not",
    "wassup": "what is up",
    "wazzup": "what is up",
    "sup": "what is up",
    "cya": "see you",
    "cu": "see you",
    "cu2moro": "see you tomorrow",
    "bbl": "be back later",
    "hmu": "hit me up",
    "nm": "not much",
    "ppl": "people",
    "oml": "oh my lord",
    "nuthin": "nothing",
    "wat": "what",
    "wut": "what",
    "wyd": "what are you doing",

    # Name typos / normalization
    "eminems": "eminem",
    "rihana": "rihanna",
    "chanel": "channel",
    "chanell": "channel",
    "channell": "channel",

    # Entertainment context
    "entertainer": "entertain",
    "entertainment": "entertain",
    "entertaining": "entertain",
    "entertains": "entertain",
    "entertained": "entertain",

    # British → American English
    "favourite": "favorite",
    "fave": "favorite",
    "fav": "favorite",
    "mum": "mother",
    "mummy": "mother",
    "mom": "mother",
    "mommy": "mother",

    # Positive traits
    "talented": "talent",
    "swagfriends": "swag",
    "swag": "swag",
    "successful": "success",
    "survival": "survive",

    # Misc
    "gr8": "great",
    "true": "true",
    "thumbs": "thumb",
    "foward": "forward",
    "tryna": "try",
    "web": "website",

    
    # 40. NOISE TOKENS TO REMOVE
    '0lneadw26bftvzqt6juehasiefrjg1exi_dvqdnqvpho': '',
    '0lneadw26bfunoarag71awgu6tjo6azdkfiun_tz1_hy': '',
    'a0qouc7q48v3_qiaabpugaacsqar0_vgoqwqxjmpuyvkosf3k': '',
    'q1bvkf9ze4jhnc3ovckkxcbafzzpajibxwbvvq4jrdgz8q3rinlwgaby_bxlfw7ma6dk0rjg14zkryizwqdi7hxgge9tndd9abfltfkbmbffcjixnthwbwkj6n2onlh2d9eveagphoewxogbnu5ibgtrkgnacq1oibgmzgafnsc0lsariqj8hqr8t12dwv_7biy4k6i3y4yublotde_4xvklnveadzzf1l_xryqke6wsur3edljwgk8flq_qaldi': '',
    '0dbhjzdw0lbsjbi40gxm0d0p5krhv8xinqli53__wqbahs8zx4mjhw5vwrkpxfoeks': '',
    '강남스타일': '',
    '2x10': '',
    '26t22': '',
    '1bsefq ': '',
    '1bi': '',
    '14gkvdo': '',
    '0lneadw26bft': '',
    "a'n": '',
    "4000dollar": "numeric dollar",
    '2012bitch': 'numeric bitch',
    '1billiom': 'numeric billion',
    "36loseweight": "",
    "2012bitch": '',
    "1bsefq": ''
    }

In [86]:
# Kamus bantuan dari XuanyiZ/Text-Normalization
new_normalization_dict = {}
with open('C:\\Users\\ASUS\\Documents\\Tugas Akhir\\data\\Test_Set_3802_Pairs.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        parts = line.split('\t')
        if len(parts) >= 2:
            # Sesuaikan dengan format file Anda.
            # Jika ada frekuensi di awal, gunakan parts[1] dan parts[2]
            if parts[0].isdigit(): 
                non_standard = parts[1]
                standard = parts[2]
            # Jika tidak ada frekuensi, gunakan parts[0] dan parts[1]
            else:
                non_standard = parts[0]
                standard = parts[1]
            new_normalization_dict[non_standard] = standard


In [87]:

# LANGKAH 2: Gabungkan kedua kamus (INI YANG PENTING!)
combined_dict = new_normalization_dict.copy()  # Copy kamus lama
combined_dict.update(token_normalization_dict)     # Tambahkan kamus baru

# LANGKAH 3: Ubah fungsi normalize_tokens Anda
def normalize_tokens(tokens):
    """Normalize tokens menggunakan kamus gabungan"""
    normalized = []
   
    for token in tokens:
        # Skip empty tokens
        if not token or token.isspace():
            continue
           
        # Gunakan kamus gabungan (HANYA INI YANG BERUBAH!)
        if token.lower() in combined_dict:
            replacement = combined_dict[token.lower()]
            if replacement:
                normalized.append(replacement)
        else:
            normalized.append(token.lower())
   
    return normalized

In [88]:
print(combined_dict)

{'0kkay': 'okay', '0n': 'on', '0neee': 'one', '0r': 'or', '1s': 'once', '2daii': 'today', '2day': 'today', "2day's": "today's", '2gether': 'together', '2marro': 'tomorrow', '2moro': 'tomorrow', '2morro': 'tomorrow', '2morrow': 'tomorrow', '2moz': 'tomorrow', '2mz': 'tomorrow', '2nd': 'second', '2niiqht': 'tonight', '2nite': 'tonight', '2nyt': 'tonight', '2wo': 'tomorrow', '3s': 'threes', '4evaa': 'forever', '4ever': 'forever', '4got': 'forgot', '4rm': 'from', '5ay': 'say', '5ayin': 'saying', '5o': 'so', '5th': 'fifth', '5top': 'stop', 'aa': 'a', 'aaaand': 'and', 'aaaaw': 'anyway', 'aaalll': 'all', 'aand': 'and', 'abole': 'able', 'abooout': 'about', 'absolotuely': 'absolutely', 'absolutly': 'absolutely', 'abt': 'about', 'accross': 'across', 'acct': 'account', 'achivements': 'achievements', 'acount': 'account', 'actin': 'acting', 'actn': 'action', 'actt': 'act', 'actualy': 'actually', 'addaction': 'addiction', 'addin': 'adding', 'addiquate': 'adequate', 'addres': 'address', 'addy': 'addr

In [None]:
def normalize_tokens(tokens):
    """Normalize tokens menggunakan dictionary"""
    normalized = []
    
    for token in tokens:
        # Skip empty tokens
        if not token or token.isspace():
            continue
            
        # Check if token needs normalization
        if token.lower() in token_normalization_dict:
            replacement = token_normalization_dict[token.lower()]
            if replacement:  # Only add if not empty string
                normalized.append(replacement)
        else:
            normalized.append(token.lower())
    
    return normalized

## 1.2 Representasi Teks

### 1.2.1 BoW Biner

In [None]:
# ===== 1. Bangun Kosakata =====
def build_vocabulary(documents, min_freq=1):
    word_counts = Counter()
    for doc in documents:
        word_counts.update(doc)
    vocab = {word for word, count in word_counts.items() if count >= min_freq}
    vocab = sorted(vocab)
    vocab_dict = {word: idx for idx, word in enumerate(vocab)}
    return vocab_dict

# ===== 2. Transformasi BoW Biner =====
def bow_binary_transform(documents, vocabulary):
    n_docs = len(documents)
    vocab_size = len(vocabulary)
    bow_matrix = np.zeros((n_docs, vocab_size), dtype=int)

    for i, doc in enumerate(documents):
        for word in set(doc):
            if word in vocabulary:
                bow_matrix[i, vocabulary[word]] = 1
    return bow_matrix

# ===== 3. Ambil Nama Fitur dari Vocab =====
def get_feature_names(vocabulary):
    return list(vocabulary.keys())

### 1.2.2 TF-IDF

In [None]:
# 1. Ambil label
labels = df_final['CLASS']

# 2. Gabungkan token menjadi string jika masih dalam bentuk list
df_final['content_str'] = df_final['content_stemm'].apply(
    lambda tokens: ' '.join(tokens) if isinstance(tokens, list) else str(tokens)
)

# 3. Inisialisasi TF-IDF Vectorizer dengan parameter opsional
vectorizer = TfidfVectorizer(
    stop_words='english',  # Ganti dengan 'indonesian' jika punya stopword list sendiri
    max_features=1000,     # Batasi jumlah fitur jika terlalu besar
    min_df=2,              # Hanya kata yang muncul di ≥2 dokumen
    max_df=0.95            # Hilangkan kata terlalu umum
)

# 4. Fit dan transform teks
tfidf_matrix = vectorizer.fit_transform(df_final['content_str'])

print(f"Shape of the TF-IDF matrix: {tfidf_matrix.shape}")

# 5. Konversi ke DataFrame
df_tfidf = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=df_final.index
)

# 6. Gabungkan TF-IDF dengan label
df_tfidf_labeled = pd.concat([df_tfidf, labels], axis=1)

In [None]:
# Gabungkan token dalam setiap list menjadi satu string
df_final['content_stemm_str'] = df_final['content_stemm'].apply(lambda tokens: ' '.join(tokens) if isinstance(tokens, list) else str(tokens))

In [None]:
# Asumsikan df_final sudah ada dan memiliki kolom 'content_lemma_str' dan 'CLASS'

from sklearn.feature_extraction.text import TfidfVectorizer

# Membuat objek TfidfVectorizer
tf = TfidfVectorizer()

# Menghitung matriks TF-IDF
tfidf_matrix = tf.fit_transform(df_final['content_stemm_str'].values)

# Konversi ke dense matrix dan buat DataFrame
df_tfidf = pd.DataFrame(tfidf_matrix.todense(), 
                        columns=tf.get_feature_names_out(), 
                        index=df_final.index)

# Tambahkan kolom CLASS
df_tfidf['CLASS'] = df_copy['CLASS']

# Tampilkan hasil akhir
#print(df_tfidf.head())
print("Jumlah fitur (kata unik) TF-IDF:", len(tf.get_feature_names_out()))




In [None]:
print(df_tfidf_labeled.head())
print(df_tfidf_labeled.columns[-5:])  # Lihat kolom terakhir (biasanya 'CLASS' ada di sana)
print(df_tfidf_labeled['CLASS'].value_counts())  # Cek distribusi label

# 2. EDA

In [26]:
df = load_csv_data('../data/Youtube-Spam-Dataset.csv',
                   text_column='CONTENT',
                   label_column='CLASS',
                   encoding='utf-8')

Data berhasil diload: 1956 sampel
Distribusi kelas:
CLASS
1    1005
0     951
Name: count, dtype: int64


In [27]:
display_data_overview(df, target_column='CLASS')

DATASET OVERVIEW
Shape: 1956 rows × 6 columns
Memory Usage: 0.99 MB
Columns: COMMENT_ID, AUTHOR, DATE, CONTENT, VIDEO_NAME, CLASS

DATA TYPES
COMMENT_ID: object
AUTHOR: object
DATE: object
CONTENT: object
VIDEO_NAME: object
CLASS: int64

MISSING VALUES
COMMENT_ID: 0 (0.0%)
AUTHOR: 0 (0.0%)
DATE: 245 (12.53%)
CONTENT: 0 (0.0%)
VIDEO_NAME: 0 (0.0%)
CLASS: 0 (0.0%)

TARGET VARIABLE: CLASS
CLASS
1    1005
0     951
Name: count, dtype: int64

Class Distribution:
CLASS
1    51.38
0    48.62
Name: count, dtype: float64

SAMPLE DATA (First 5 rows)
                                    COMMENT_ID            AUTHOR  \
0  LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU         Julius NM   
1  LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A       adam riyati   
2  LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8  Evgeny Murashkin   
3          z13jhp0bxqncu512g22wvzkasxmvvzjaz04   ElNino Melendez   
4          z13fwbwp1oujthgqj04chlngpvzmtt3r3dw            GsMega   

                  DATE                       

In [28]:


# Atur pandas agar menampilkan semua baris
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Tampilkan seluruh kolom CONTENT
df[['CONTENT']]

Unnamed: 0,CONTENT
0,"Huh, anyway check out this you[tube] channel: kobyoshi02"
1,"Hey guys check out my new channel and our first vid THIS IS US THE MONKEYS!!! I'm the monkey in the white shirt,please leave a like comment and please subscribe!!!!"
2,just for test I have to say murdev.com
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿
4,watch?v=vtaRGgvGtWQ Check this out .﻿
5,"Hey, check out my new website!! This site is about kids stuff. kidsmediausa . com"
6,Subscribe to my channel ﻿
7,i turned it on mute as soon is i came on i just wanted to check the views...﻿
8,You should check my channel for Funny VIDEOS!!﻿
9,and u should.d check my channel and tell me what I should do next!﻿


In [29]:
df_content = df[['CONTENT', 'CLASS']]

## Data Awal

In [30]:
display_data_overview(df_content, target_column='CLASS')

DATASET OVERVIEW
Shape: 1956 rows × 2 columns
Memory Usage: 0.57 MB
Columns: CONTENT, CLASS

DATA TYPES
CONTENT: object
CLASS: int64

MISSING VALUES
CONTENT: 0 (0.0%)
CLASS: 0 (0.0%)

TARGET VARIABLE: CLASS
CLASS
1    1005
0     951
Name: count, dtype: int64

Class Distribution:
CLASS
1    51.38
0    48.62
Name: count, dtype: float64

SAMPLE DATA (First 5 rows)
                                                                                                                                                                  CONTENT  \
0                                                                                                                Huh, anyway check out this you[tube] channel: kobyoshi02   
1  Hey guys check out my new channel and our first vid THIS IS US THE  MONKEYS!!! I'm the monkey in the white shirt,please leave a like comment  and please subscribe!!!!   
2                                                                                                                    

# 3. Data Cleaning

In [31]:
df_copy = df_content.copy()

## 3.1 Convert Lower case

In [89]:
df_copy["content_lower"] = convert_to_lowercase(df_copy['CONTENT'])

In [90]:
df_copy.head()

Unnamed: 0,CONTENT,CLASS,content_lower,content_lower_rn,content_tokenized,content_normalized,content_stopwords,content_stemm,content_clean
0,"Huh, anyway check out this you[tube] channel: kobyoshi02",1,"huh, anyway check out this you[tube] channel: kobyoshi02",huh anyway check out this youtube channel kobyoshi02,"[huh, anyway, check, out, this, youtube, channel, kobyoshi02]","[huh, anyway, check, out, this, youtube, channel, kobyoshi02]","[huh, anyway, check, youtube, channel, kobyoshi02]","[huh, anyway, check, youtub, channel, kobyoshi02]","[huh, anyway, check, youtub, channel, kobyoshi02]"
1,"Hey guys check out my new channel and our first vid THIS IS US THE MONKEYS!!! I'm the monkey in the white shirt,please leave a like comment and please subscribe!!!!",1,"hey guys check out my new channel and our first vid this is us the monkeys!!! i'm the monkey in the white shirt,please leave a like comment and please subscribe!!!!",hey guys check out my new channel and our first vid this is us the monkeys i'm the monkey in the white shirt please leave a like comment and please subscribe,"[hey, guys, check, out, my, new, channel, and, our, first, vid, this, is, us, the, monkeys, i, 'm, the, monkey, in, the, white, shirt, please, leave, a, like, comment, and, please, subscribe]","[hello, guys, check, out, my, new, channel, and, our, first, video, this, is, us, the, monkeys, i, am, the, monkey, in, the, white, shirt, please, leave, a, like, comment, and, please, subscribe]","[hello, guys, check, new, channel, first, video, us, monkeys, monkey, white, shirt, please, leave, like, comment, please, subscribe]","[hello, guy, check, new, channel, first, video, us, monkey, monkey, white, shirt, pleas, leav, like, comment, pleas, subscrib]","[hello, guy, check, new, channel, first, video, us, monkey, monkey, white, shirt, pleas, leav, like, comment, pleas, subscrib]"
2,just for test I have to say murdev.com,1,just for test i have to say murdev.com,just for test i have to say url,"[just, for, test, i, have, to, say, url]","[just, for, test, i, have, to, say, url]","[test, say, url]","[test, say, url]","[test, say, url]"
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1,me shaking my sexy ass on my channel enjoy ^_^ ﻿,me shaking my sexy ass on my channel enjoy,"[me, shaking, my, sexy, ass, on, my, channel, enjoy]","[me, shaking, my, sexy, ass, on, my, channel, enjoy]","[shaking, sexy, ass, channel, enjoy]","[shake, sexi, ass, channel, enjoy]","[shake, sexi, ass, channel, enjoy]"
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1,watch?v=vtarggvgtwq check this out .﻿,url check this out,"[url, check, this, out]","[url, check, this, out]","[url, check]","[url, check]","[url, check]"


## 3.2 remove noise

In [98]:
df_copy["content_lower_rn"] = df_copy['content_lower'].apply(remove_noise1)

In [92]:
print(remove_noise("http://www.ebay.com/itm/171183229277?sspagename=strk:meselx:it&amp;_trksid=p3984.m1555.l2649 ï»¿"))

url


In [99]:
df_copy.head()

Unnamed: 0,CONTENT,CLASS,content_lower,content_lower_rn,content_tokenized,content_normalized,content_stopwords,content_stemm,content_clean
0,"Huh, anyway check out this you[tube] channel: kobyoshi02",1,"huh, anyway check out this you[tube] channel: kobyoshi02",huh anyway check out this youtube chanel kobyoshi02,"[huh, anyway, check, out, this, youtube, channel, kobyoshi02]","[huh, anyway, check, out, this, youtube, channel, kobyoshi02]","[huh, anyway, check, youtube, channel, kobyoshi02]","[huh, anyway, check, youtub, channel, kobyoshi02]","[huh, anyway, check, youtub, channel, kobyoshi02]"
1,"Hey guys check out my new channel and our first vid THIS IS US THE MONKEYS!!! I'm the monkey in the white shirt,please leave a like comment and please subscribe!!!!",1,"hey guys check out my new channel and our first vid this is us the monkeys!!! i'm the monkey in the white shirt,please leave a like comment and please subscribe!!!!",hey guys check out my new chanel and our first vid this is us the monkeys i'm the monkey in the white shirt please leave a like coment and please subscribe,"[hey, guys, check, out, my, new, channel, and, our, first, vid, this, is, us, the, monkeys, i, 'm, the, monkey, in, the, white, shirt, please, leave, a, like, comment, and, please, subscribe]","[hello, guys, check, out, my, new, channel, and, our, first, video, this, is, us, the, monkeys, i, am, the, monkey, in, the, white, shirt, please, leave, a, like, comment, and, please, subscribe]","[hello, guys, check, new, channel, first, video, us, monkeys, monkey, white, shirt, please, leave, like, comment, please, subscribe]","[hello, guy, check, new, channel, first, video, us, monkey, monkey, white, shirt, pleas, leav, like, comment, pleas, subscrib]","[hello, guy, check, new, channel, first, video, us, monkey, monkey, white, shirt, pleas, leav, like, comment, pleas, subscrib]"
2,just for test I have to say murdev.com,1,just for test i have to say murdev.com,just for test i have to say url,"[just, for, test, i, have, to, say, url]","[just, for, test, i, have, to, say, url]","[test, say, url]","[test, say, url]","[test, say, url]"
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1,me shaking my sexy ass on my channel enjoy ^_^ ﻿,me shaking my sexy as on my chanel enjoy ^_^,"[me, shaking, my, sexy, ass, on, my, channel, enjoy]","[me, shaking, my, sexy, ass, on, my, channel, enjoy]","[shaking, sexy, ass, channel, enjoy]","[shake, sexi, ass, channel, enjoy]","[shake, sexi, ass, channel, enjoy]"
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1,watch?v=vtarggvgtwq check this out .﻿,url check this out,"[url, check, this, out]","[url, check, this, out]","[url, check]","[url, check]","[url, check]"


## 3.3 Tokenisasi

In [103]:
df_copy["content_tokenized"] = df_copy['content_lower_rn'].apply(word_tokenization)

In [104]:
df_copy.head()

Unnamed: 0,CONTENT,CLASS,content_lower,content_lower_rn,content_tokenized,content_normalized,content_stopwords,content_stemm,content_clean
0,"Huh, anyway check out this you[tube] channel: kobyoshi02",1,"huh, anyway check out this you[tube] channel: kobyoshi02",huh anyway check out this youtube chanel kobyoshi02,"[huh, anyway, check, out, this, youtube, chanel, kobyoshi02]","[huh, anyway, check, out, this, youtube, channel, kobyoshi02]","[huh, anyway, check, youtube, channel, kobyoshi02]","[huh, anyway, check, youtub, channel, kobyoshi02]","[huh, anyway, check, youtub, channel, kobyoshi02]"
1,"Hey guys check out my new channel and our first vid THIS IS US THE MONKEYS!!! I'm the monkey in the white shirt,please leave a like comment and please subscribe!!!!",1,"hey guys check out my new channel and our first vid this is us the monkeys!!! i'm the monkey in the white shirt,please leave a like comment and please subscribe!!!!",hey guys check out my new chanel and our first vid this is us the monkeys i'm the monkey in the white shirt please leave a like coment and please subscribe,"[hey, guys, check, out, my, new, chanel, and, our, first, vid, this, is, us, the, monkeys, i, 'm, the, monkey, in, the, white, shirt, please, leave, a, like, coment, and, please, subscribe]","[hello, guys, check, out, my, new, channel, and, our, first, video, this, is, us, the, monkeys, i, am, the, monkey, in, the, white, shirt, please, leave, a, like, comment, and, please, subscribe]","[hello, guys, check, new, channel, first, video, us, monkeys, monkey, white, shirt, please, leave, like, comment, please, subscribe]","[hello, guy, check, new, channel, first, video, us, monkey, monkey, white, shirt, pleas, leav, like, comment, pleas, subscrib]","[hello, guy, check, new, channel, first, video, us, monkey, monkey, white, shirt, pleas, leav, like, comment, pleas, subscrib]"
2,just for test I have to say murdev.com,1,just for test i have to say murdev.com,just for test i have to say url,"[just, for, test, i, have, to, say, url]","[just, for, test, i, have, to, say, url]","[test, say, url]","[test, say, url]","[test, say, url]"
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1,me shaking my sexy ass on my channel enjoy ^_^ ﻿,me shaking my sexy as on my chanel enjoy ^_^,"[me, shaking, my, sexy, as, on, my, chanel, enjoy, ^_^]","[me, shaking, my, sexy, ass, on, my, channel, enjoy]","[shaking, sexy, ass, channel, enjoy]","[shake, sexi, ass, channel, enjoy]","[shake, sexi, ass, channel, enjoy]"
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1,watch?v=vtarggvgtwq check this out .﻿,url check this out,"[url, check, this, out]","[url, check, this, out]","[url, check]","[url, check]","[url, check]"


In [102]:
print((word_tokenization("me shaking my sexy as on my chanel enjoy ^_^")))

['me', 'shaking', 'my', 'sexy', 'as', 'on', 'my', 'chanel', 'enjoy', '^_^']


In [None]:
df_copy["content_filter"] = df_copy['content_tokenized'].apply(filter_tokens)

In [None]:
df_copy.head()

In [None]:
sys.exit(0)

## export data

In [None]:
df_copy.to_csv('../Hasil/Youtube-Spam-Dataset-processed-phase2.csv', index=False)

## hapus kolom

In [None]:
drop_columns = ['content_lower', 'content_lower_rn', 'normalized', 'content_tokenized', 'content_stopwords','content_filter', 'content_stemm', 'content_stemm_str', 'content_normalized', 'content_clean']
df_copy.drop(columns=drop_columns, inplace=True, errors='ignore')


## 3.5 Normalisasi

In [105]:
df_copy['content_normalized'] = df_copy['content_tokenized'].apply(normalize_tokens)

In [106]:
df_copy.head()

Unnamed: 0,CONTENT,CLASS,content_lower,content_lower_rn,content_tokenized,content_normalized,content_stopwords,content_stemm,content_clean
0,"Huh, anyway check out this you[tube] channel: kobyoshi02",1,"huh, anyway check out this you[tube] channel: kobyoshi02",huh anyway check out this youtube chanel kobyoshi02,"[huh, anyway, check, out, this, youtube, chanel, kobyoshi02]","[huh, anyway, check, out, this, youtube, channel, kobyoshi02]","[huh, anyway, check, youtube, channel, kobyoshi02]","[huh, anyway, check, youtub, channel, kobyoshi02]","[huh, anyway, check, youtub, channel, kobyoshi02]"
1,"Hey guys check out my new channel and our first vid THIS IS US THE MONKEYS!!! I'm the monkey in the white shirt,please leave a like comment and please subscribe!!!!",1,"hey guys check out my new channel and our first vid this is us the monkeys!!! i'm the monkey in the white shirt,please leave a like comment and please subscribe!!!!",hey guys check out my new chanel and our first vid this is us the monkeys i'm the monkey in the white shirt please leave a like coment and please subscribe,"[hey, guys, check, out, my, new, chanel, and, our, first, vid, this, is, us, the, monkeys, i, 'm, the, monkey, in, the, white, shirt, please, leave, a, like, coment, and, please, subscribe]","[hello, guys, check, out, my, new, channel, and, our, first, video, this, is, us, the, monkeys, i, am, the, monkey, in, the, white, shirt, please, leave, a, like, coment, and, please, subscribe]","[hello, guys, check, new, channel, first, video, us, monkeys, monkey, white, shirt, please, leave, like, comment, please, subscribe]","[hello, guy, check, new, channel, first, video, us, monkey, monkey, white, shirt, pleas, leav, like, comment, pleas, subscrib]","[hello, guy, check, new, channel, first, video, us, monkey, monkey, white, shirt, pleas, leav, like, comment, pleas, subscrib]"
2,just for test I have to say murdev.com,1,just for test i have to say murdev.com,just for test i have to say url,"[just, for, test, i, have, to, say, url]","[just, for, test, i, have, to, say, url]","[test, say, url]","[test, say, url]","[test, say, url]"
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1,me shaking my sexy ass on my channel enjoy ^_^ ﻿,me shaking my sexy as on my chanel enjoy ^_^,"[me, shaking, my, sexy, as, on, my, chanel, enjoy, ^_^]","[me, shaking, my, sexy, as, on, my, channel, enjoy, ^_^]","[shaking, sexy, ass, channel, enjoy]","[shake, sexi, ass, channel, enjoy]","[shake, sexi, ass, channel, enjoy]"
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1,watch?v=vtarggvgtwq check this out .﻿,url check this out,"[url, check, this, out]","[url, check, this, out]","[url, check]","[url, check]","[url, check]"


## 3.4 Stopwords Removal

In [107]:
df_copy['content_stopwords'] = df_copy['content_normalized'].apply(remove_stopwords)

In [108]:
df_copy.head()

Unnamed: 0,CONTENT,CLASS,content_lower,content_lower_rn,content_tokenized,content_normalized,content_stopwords,content_stemm,content_clean
0,"Huh, anyway check out this you[tube] channel: kobyoshi02",1,"huh, anyway check out this you[tube] channel: kobyoshi02",huh anyway check out this youtube chanel kobyoshi02,"[huh, anyway, check, out, this, youtube, chanel, kobyoshi02]","[huh, anyway, check, out, this, youtube, channel, kobyoshi02]","[huh, anyway, check, youtube, channel, kobyoshi02]","[huh, anyway, check, youtub, channel, kobyoshi02]","[huh, anyway, check, youtub, channel, kobyoshi02]"
1,"Hey guys check out my new channel and our first vid THIS IS US THE MONKEYS!!! I'm the monkey in the white shirt,please leave a like comment and please subscribe!!!!",1,"hey guys check out my new channel and our first vid this is us the monkeys!!! i'm the monkey in the white shirt,please leave a like comment and please subscribe!!!!",hey guys check out my new chanel and our first vid this is us the monkeys i'm the monkey in the white shirt please leave a like coment and please subscribe,"[hey, guys, check, out, my, new, chanel, and, our, first, vid, this, is, us, the, monkeys, i, 'm, the, monkey, in, the, white, shirt, please, leave, a, like, coment, and, please, subscribe]","[hello, guys, check, out, my, new, channel, and, our, first, video, this, is, us, the, monkeys, i, am, the, monkey, in, the, white, shirt, please, leave, a, like, coment, and, please, subscribe]","[hello, guys, check, new, channel, first, video, us, monkeys, monkey, white, shirt, please, leave, like, coment, please, subscribe]","[hello, guy, check, new, channel, first, video, us, monkey, monkey, white, shirt, pleas, leav, like, comment, pleas, subscrib]","[hello, guy, check, new, channel, first, video, us, monkey, monkey, white, shirt, pleas, leav, like, comment, pleas, subscrib]"
2,just for test I have to say murdev.com,1,just for test i have to say murdev.com,just for test i have to say url,"[just, for, test, i, have, to, say, url]","[just, for, test, i, have, to, say, url]","[test, say, url]","[test, say, url]","[test, say, url]"
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1,me shaking my sexy ass on my channel enjoy ^_^ ﻿,me shaking my sexy as on my chanel enjoy ^_^,"[me, shaking, my, sexy, as, on, my, chanel, enjoy, ^_^]","[me, shaking, my, sexy, as, on, my, channel, enjoy, ^_^]","[shaking, sexy, channel, enjoy, ^_^]","[shake, sexi, ass, channel, enjoy]","[shake, sexi, ass, channel, enjoy]"
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1,watch?v=vtarggvgtwq check this out .﻿,url check this out,"[url, check, this, out]","[url, check, this, out]","[url, check]","[url, check]","[url, check]"


## 3.5 Stemming

In [109]:
df_copy['content_stemm'] = df_copy['content_stopwords'].apply(stemmed_wrapper)

In [110]:
df_copy.head()

Unnamed: 0,CONTENT,CLASS,content_lower,content_lower_rn,content_tokenized,content_normalized,content_stopwords,content_stemm,content_clean
0,"Huh, anyway check out this you[tube] channel: kobyoshi02",1,"huh, anyway check out this you[tube] channel: kobyoshi02",huh anyway check out this youtube chanel kobyoshi02,"[huh, anyway, check, out, this, youtube, chanel, kobyoshi02]","[huh, anyway, check, out, this, youtube, channel, kobyoshi02]","[huh, anyway, check, youtube, channel, kobyoshi02]","[huh, anyway, check, youtub, channel, kobyoshi02]","[huh, anyway, check, youtub, channel, kobyoshi02]"
1,"Hey guys check out my new channel and our first vid THIS IS US THE MONKEYS!!! I'm the monkey in the white shirt,please leave a like comment and please subscribe!!!!",1,"hey guys check out my new channel and our first vid this is us the monkeys!!! i'm the monkey in the white shirt,please leave a like comment and please subscribe!!!!",hey guys check out my new chanel and our first vid this is us the monkeys i'm the monkey in the white shirt please leave a like coment and please subscribe,"[hey, guys, check, out, my, new, chanel, and, our, first, vid, this, is, us, the, monkeys, i, 'm, the, monkey, in, the, white, shirt, please, leave, a, like, coment, and, please, subscribe]","[hello, guys, check, out, my, new, channel, and, our, first, video, this, is, us, the, monkeys, i, am, the, monkey, in, the, white, shirt, please, leave, a, like, coment, and, please, subscribe]","[hello, guys, check, new, channel, first, video, us, monkeys, monkey, white, shirt, please, leave, like, coment, please, subscribe]","[hello, guy, check, new, channel, first, video, us, monkey, monkey, white, shirt, pleas, leav, like, coment, pleas, subscrib]","[hello, guy, check, new, channel, first, video, us, monkey, monkey, white, shirt, pleas, leav, like, comment, pleas, subscrib]"
2,just for test I have to say murdev.com,1,just for test i have to say murdev.com,just for test i have to say url,"[just, for, test, i, have, to, say, url]","[just, for, test, i, have, to, say, url]","[test, say, url]","[test, say, url]","[test, say, url]"
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1,me shaking my sexy ass on my channel enjoy ^_^ ﻿,me shaking my sexy as on my chanel enjoy ^_^,"[me, shaking, my, sexy, as, on, my, chanel, enjoy, ^_^]","[me, shaking, my, sexy, as, on, my, channel, enjoy, ^_^]","[shaking, sexy, channel, enjoy, ^_^]","[shake, sexi, channel, enjoy, ^_^]","[shake, sexi, ass, channel, enjoy]"
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1,watch?v=vtarggvgtwq check this out .﻿,url check this out,"[url, check, this, out]","[url, check, this, out]","[url, check]","[url, check]","[url, check]"


In [111]:
# Gabungkan token dalam setiap list menjadi satu string
df_copy['content_clean'] = df_copy['content_stemm']

In [112]:
df_copy.head()

Unnamed: 0,CONTENT,CLASS,content_lower,content_lower_rn,content_tokenized,content_normalized,content_stopwords,content_stemm,content_clean
0,"Huh, anyway check out this you[tube] channel: kobyoshi02",1,"huh, anyway check out this you[tube] channel: kobyoshi02",huh anyway check out this youtube chanel kobyoshi02,"[huh, anyway, check, out, this, youtube, chanel, kobyoshi02]","[huh, anyway, check, out, this, youtube, channel, kobyoshi02]","[huh, anyway, check, youtube, channel, kobyoshi02]","[huh, anyway, check, youtub, channel, kobyoshi02]","[huh, anyway, check, youtub, channel, kobyoshi02]"
1,"Hey guys check out my new channel and our first vid THIS IS US THE MONKEYS!!! I'm the monkey in the white shirt,please leave a like comment and please subscribe!!!!",1,"hey guys check out my new channel and our first vid this is us the monkeys!!! i'm the monkey in the white shirt,please leave a like comment and please subscribe!!!!",hey guys check out my new chanel and our first vid this is us the monkeys i'm the monkey in the white shirt please leave a like coment and please subscribe,"[hey, guys, check, out, my, new, chanel, and, our, first, vid, this, is, us, the, monkeys, i, 'm, the, monkey, in, the, white, shirt, please, leave, a, like, coment, and, please, subscribe]","[hello, guys, check, out, my, new, channel, and, our, first, video, this, is, us, the, monkeys, i, am, the, monkey, in, the, white, shirt, please, leave, a, like, coment, and, please, subscribe]","[hello, guys, check, new, channel, first, video, us, monkeys, monkey, white, shirt, please, leave, like, coment, please, subscribe]","[hello, guy, check, new, channel, first, video, us, monkey, monkey, white, shirt, pleas, leav, like, coment, pleas, subscrib]","[hello, guy, check, new, channel, first, video, us, monkey, monkey, white, shirt, pleas, leav, like, coment, pleas, subscrib]"
2,just for test I have to say murdev.com,1,just for test i have to say murdev.com,just for test i have to say url,"[just, for, test, i, have, to, say, url]","[just, for, test, i, have, to, say, url]","[test, say, url]","[test, say, url]","[test, say, url]"
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1,me shaking my sexy ass on my channel enjoy ^_^ ﻿,me shaking my sexy as on my chanel enjoy ^_^,"[me, shaking, my, sexy, as, on, my, chanel, enjoy, ^_^]","[me, shaking, my, sexy, as, on, my, channel, enjoy, ^_^]","[shaking, sexy, channel, enjoy, ^_^]","[shake, sexi, channel, enjoy, ^_^]","[shake, sexi, channel, enjoy, ^_^]"
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1,watch?v=vtarggvgtwq check this out .﻿,url check this out,"[url, check, this, out]","[url, check, this, out]","[url, check]","[url, check]","[url, check]"


## 3.6 Normalization

In [None]:
# df_copy['lemma_normalized'] = df_copy['content_lemma'].apply(lambda tokens: normalize_tokens(tokens, normalization_dict))


In [None]:
# df_copy.head()

# data checking

In [113]:
empty_comment = df_copy[df_copy['content_clean'].apply(lambda x: isinstance(x, list) and len(x) == 0)]


In [114]:
empty_comment

Unnamed: 0,CONTENT,CLASS,content_lower,content_lower_rn,content_tokenized,content_normalized,content_stopwords,content_stemm,content_clean
134,❤️ ❤️ ❤️ ❤️ ❤️❤️❤️❤️﻿,0,❤️ ❤️ ❤️ ❤️ ❤️❤️❤️❤️﻿,,[],[],[],[],[]
1220,❤❤❤❤❤❤❤﻿,0,❤❤❤❤❤❤❤﻿,,[],[],[],[],[]
1538,❤️❤️❤️﻿,0,❤️❤️❤️﻿,,[],[],[],[],[]
1549,:D﻿,0,:d﻿,d,[d],[d],[],[],[]


In [None]:
df_final = df_copy.drop(index=empty_comment.index)


In [None]:
df_final.head()

In [None]:
df_final.info()

In [None]:
count_class = df_final['CLASS'].value_counts()
count_class

# 4. Representasi Teks

In [None]:
sys.exit(0)

## 4.1 BoW Biner

In [None]:
# ===== 4. Eksekusi untuk Data =====
# Ambil kolom token
documents = df_final['content_clean'].tolist()

# Bangun vocab dan transformasi
vocab = build_vocabulary(documents, min_freq=1)
X_bin = bow_binary_transform(documents, vocab)

# Buat DataFrame dari hasil transformasi
df_bow_binary = pd.DataFrame(X_bin, columns=get_feature_names(vocab))

# Gabung dengan kolom lain jika perlu
# final_df = pd.concat([data_copy, df_bow_binary], axis=1)

# Cetak hasil
print(df_bow_binary.head())

# Simpan DataFrame ke file CSV
#df_bow_binary.to_csv('../data/Youtube-Spam-Dataset-BOW-Binary5.csv', index=False)

In [None]:
# Simpan DataFrame ke file CSV
df_bow_binary.to_csv('../Hasil/Youtube-Spam-Dataset-BOW-Binary1.csv', index=False)

## 4.2 TF-IDF

In [None]:
print(df_tfidf)

In [None]:
df_tfidf.to_csv("../Hasil/Youtube-Spam-Dataset-TFIDF.csv", index=False)

In [None]:
import ast

# Ubah string literal list menjadi list Python
def parse_string_list(s):
    try:
        return ast.literal_eval(s)
    except:
        return []

# Terapkan ke seluruh kolom
data_list = df_final['content_stemm'].astype(str).apply(parse_string_list).tolist()


print(data_list[:3])


In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from typing import List, Dict

class TFIDFTableGenerator:
    """
    Generator untuk membuat tabel TF-IDF dalam format yang diminta
    """
    
    def __init__(self):
        self.vocabulary = set()
        self.total_documents = 0
        self.document_frequencies = {}
        self.term_frequencies = {}
        self.document_lengths = {}
    
    def calculate_statistics(self, data_list: List[List[str]]) -> Dict:
        """
        Menghitung statistik yang diperlukan untuk tabel TF-IDF
        """
        self.total_documents = len(data_list)
        
        # Hitung frekuensi kata per dokumen dan document frequency
        for doc_idx, tokens in enumerate(data_list):
            clean_tokens = [str(token) for token in tokens if token]
            
            # Hitung frekuensi kata dalam dokumen ini
            token_count = Counter(clean_tokens)
            self.term_frequencies[doc_idx] = token_count
            
            # Hitung panjang dokumen
            self.document_lengths[doc_idx] = len(clean_tokens)
            
            # Update vocabulary
            self.vocabulary.update(clean_tokens)
            
            # Hitung document frequency
            unique_tokens = set(clean_tokens)
            for token in unique_tokens:
                self.document_frequencies[token] = self.document_frequencies.get(token, 0) + 1
        
        return {
            'total_documents': self.total_documents,
            'vocabulary': sorted(list(self.vocabulary)),
            'document_frequencies': self.document_frequencies,
            'term_frequencies': self.term_frequencies,
            'document_lengths': self.document_lengths
        }
    
    def create_tfidf_table(self, data_list: List[List[str]], 
                          doc_labels: List[str] = None) -> pd.DataFrame:
        """
        Membuat tabel TF-IDF dalam format yang diminta
        """
        # Hitung statistik
        stats = self.calculate_statistics(data_list)
        
        # Siapkan data untuk tabel
        table_data = []
        
        # Iterasi setiap dokumen
        for doc_idx, tokens in enumerate(data_list):
            clean_tokens = [str(token) for token in tokens if token]
            
            # Label dokumen
            if doc_labels:
                doc_label = doc_labels[doc_idx]
            else:
                doc_label = f"dok{doc_idx + 1}"
            
            # Hitung frekuensi kata dalam dokumen ini
            token_count = Counter(clean_tokens)
            doc_length = len(clean_tokens)
            
            # Buat row untuk setiap kata dalam dokumen
            for word in sorted(token_count.keys()):
                freq = token_count[word]
                
                # Hitung TF (Term Frequency)
                tf = freq / doc_length if doc_length > 0 else 0
                
                # Hitung DF (Document Frequency)
                df = self.document_frequencies[word]
                
                # Hitung IDF (Inverse Document Frequency)
                idf = np.log(self.total_documents / df) if df > 0 else 0
                
                table_data.append({
                    'dokumen': doc_label,
                    'kata': word,
                    'freq': freq,
                    'jumlah_kata': doc_length,
                    'tf': tf,
                    'df': df,
                    'total_dokumen': self.total_documents,
                    'idf': idf
                })
        
        # Buat DataFrame
        df = pd.DataFrame(table_data)
        
        # Reorder kolom sesuai format yang diminta
        df = df[['dokumen', 'kata', 'freq', 'jumlah_kata', 'tf', 'df', 'total_dokumen', 'idf']]
        
        return df
    
    def create_formatted_table(self, data_list: List[List[str]], 
                              doc_labels: List[str] = None) -> pd.DataFrame:
        """
        Membuat tabel dengan format yang lebih mirip dengan gambar
        (dokumen hanya muncul di row pertama setiap dokumen)
        """
        # Buat tabel dasar
        basic_table = self.create_tfidf_table(data_list, doc_labels)
        
        # Format tabel agar dokumen hanya muncul di row pertama
        formatted_data = []
        current_doc = None
        
        for _, row in basic_table.iterrows():
            if row['dokumen'] != current_doc:
                # Row pertama dokumen baru
                formatted_data.append({
                    'A': row['dokumen'],
                    'B': row['kata'],
                    'C': row['freq'],
                    'D': row['jumlah_kata'],
                    'E': row['tf'],
                    'F': row['df'],
                    'G': row['total_dokumen'],
                    'H': row['idf']
                })
                current_doc = row['dokumen']
            else:
                # Row kata selanjutnya dalam dokumen yang sama
                formatted_data.append({
                    'A': '',  # Kosong untuk dokumen
                    'B': row['kata'],
                    'C': row['freq'],
                    'D': row['jumlah_kata'],
                    'E': row['tf'],
                    'F': row['df'],
                    'G': row['total_dokumen'],
                    'H': row['idf']
                })
        
        # Buat DataFrame dengan format yang diminta
        formatted_df = pd.DataFrame(formatted_data)
        
        return formatted_df
    
    def create_excel_format(self, data_list: List[List[str]], 
                           doc_labels: List[str] = None) -> pd.DataFrame:
        """
        Membuat tabel dalam format Excel dengan header yang sesuai
        """
        # Buat tabel dasar
        basic_table = self.create_tfidf_table(data_list, doc_labels)
        
        # Buat tabel dengan format Excel
        excel_data = []
        
        # Tambahkan header row
        excel_data.append({
            'A': '',
            'B': 'kata',
            'C': 'freq',
            'D': 'jumlah kata',
            'E': 'tf',
            'F': 'df',
            'G': 'total dokumen',
            'H': 'idf'
        })
        
        # Tambahkan data
        current_doc = None
        for _, row in basic_table.iterrows():
            if row['dokumen'] != current_doc:
                # Row pertama dokumen baru
                excel_data.append({
                    'A': row['dokumen'],
                    'B': row['kata'],
                    'C': row['freq'],
                    'D': row['jumlah_kata'],
                    'E': round(row['tf'], 4),
                    'F': row['df'],
                    'G': row['total_dokumen'],
                    'H': round(row['idf'], 4)
                })
                current_doc = row['dokumen']
            else:
                # Row kata selanjutnya dalam dokumen yang sama
                excel_data.append({
                    'A': '',
                    'B': row['kata'],
                    'C': row['freq'],
                    'D': row['jumlah_kata'],
                    'E': round(row['tf'], 4),
                    'F': row['df'],
                    'G': row['total_dokumen'],
                    'H': round(row['idf'], 4)
                })
        
        # Buat DataFrame
        excel_df = pd.DataFrame(excel_data)
        
        return excel_df
    
    def save_to_excel(self, data_list: List[List[str]], 
                     filename: str = "tfidf_table.xlsx",
                     doc_labels: List[str] = None):
        """
        Menyimpan tabel ke file Excel
        """
        excel_df = self.create_excel_format(data_list, doc_labels)
        excel_df.to_excel(filename, index=False)
        print(f"Tabel berhasil disimpan ke {filename}")
    
    def get_summary_statistics(self, data_list: List[List[str]]) -> pd.DataFrame:
        """
        Mendapatkan statistik summary dari data
        """
        stats = self.calculate_statistics(data_list)
        
        summary_data = []
        for word in sorted(stats['vocabulary']):
            df = stats['document_frequencies'][word]
            idf = np.log(stats['total_documents'] / df) if df > 0 else 0
            
            summary_data.append({
                'kata': word,
                'document_frequency': df,
                'idf': idf
            })
        
        summary_df = pd.DataFrame(summary_data)
        summary_df = summary_df.sort_values('idf', ascending=False)
        
        return summary_df

# Contoh penggunaan untuk data penelitian skripsi
if __name__ == "__main__":
    
    # Simulasi data_list yang sudah di tokenisasi
    data_list = data_list
    # Inisialisasi generator
    generator = TFIDFTableGenerator()
    
    print("=== TABEL TF-IDF UNTUK PENELITIAN SKRIPSI ===")
    
    # 1. Tabel dasar
    print("\n1. TABEL DASAR:")
    basic_table = generator.create_tfidf_table(data_list)
    print(basic_table)
    
    # 2. Tabel dengan format Excel
    print("\n2. TABEL FORMAT EXCEL:")
    excel_table = generator.create_excel_format(data_list)
    print(excel_table)
    
    # 3. Tabel dengan format yang diminta (dokumen hanya muncul di row pertama)
    print("\n3. TABEL FORMAT YANG DIMINTA:")
    formatted_table = generator.create_formatted_table(data_list)
    print(formatted_table)
    
    # 4. Summary statistics
    print("\n4. SUMMARY STATISTICS:")
    summary = generator.get_summary_statistics(data_list)
    print(summary)
    
    print("\n=== CARA MENGGUNAKAN UNTUK DATA ANDA ===")
    print("# Untuk data_list Anda yang sudah di tokenisasi:")
    print("generator = TFIDFTableGenerator()")
    print("tabel = generator.create_tfidf_table(data_list)")
    print("formatted_tabel = generator.create_formatted_table(data_list)")
    print("generator.save_to_excel(data_list, 'hasil_tfidf.xlsx')")
    
    print("\n=== INTERPRETASI KOLOM ===")
    print("A: Dokumen ID")
    print("B: Kata/Term")
    print("C: Frekuensi kata dalam dokumen")
    print("D: Total jumlah kata dalam dokumen")
    print("E: TF (Term Frequency) = freq / jumlah_kata")
    print("F: DF (Document Frequency) = jumlah dokumen yang mengandung kata")
    print("G: Total dokumen dalam corpus")
    print("H: IDF (Inverse Document Frequency) = log(total_dokumen / df)")
    
    print("\n=== SIAP UNTUK PENELITIAN SKRIPSI ===")
    print("✓ Tabel TF-IDF dalam format yang diminta")
    print("✓ Dapat disimpan ke Excel untuk analisis")
    print("✓ Siap untuk implementasi Manhattan dan Jaccard distance")
    print("✓ Dapat digunakan untuk evaluasi akurasi KNN")

In [None]:
basic_table.to_csv('../data/Youtube-Spam-Dataset-frekuensikata.csv', index=False)

In [None]:
basic_table