# 1. Configuration


## 1.1 Import Libraries

In [12]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import re
import html
import string
import unicodedata
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split
from typing import Union, Tuple, List, Dict, Any

## 1.2 Define Functions

### 1.2.1 Load Data

In [3]:
def load_csv_data(file_path: str, text_column: str, label_column: str,
                  encoding: str = 'utf-8') -> Union[pd.DataFrame, None]:
    """
    Load a CSV file and keep only the rows that have both a text and a label.

    After loading, a short report (row count and label distribution) is
    printed to stdout.

    Args:
        file_path: Path to the CSV file.
        text_column: Name of the column that holds the text.
        label_column: Name of the column that holds the label/class.
        encoding: File encoding passed to ``pd.read_csv`` (default: utf-8).

    Returns:
        The loaded DataFrame with rows missing ``text_column`` or
        ``label_column`` removed (the original index is kept), or ``None``
        when loading or validation fails. Failures are printed, not raised,
        so callers must check for ``None``.
    """
    try:
        data = pd.read_csv(file_path, encoding=encoding)

        # Validate inside the try block so a missing column is reported the
        # same way as any other load error (printed, then None is returned).
        for required in (text_column, label_column):
            if required not in data.columns:
                raise ValueError(f"Kolom '{required}' tidak ditemukan")

        # Drop rows where either the text or the label is missing.
        data = data.dropna(subset=[text_column, label_column])

        print(f"Data berhasil diload: {len(data)} sampel")
        print("Distribusi kelas:")
        print(data[label_column].value_counts())

        return data

    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure with
        # None instead of propagating the exception.
        print(f"Error saat load CSV: {str(e)}")
        return None

### 1.2.2 Preview Data

In [13]:
def preview_data(data: pd.DataFrame, n_samples: int = 5, include_stats: bool = True) -> Dict[str, Any]:
    """
    Collect a structural summary of a DataFrame into a plain dictionary.

    Args:
        data: DataFrame to summarise.
        n_samples: Number of rows taken from the head and from the tail.
        include_stats: When True, also add per-dtype statistics.

    Returns:
        A dict with these keys:
            dataset_shape, column_names, data_types,
            memory_usage (deep memory usage formatted as "<x.xx> MB"),
            head_samples, tail_samples,
            missing_values (null count per column),
            missing_percentage (null share per column, in percent, 2 decimals).
        When ``include_stats`` is True it may additionally contain:
            numeric_statistics: ``describe()`` of the numeric columns
                (only if there is at least one),
            categorical_info: for each object/category column, its number of
                unique values and the leading ``value_counts()`` entries
                (only if there is at least one such column).
    """
    null_counts = data.isnull().sum()
    total_bytes = data.memory_usage(deep=True).sum()

    summary: Dict[str, Any] = {}
    summary['dataset_shape'] = data.shape
    summary['column_names'] = list(data.columns)
    summary['data_types'] = data.dtypes.to_dict()
    summary['memory_usage'] = f"{total_bytes / 1024**2:.2f} MB"
    summary['head_samples'] = data.head(n_samples)
    summary['tail_samples'] = data.tail(n_samples)
    summary['missing_values'] = null_counts.to_dict()
    summary['missing_percentage'] = (null_counts / len(data) * 100).round(2).to_dict()

    if not include_stats:
        return summary

    numeric_columns = data.select_dtypes(include=[np.number]).columns
    text_like_columns = data.select_dtypes(include=['object', 'category']).columns

    if len(numeric_columns) > 0:
        summary['numeric_statistics'] = data[numeric_columns].describe()

    if len(text_like_columns) > 0:
        summary['categorical_info'] = {
            name: {
                'unique_count': data[name].nunique(),
                'top_values': data[name].value_counts().head().to_dict(),
            }
            for name in text_like_columns
        }

    return summary

In [15]:
def display_data_overview(data: pd.DataFrame, target_column: str = None):
    """
    Print a plain-text report describing a DataFrame.

    The report is built from ``preview_data(data)`` and written to stdout in
    this order: shape / memory usage / column list, per-column dtypes,
    per-column missing counts with percentages, the target distribution
    (counts and percentages), the head sample, and a summary of the
    categorical columns when ``preview_data`` reported any.

    Args:
        data: DataFrame to describe.
        target_column: Optional label column. The target section is skipped
            when this is falsy or is not a column of ``data``.

    Returns:
        None; everything is printed.
    """
    divider = "=" * 60
    summary = preview_data(data)

    def section(title: str, spaced: bool = True) -> None:
        # Every banner except the very first one is preceded by a blank line.
        print("\n" + divider if spaced else divider)
        print(title)
        print(divider)

    n_rows, n_cols = summary['dataset_shape'][0], summary['dataset_shape'][1]
    section("DATASET OVERVIEW", spaced=False)
    print(f"Shape: {n_rows} rows × {n_cols} columns")
    print(f"Memory Usage: {summary['memory_usage']}")
    print(f"Columns: {', '.join(summary['column_names'])}")

    section("DATA TYPES")
    for name, dtype in summary['data_types'].items():
        print(f"{name}: {dtype}")

    section("MISSING VALUES")
    missing_share = summary['missing_percentage']
    for name, missing in summary['missing_values'].items():
        print(f"{name}: {missing} ({missing_share[name]}%)")

    has_target = bool(target_column) and target_column in data.columns
    if has_target:
        section(f"TARGET VARIABLE: {target_column}")
        print(data[target_column].value_counts())
        print("\nClass Distribution:")
        print((data[target_column].value_counts() / len(data) * 100).round(2))

    section("SAMPLE DATA (First 5 rows)")
    print(summary['head_samples'])

    categorical = summary.get('categorical_info')
    if categorical is not None:
        section("CATEGORICAL COLUMNS SUMMARY")
        for name, details in categorical.items():
            print(f"\n{name}:")
            print(f"  Unique values: {details['unique_count']}")
            print(f"  Top values: {details['top_values']}")

In [17]:
# NOTE(review): within this notebook `df` is only assigned by the
# `df = load_csv_data(...)` cell that appears further down, so on a fresh kernel
# (Restart & Run All) this call raises NameError. The load cell should be moved
# above this cell.
display_data_overview(df, target_column='CLASS')

DATASET OVERVIEW
Shape: 1956 rows × 6 columns
Memory Usage: 1.01 MB
Columns: COMMENT_ID, AUTHOR, DATE, CONTENT, VIDEO_NAME, CLASS

DATA TYPES
COMMENT_ID: object
AUTHOR: object
DATE: object
CONTENT: object
VIDEO_NAME: object
CLASS: int64

MISSING VALUES
COMMENT_ID: 0 (0.0%)
AUTHOR: 0 (0.0%)
DATE: 245 (12.53%)
CONTENT: 0 (0.0%)
VIDEO_NAME: 0 (0.0%)
CLASS: 0 (0.0%)

TARGET VARIABLE: CLASS
CLASS
1    1005
0     951
Name: count, dtype: int64

Class Distribution:
CLASS
1    51.38
0    48.62
Name: count, dtype: float64

SAMPLE DATA (First 5 rows)
                                    COMMENT_ID            AUTHOR  \
0  LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU         Julius NM   
1  LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A       adam riyati   
2  LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8  Evgeny Murashkin   
3          z13jhp0bxqncu512g22wvzkasxmvvzjaz04   ElNino Melendez   
4          z13fwbwp1oujthgqj04chlngpvzmtt3r3dw            GsMega   

                  DATE                       

In [7]:
# Load the spam dataset: CONTENT is used as the text column and CLASS as the label.
# NOTE(review): load_csv_data returns None when loading fails, so `df` may be
# None after this cell; later cells use it without checking.
# NOTE(review): with latin-1 decoding the lowercase preview output later in this
# notebook shows "ï»¿" inside CONTENT, which looks like UTF-8 byte-order-mark
# bytes read as latin-1 — confirm the file's real encoding.
df = load_csv_data('../data/Youtube-Spam-Dataset.csv',
                   text_column='CONTENT',
                   label_column='CLASS',
                   encoding='latin-1')

Data berhasil diload: 1956 sampel
Distribusi kelas:
CLASS
1    1005
0     951
Name: count, dtype: int64


### 1.2.3 Convert to Lowercase

In [8]:
def convert_to_lowercase(data: Union[str, pd.DataFrame, pd.Series, List[str], np.ndarray], 
                       column: str = None) -> Union[str, pd.DataFrame, pd.Series, List[str], np.ndarray]:
    """
    Lowercase text held in one of several container types.

    Args:
        data: A single string, a DataFrame, a Series, a list, or a numpy array.
        column: Name of the DataFrame column to lowercase. Required when
            ``data`` is a DataFrame; ignored for every other input type.

    Returns:
        The same kind of container as the input:
            str       -> lowercased string
            DataFrame -> a copy in which only ``column`` has been lowercased
            Series    -> a new lowercased Series
            list      -> a new list of lowercased strings
            ndarray   -> a new array built from the lowercased elements
        Apart from the plain ``str`` case, every value is converted with
        ``str`` / ``astype(str)`` before lowercasing, so non-string entries
        come back as their string form.

    Raises:
        ValueError: ``data`` is a DataFrame and ``column`` is None.
        TypeError: ``data`` is none of the supported types.
    """
    if isinstance(data, str):
        return data.lower()

    if isinstance(data, pd.DataFrame):
        if column is None:
            raise ValueError("Parameter 'column' harus diisi untuk DataFrame")
        # Work on a copy so the caller's frame is left untouched.
        lowered_frame = data.copy()
        lowered_frame[column] = lowered_frame[column].astype(str).str.lower()
        return lowered_frame

    if isinstance(data, pd.Series):
        return data.astype(str).str.lower()

    if isinstance(data, (list, np.ndarray)):
        lowered_items = [str(item).lower() for item in data]
        # Hand back the same container kind that was passed in.
        return lowered_items if isinstance(data, list) else np.array(lowered_items)

    raise TypeError(f"Tipe data {type(data)} tidak didukung")


In [10]:
# Lowercase the CONTENT column. convert_to_lowercase works on a copy of the
# frame, so `df` itself is left unchanged.
converted_df = convert_to_lowercase(df, column='CONTENT')
# Show the first rows as a quick check of the result.
converted_df['CONTENT'].head()

0    huh, anyway check out this you[tube] channel: ...
1    hey guys check out my new channel and our firs...
2               just for test i have to say murdev.com
3    me shaking my sexy ass on my channel enjoy ^_^...
4            watch?v=vtarggvgtwq   check this out .ï»¿
Name: CONTENT, dtype: object

# 2. Data Cleaning