<a href="https://colab.research.google.com/github/221230003-coder/221230003-pengantar-ML/blob/main/221230003_Pengantar_ML_week_02_latihan_praktikum_2_numpy_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import numpy as np

# TODO 1: Normalisasi Z-score
def z_score_normalization(data):
    """
    Standardize data with the Z-score formula ``(x - mean) / std``.

    The mean and standard deviation are computed along axis 0, so each
    column of a 2D array is normalized independently and a 1D array is
    normalized as a whole. Where the standard deviation is exactly zero
    the divisor is replaced by 1, so constant features map to zeros
    instead of producing NaN/inf.

    Args:
        data: NumPy array of numeric values.

    Returns:
        A new NumPy array with the same shape as ``data``.

    Raises:
        TypeError: If ``data`` is not a ``np.ndarray``.
    """
    if not isinstance(data, np.ndarray):
        raise TypeError("Input harus berupa NumPy array")

    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)

    # For 1D input np.std returns a NumPy scalar, which does not support
    # item assignment; np.where works for scalars and arrays alike and
    # avoids division by zero.
    safe_std = np.where(std == 0, 1, std)

    return (data - mean) / safe_std


# TODO 2: Handle outliers (clip values beyond 3 std)
def handle_outliers(data, std_threshold=3):
    """
    Tame outliers by clipping values to a band around the mean.

    The mean and standard deviation are taken along axis 0. Any value
    outside ``mean ± std_threshold * std`` is replaced by the nearest
    bound of that band; values inside it are returned unchanged.

    Args:
        data: NumPy array of numeric values.
        std_threshold: Half-width of the allowed band, in standard
            deviations (default 3).

    Returns:
        A NumPy array with the same shape as ``data``, clipped to the band.

    Raises:
        TypeError: If ``data`` is not a ``np.ndarray``.
    """
    if isinstance(data, np.ndarray):
        center = data.mean(axis=0)
        spread = data.std(axis=0)
        margin = std_threshold * spread
        return data.clip(center - margin, center + margin)

    raise TypeError("Input harus berupa NumPy array")


# TODO 3: One-hot encoding untuk label kategorikal
def one_hot_encoding(labels):
    """
    Convert integer class labels into a one-hot encoded matrix.

    The number of classes is ``max(labels) + 1``, so row ``i`` of the
    result has a single 1 in column ``labels[i]`` and 0 elsewhere.

    Args:
        labels: 1D NumPy array of non-negative integer labels,
            e.g. ``[0, 1, 2, 0]``.

    Returns:
        2D integer NumPy array of shape ``(len(labels), max(labels) + 1)``.
        An empty input yields an empty ``(0, 0)`` array.

    Raises:
        TypeError: If ``labels`` is not a ``np.ndarray`` or is a non-empty
            array whose dtype is not an integer type.
        ValueError: If ``labels`` is not 1D or contains negative values.
    """
    if not isinstance(labels, np.ndarray):
        raise TypeError("Labels harus berupa NumPy array")
    if labels.ndim != 1:
        raise ValueError("Labels harus berupa array 1D")

    n_samples = labels.shape[0]
    if n_samples == 0:
        # np.max raises on empty input; there are no rows and no classes.
        return np.zeros((0, 0), dtype=int)

    if not np.issubdtype(labels.dtype, np.integer):
        raise TypeError("Labels harus bertipe integer")
    if labels.min() < 0:
        # Negative values would silently wrap around via negative indexing
        # and mark the wrong column.
        raise ValueError("Labels tidak boleh negatif")

    # Convert to a Python int before adding 1 so narrow dtypes (e.g. uint8
    # with a maximum label of 255) cannot overflow.
    n_classes = int(labels.max()) + 1
    one_hot = np.zeros((n_samples, n_classes), dtype=int)
    one_hot[np.arange(n_samples), labels] = 1

    return one_hot


# TODO 4: Train-test split manual
def train_test_split_numpy(X, y, test_size=0.2, random_state=None):
    """
    Split a dataset into train and test subsets without scikit-learn.

    Sample indices are shuffled, the first ``int(n_samples * test_size)``
    of them form the test set and the remainder form the train set. The
    same indices are applied to ``X`` and ``y`` so rows stay aligned.

    Args:
        X: Feature array (np.ndarray); samples are indexed along axis 0.
        y: Label array (np.ndarray) with one entry per sample in ``X``.
        test_size: Fraction of samples assigned to the test set, in
            ``[0, 1]`` (default 0.2).
        random_state: Optional seed for a reproducible shuffle. When
            ``None``, the global NumPy random state is used.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If ``X`` and ``y`` have different numbers of samples,
            or ``test_size`` is outside ``[0, 1]``.
    """
    n_samples = X.shape[0]
    if len(y) != n_samples:
        raise ValueError("X dan y harus memiliki jumlah sampel yang sama")
    if not 0 <= test_size <= 1:
        raise ValueError("test_size harus berada di antara 0 dan 1")

    # A local RandomState yields the same permutation as seeding the global
    # generator with the same value, but leaves the caller's global random
    # state untouched.
    if random_state is None:
        rng = np.random
    else:
        rng = np.random.RandomState(random_state)

    indices = np.arange(n_samples)
    rng.shuffle(indices)

    n_test = int(n_samples * test_size)
    test_idx = indices[:n_test]
    train_idx = indices[n_test:]

    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

# Simulated dataset: 100 samples x 5 features, standard normal draws
# scaled by 10 and shifted by 5
np.random.seed(42)
X = 5 + 10 * np.random.randn(100, 5)

# Step 1: Z-score normalization
X_normalized = z_score_normalization(X)

# Step 2: clip outliers in the normalized data (default threshold)
X_cleaned = handle_outliers(X_normalized)

# Step 3: one-hot encode a small example label vector
labels = np.array([0, 1, 2, 0, 1, 2, 0])
one_hot_labels = one_hot_encoding(labels)

# Step 4: train/test split using random integer labels drawn from {0, 1, 2}
X_train, X_test, y_train, y_test = train_test_split_numpy(
    X,
    np.random.randint(0, 3, size=100),
    random_state=42,
)

# Sanity checks on the normalized data
assert X.shape == X_normalized.shape, "Shape harus sama"
assert np.isclose(X_normalized.mean(), 0, atol=1e-10), "Mean ~0 setelah normalisasi"
assert np.isclose(X_normalized.std(), 1, atol=1e-10), "Std ~1 setelah normalisasi"

# Report the results (first five rows of each array)
print("NumPy operations completed")
print("X_normalized:\n", X_normalized[:5])
print("X_cleaned:\n", X_cleaned[:5])
print("One-hot labels:\n", one_hot_labels)



NumPy operations completed
X_normalized:
 [[ 0.604418   -0.21979528  0.75746006  1.461092   -0.18919425]
 [-0.21141045  1.53420502  0.87977344 -0.62585547  0.53845484]
 [-0.46735006 -0.55422449  0.34303561 -2.13809665 -1.58578836]
 [-0.57771567 -1.1129603   0.41687036 -1.08519209 -1.2929218 ]
 [ 1.68601234 -0.30916828  0.16486193 -1.62640854 -0.47982667]]
X_cleaned:
 [[ 0.604418   -0.21979528  0.75746006  1.461092   -0.18919425]
 [-0.21141045  1.53420502  0.87977344 -0.62585547  0.53845484]
 [-0.46735006 -0.55422449  0.34303561 -2.13809665 -1.58578836]
 [-0.57771567 -1.1129603   0.41687036 -1.08519209 -1.2929218 ]
 [ 1.68601234 -0.30916828  0.16486193 -1.62640854 -0.47982667]]
One-hot labels:
 [[1 0 0]
 [0 1 0]
 [0 0 1]
 [1 0 0]
 [0 1 0]
 [0 0 1]
 [1 0 0]]
