# Tugas Praktikum - Wisconsin Breast Cancer

**Deskripsi Tugas**:

Pada tugas pratikum ini Anda akan menggunakan data "Wisconsin Breast Cancer". Data tersebut terdiri dari 569 data yang digunakan untuk mendiagnonis jenis kanker Malignant (M) dan Benign (B). Tugas Anda adalah
1. Pisahkan antara variabel yang dapat digunakan dan variabel yang tidak dapat digunakan.
2. Lakukan proses encoding pada kolom "diagnosis".
3. Lakukan proses standardisasi pada semua kolom yang memiliki nilai numerik.

## Langkah 0 -Import & Load Data

In [2]:
# Cell A: Imports & load data
import pandas as pd
import numpy as np
from google.colab import drive
from sklearn.preprocessing import StandardScaler

drive.mount('/content/drive')

csv_path = "/content/drive/MyDrive/Machine Learning - 2025/Jobsheet-02/wbc.csv"
df = pd.read_csv(csv_path)

df.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


## Langkah 1 - Memisahkan variabel yang dapat / tidak dapat digunakan

In [3]:
cols = df.columns.tolist()

# detect id-like and useless columns (unnamed all-NaN, constant columns)
id_like = [c for c in cols if c.lower()=="id" or c.lower().startswith("id") or c.lower().endswith("_id")]
unnamed_all_nan = [c for c in cols if c.lower().startswith("unnamed") and df[c].isna().all()]
constant_cols = [c for c in cols if df[c].nunique(dropna=False) <= 1]

non_usable = list(dict.fromkeys(id_like + unnamed_all_nan + constant_cols))
non_usable = [c for c in non_usable if c.lower() != "diagnosis"]

potential_features = [c for c in cols if c not in non_usable and c.lower() != "diagnosis"]
usable_numeric = [c for c in potential_features if pd.api.types.is_numeric_dtype(df[c])]

print("Non-usable columns:", non_usable)
print("Potential feature columns (excl. diagnosis):", potential_features)
print("Usable numeric feature columns:", usable_numeric)


Non-usable columns: ['id', 'Unnamed: 32']
Potential feature columns (excl. diagnosis): ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst']
Usable numeric feature columns: ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal

## Langkah 2 - Encode kolom diagnosis (M→1, B→0)

In [4]:
# Jika kolom bernama 'Diagnosis' ubah namanya terlebih dahulu
if 'diagnosis' not in df.columns and 'Diagnosis' in df.columns:
    df.rename(columns={'Diagnosis':'diagnosis'}, inplace=True)

if 'diagnosis' in df.columns:
    mapping = {'M':1, 'B':0}
    # jika dataset berisi 'malignant'/'benign' mapping_alt bisa dipakai
    mapping_alt = {'malignant':1, 'benign':0, 'Malignant':1, 'Benign':0}
    if set(df['diagnosis'].astype(str).unique()).issubset(set(mapping.keys())):
        df['diagnosis_encoded'] = df['diagnosis'].map(mapping)
    elif set(df['diagnosis'].astype(str).str.lower().unique()).issubset(set([k.lower() for k in mapping_alt.keys()])):
        df['diagnosis_encoded'] = df['diagnosis'].map(lambda x: mapping_alt.get(x, np.nan))
    else:
        # fallback: label encoding otomatis
        from sklearn.preprocessing import LabelEncoder
        le = LabelEncoder()
        df['diagnosis_encoded'] = le.fit_transform(df['diagnosis'].astype(str))
        print("LabelEncoder classes:", list(le.classes_))

    print(df['diagnosis_encoded'].value_counts())
else:
    raise ValueError("Kolom 'diagnosis' tidak ditemukan.")


diagnosis_encoded
0    357
1    212
Name: count, dtype: int64


## Langkah -3 Standardisasi semua kolom numerik fitur

In [9]:
scaler = StandardScaler()
if len(usable_numeric) == 0:
    print("Tidak ada kolom numerik untuk distandarisasi.")
else:
    df_scaled = df.copy()
    df_scaled[usable_numeric] = scaler.fit_transform(df[usable_numeric])

    # verifikasi mean/std mendekati 0/1
    verify = pd.DataFrame({'mean': df_scaled[usable_numeric].mean().round(6),
                           'std' : df_scaled[usable_numeric].std().round(6)})
    display(verify.head(12))


Unnamed: 0,mean,std
radius_mean,-0.0,1.00088
texture_mean,0.0,1.00088
perimeter_mean,-0.0,1.00088
area_mean,-0.0,1.00088
smoothness_mean,-0.0,1.00088
compactness_mean,0.0,1.00088
concavity_mean,0.0,1.00088
concave points_mean,-0.0,1.00088
symmetry_mean,0.0,1.00088
fractal_dimension_mean,0.0,1.00088
