<a href="https://colab.research.google.com/github/AlbireoFinoe/data-mining/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [13]:
# Column names, in order, for a CSV file that is read without a header row.
columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]

In [14]:
# Load the header-less data file, naming the fields from `columns`.
# Cells equal to the literal " ?" (note the leading space) are read as NaN.
df = pd.read_csv("adult.data", header=None, names=columns, na_values=" ?")

print("==== Data Awal ====")
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
df.info()

==== Data Awal ====
   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country  income  
0          2174             0              40   

In [15]:
# Group the columns by dtype. "income" is removed from the categorical group
# so that it is not imputed along with the other object-typed columns.
num_cols = df.select_dtypes(include=["int64", "float64"]).columns
cat_cols = df.select_dtypes(include=["object"]).columns.drop("income")

# Numeric columns: replace missing entries with the column mean.
num_imputer = SimpleImputer(strategy="mean")
df[num_cols] = num_imputer.fit_transform(df[num_cols])

# Categorical columns: replace missing entries with the most frequent value.
cat_imputer = SimpleImputer(strategy="most_frequent")
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# Report how many nulls remain in each column.
print("\n==== Setelah Imputasi Missing Value ====")
print(df.isnull().sum())


==== Setelah Imputasi Missing Value ====
age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


In [16]:
# Separate the target column from the feature columns.
y = df["income"]                 # label
X = df.drop(columns="income")    # features

# Label-encode the target. LabelEncoder numbers the sorted class labels from 0,
# which for the two income labels gives <=50K -> 0, >50K -> 1.
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y)

# One-hot encode every categorical feature; the remaining columns are passed
# through unchanged. Categories not seen during fit are ignored at transform time.
ct = ColumnTransformer(
    transformers=[
        ("encoder", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="passthrough",
)
X = ct.fit_transform(X)

# X.shape[1] is the total width of the transformed matrix (encoded + passthrough).
print("\n==== Setelah Encoding ====")
print("Jumlah fitur hasil one-hot encoding:", X.shape[1])



==== Setelah Encoding ====
Jumlah fitur hasil one-hot encoding: 105


In [17]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show the resulting shapes of the feature matrices and label vectors.
print("\n==== Split Data ====")
print("Train set:", X_train.shape, y_train.shape)
print("Test set:", X_test.shape, y_test.shape)


==== Split Data ====
Train set: (26048, 105) (26048,)
Test set: (6513, 105) (6513,)


In [18]:
# with_mean=False skips centering, because the one-hot encoded output can be a
# sparse matrix. The scaler is fit on the training split only and then applied
# to both splits.
scaler = StandardScaler(with_mean=False)
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Preview the first five scaled training rows.
print("\n==== Setelah Scaling ====")
print("X_train (scaled) contoh:\n", X_train[:5])


==== Setelah Scaling ====
X_train (scaled) contoh:
 <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 61 stored elements and shape (5, 105)>
  Coords	Values
  (0, 1)	4.042097088299415
  (0, 17)	2.693081449078022
  (0, 28)	2.1291501886368143
  (0, 40)	2.5863729219381106
  (0, 46)	2.298792443301815
  (0, 55)	2.824641609759898
  (0, 56)	2.123634755013888
  (0, 96)	3.5314389523224907
  (0, 99)	2.418886319853158
  (0, 100)	1.883316250720019
  (0, 101)	5.0482441099275315
  (0, 104)	4.054853273097717
  (1, 3)	2.3128445615460205
  (1, 16)	4.953008719374762
  (1, 26)	2.0058978000714904
  (1, 34)	3.034937837426196
  (1, 45)	2.0354762818860186
  (1, 55)	2.824641609759898
  (1, 57)	2.123634755013937
  (1, 96)	3.5314389523224907
  (1, 99)	2.6387850762034453
  (1, 100)	0.8216125486091245
  (1, 101)	4.271591169938681
  (1, 103)	4.675165915581761
  (1, 104)	4.054853273097717
  :	:
  (2, 104)	3.2438826184781737
  (3, 3)	2.3128445615460205
  (3, 16)	4.953008719374762
  (3, 26)	2.00589780007

In [19]:
# X_train / X_test were already standardized by `scaler` in the previous
# scaling cell. Fitting a second StandardScaler on that already-scaled output
# would learn a per-feature scale of ~1, so the resulting object could not
# standardize raw rows. Reuse the fitted scaler under the name `sc` instead of
# refitting and rescaling. StandardScaler is already imported in the notebook's
# import cell, so the duplicate import is dropped as well.
sc = scaler

print("Training shape:", X_train.shape)
print("Testing shape:", X_test.shape)

Training shape: (26048, 105)
Testing shape: (6513, 105)
